Code Example #1
    def _fetch_products_with_thread_pool_executor(self, links: Iterable[str], *, max_workers: Optional[int] = None
                                                  ) -> Iterator[Product]:
        """
        Fetch an iterator of products.
        The returned iterator yields the results of the ``_get_product`` method.

        Note: consuming the iterator may re-raise an exception that occurred
        during thread pool execution.

        :param links: links that refer to the product web pages
        :type links: Iterable[str]
        :param max_workers: max workers of the pool
        :type max_workers: Optional[int]

        :return: iterator of products
        :rtype: Iterator[Product]
        """

        thread_pool_params = {
            'max_workers': max_workers,
            'thread_name_prefix': f'{self.__class__.__name__}.dump_products'
        }

        with thread.ThreadPoolExecutor(**thread_pool_params) as executor:
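            # NOTE: _max_workers is a private ThreadPoolExecutor attribute; it is read here only for logging.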
            logger.info(f'Quantity of used workers in pool: {executor._max_workers}.')
            products_iterator: Iterator[Product] = executor.map(
                self._get_product,
                links
            )

        return products_iterator
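
For reference, a minimal self-contained sketch of the same executor.map pattern (the fetch_length worker and the URLs below are illustrative assumptions, not part of the original class). Exceptions raised inside a worker only surface when the corresponding item is pulled from the returned iterator:

from concurrent.futures import ThreadPoolExecutor

def fetch_length(url: str) -> int:
    # Stand-in for a real network call.
    if not url.startswith("http"):
        raise ValueError(f"bad url: {url}")
    return len(url)

urls = ["https://example.com", "https://example.org"]
with ThreadPoolExecutor(max_workers=4, thread_name_prefix="fetch") as executor:
    results = executor.map(fetch_length, urls)
# shutdown(wait=True) has already run here, so every task has finished;
# iterating still re-raises any exception a worker raised.
print(list(results))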
Code Example #2
 def run(self, arguments_set):
     total_executors = min(self.max_thread_workers, len(arguments_set))
     executor = thread.ThreadPoolExecutor(max_workers=total_executors)
     future_items = [executor.submit(self.function, argument) for argument in arguments_set]
     wait(future_items)
     for result in future_items:
         result.result()
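
The snippet above appears to assume that thread (concurrent.futures.thread) and wait are imported at module level. A minimal sketch of the same submit-then-wait pattern, with an illustrative square task standing in for self.function:

from concurrent.futures import ThreadPoolExecutor, wait

def square(x: int) -> int:
    return x * x

arguments_set = [1, 2, 3, 4]
executor = ThreadPoolExecutor(max_workers=min(4, len(arguments_set)))
future_items = [executor.submit(square, argument) for argument in arguments_set]
wait(future_items)          # block until every future has finished
for result in future_items:
    result.result()         # re-raises any exception a worker raised
executor.shutdown()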
Code Example #3
File: clients.py Project: Natsurii/b00t
    def __init__(
        self,
        *args,
        max_workers: int = None,
        enable_default_help: bool = True,
        ignore_event_decorator_call: bool = False,
        ignore_overwrite_on_message: bool = False,
        shut_up: bool = False,
        **kwargs,
    ):
        self._thread_pool = thread.ThreadPoolExecutor(
            max_workers=max_workers,
            thread_name_prefix=f"libneko.clients.{type(self).__name__} worker",
        )

        # Update: we do not need this; Python automatically does this anyway.
        # Ensure thread pool shuts down.
        # @atexit.register
        # def kill_pool():
        #    try:
        #        self._thread_pool.shutdown(wait=False)
        #    except Exception:
        #        pass
        self._has_logged_out_triggered = False

        self._ignore_event_decorator_call = ignore_event_decorator_call or shut_up
        self._ignore_overwrite_on_message = ignore_overwrite_on_message or shut_up

        super().__init__(*args, **kwargs)

        if not enable_default_help:
            self.remove_command("help")
Code Example #4
def init():
    global _downloaders_thread_pool
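    # _downloaders_thread_pool is assumed to be initialised (e.g. to None) at module scope elsewhere in this file.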

    if _downloaders_thread_pool:
        _downloaders_thread_pool.shutdown()
    _downloaders_thread_pool = thread.ThreadPoolExecutor(
        int(const.config.max_downloads))
Code Example #5
    def start(self):
        '''Start the service (thread-pool controlled).'''
        try:
            print(
                f"\033[33m{'-'*10} {f'Listening on ({self.ip}:{self.port})':^32} {'-'*10}\033[0m\r\n"
            )
            t = thread.ThreadPoolExecutor(self.thread)
            while True:
                try:
                    # Accept an incoming socket connection
                    conn, addr = self.s.accept()

                    # One session instance per connection
                    s = session()

                    # Pass the server settings to the session
                    s.server = self.server
                    s.dirlist = self.dirlist
                    s.image = self.image
                    s.header = self.header
                    s.footer = self.footer
                    s._index = self._index
                    s.__method__ = self.__method__

                    t.submit(s.http, conn, addr)
                except BlockingIOError as e:  # No pending connection; keep polling
                    pass
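            # NOTE: the call below is unreachable; the `while True` loop above has no break.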
            t.shutdown(wait=True)
        except KeyboardInterrupt as e:
            logging.exception(e)
            print("\033[31mStopped manually\033[0m")
            self.s.shutdown(2)
            self.s.close()
            exit()
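
For context, a minimal self-contained version of the accept-and-dispatch pattern above, using the public concurrent.futures.ThreadPoolExecutor import and a trivial handler (the handler body and port are illustrative assumptions):

import socket
from concurrent.futures import ThreadPoolExecutor

def handle(conn: socket.socket, addr) -> None:
    # Trivial stand-in for the session's http() method.
    with conn:
        conn.recv(1024)
        conn.sendall(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok")

def serve(host: str = "127.0.0.1", port: int = 8080, workers: int = 8) -> None:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((host, port))
        s.listen()
        with ThreadPoolExecutor(max_workers=workers) as pool:
            while True:
                conn, addr = s.accept()      # blocking accept
                pool.submit(handle, conn, addr)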
Code Example #6
File: bot.py Project: Natsurii/nicabot-monkee
 def __enter__(self):
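     # NOTE: the "or 4" fallback below only triggers if the left-hand expression is 0;
     # if os.cpu_count() returned None, the multiplication would raise TypeError first.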
     threads = 4 * os.cpu_count() - 1 or 4
     processes = len(os.sched_getaffinity(0)) or 4
     self.logger.info(
         "Acquiring up to %s thread workers and up to %s process workers for asyncio executors",
         threads, processes)
     self.thread_pool = thread.ThreadPoolExecutor(max_workers=threads)
     self.process_pool = process.ProcessPoolExecutor(max_workers=processes)
     return self
Code Example #7
File: CharacterHandle.py Project: watry/dieloli
def initCharacterPosition():
    characterPositionPool = thread.ThreadPoolExecutor(
        max_workers=GameConfig.threading_pool_max)
    characterList = CacheContorl.npcTemData
    for i in range(0, len(characterList)):
        characterIdS = str(i + 1)
        characterData = characterList[i]
        characterPositionPool.submit(initCharacterPositionNow, characterData,
                                     characterIdS)
    characterPositionPool.shutdown()
Code Example #8
File: CharacterHandle.py Project: watry/dieloli
def initCharacterList():
    initCharacterThreadPool = thread.ThreadPoolExecutor(
        max_workers=GameConfig.threading_pool_max)
    initCharacterTem()
    characterList = CacheContorl.npcTemData
    i = 1
    for character in characterList:
        initCharacterThreadPool.submit(initCharacter, i, character)
        i += 1
    initCharacterThreadPool.shutdown()
    initCharacterPosition()
Code Example #9
    def maybe_start_xprof(seconds):
        if jax.host_id() == 0 and FLAGS.xprof:
            xprof = xprof_session.XprofSession()
            xprof.start_session('REDACTED', True, 2)

            def sleep_and_end_xprof():
                time.sleep(seconds)
                logging.info(
                    'Xprof URL: %s',
                    xprof.end_session_and_get_url(
                        tag='flax resnet, {} devices, batch {} per device'.
                        format(jax.device_count(), device_batch_size)))

            thread.ThreadPoolExecutor(1, 'xprof').submit(sleep_and_end_xprof)
Code Example #10
def main():
    pool = thread.ThreadPoolExecutor(20)
    threading_list = []
    for i in range(1, 267):
        url = 'https://piao.qunar.com/ticket/list.htm?keyword=%E5%8C%97%E4%BA%AC&region=&from=mps_search_suggest&page={}'.format(
            i)
        t = pool.submit(task, url)
        threading_list.append(t)

    for future in as_completed(threading_list):
        ret = future.result()
        print(ret)

    with open('qunaer.json', mode='w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False)
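
For reference, a minimal sketch of the submit + as_completed pattern used above, with a trivial stand-in task (the real script's task(), data and imports are not shown in the snippet). as_completed yields futures in completion order rather than submission order:

from concurrent.futures import ThreadPoolExecutor, as_completed

def task(url: str) -> str:
    return f"fetched {url}"

pool = ThreadPoolExecutor(max_workers=20)
futures = [pool.submit(task, f"https://example.com/page/{i}") for i in range(5)]
for future in as_completed(futures):
    print(future.result())
pool.shutdown()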
Code Example #11
    def maybe_start_xprof(seconds):
        if jax.host_id() == 0 and FLAGS.xprof:
            xprof = xprof_session.XprofSession()
            xprof.start_session('REDACTED', True, 2)

            def sleep_and_end_xprof():
                time.sleep(seconds)
                logging.info(
                    'Xprof URL: %s',
                    xprof.end_session_and_get_url(
                        tag=
                        'flax transformer, {} devices, {}-way, batch {} per replica'
                        .format(jax.device_count(), num_partitions,
                                device_train_input_shape[0])))

            thread.ThreadPoolExecutor(1, 'xprof').submit(sleep_and_end_xprof)
Code Example #12
File: download.py Project: Senyeah/data301-project
def all_dates():
  # Stop polluting the working directory by creating a download folder
  if not os.path.exists(GDELT_OUTPUT_DIRECTORY):
    # I have already downloaded a majority of this data and compressed it, so attempt to
    # restore most of what is needed from the bucket (this will save about an hour)
    CACHED_ARCHIVE = 'gdelt.tar.gz'
    BUCKET_URL = f'https://storage.googleapis.com/data301-bucket-9n5z0ph0/{CACHED_ARCHIVE}'

    # Place to store all the data, needs about 30 GB disk space
    os.mkdir(GDELT_OUTPUT_DIRECTORY)

    try:
      # Download the file, then just call tar to do the extraction (sorry Windows)
      print('Downloading cached archive from', BUCKET_URL)
      request.urlretrieve(BUCKET_URL, CACHED_ARCHIVE)
      print('Extracting compressed archive...')
      subprocess.run(['tar', 'zxf', CACHED_ARCHIVE, '-C', GDELT_OUTPUT_DIRECTORY])
      print('Extraction complete!')
      os.remove(CACHED_ARCHIVE)
    except:
      # Bucket won't exist forever :(
      print('Failed downloading URL', BUCKET_URL)

  # Initialise gdelt so its API can be queried
  gd = gdelt.gdelt(version=2)
  downloaded_dates = []

  # Parallelize the download, as lots of event data is needed
  with thread.ThreadPoolExecutor(max_workers=GDELT_DOWNLOAD_WORKERS) as executor:
    # Pull every date out of the slice...
    slices = [slice for slice in analysis_dates()]
    dates = (date for dates in slices for date in dates)

    # ...log how many are to be downloaded
    download_cnt = ANALYSIS_SLICE_PERIOD_DAYS * ANALYSIS_SLICE_MULTIPLIER * ANALYSIS_BLOCK_COUNT
    print('Downloading', download_cnt, 'event files...')

    # ...and then download the corresponding GDELT event data for that day
    for date in dates:
      executor.submit(download_date, date, gd)
      downloaded_dates.append(date_formatted(date))

  print('Download complete')
  # All dates flattened, along with every (formatted) date in each slice of dates
  return downloaded_dates, [[date_formatted(date) for date in dates] for dates in slices]
Code Example #13
 def start(self):
     '''Start the service (thread-pool controlled).'''
     try:
         print(
             f"\033[33m{'-'*10} {f'Listening on ({self.ip}:{self.port})':^32} {'-'*10}\033[0m\r\n"
         )
         t = thread.ThreadPoolExecutor(self.thread)
         while True:
             try:
                 # Accept an incoming socket connection
                 conn, addr = self.s.accept()
                 t.submit(self.http, conn, addr)
             except BlockingIOError as e:  # No pending connection; keep polling
                 pass
         t.shutdown(wait=True)
     except KeyboardInterrupt as e:
         logging.exception(e)
         print("\033[31mStopped manually\033[0m")
         self.s.shutdown(2)
         self.s.close()
Code Example #14
    def __init__(self,
                 runner: 'mtap.processing.ProcessingComponent',
                 host: str,
                 port: int = 0,
                 *,
                 register: bool = False,
                 workers: Optional[int] = None,
                 write_address: bool = False,
                 config: 'Optional[mtap.Config]' = None):
        self.host = host
        self._port = port
        self.processor_id = runner.processor_id
        self.write_address = write_address

        if config is None:
            config = _config.Config()

        self._health_servicer = health.HealthServicer()
        self._health_servicer.set('', 'SERVING')
        self._servicer = _ProcessorServicer(
            config=config,
            address=host,
            runner=runner,
            health_servicer=self._health_servicer,
            register=register)
        workers = workers or 10
        thread_pool = thread.ThreadPoolExecutor(max_workers=workers)
        self._server = grpc.server(
            thread_pool,
            options=[('grpc.max_send_message_length',
                      config.get('grpc.max_send_message_length')),
                     ('grpc.max_receive_message_length',
                      config.get('grpc.max_receive_message_length'))])
        health_pb2_grpc.add_HealthServicer_to_server(self._health_servicer,
                                                     self._server)
        processing_pb2_grpc.add_ProcessorServicer_to_server(
            self._servicer, self._server)
        self._port = self._server.add_insecure_port("{}:{}".format(
            self.host, self.port))
        self._stopped_event = threading.Event()
        self._address_file = None
Code Example #15
    def process(self):
        # t1 = Thread(target=self.consistency)
        # t2 = Thread(target=self.form)
        # t3 = Thread(target=self.recent_form)
        # t4 = Thread(target=self.total_consistency)
        # t5 = Thread(target=self.opposition)
        # t6 = Thread(target=self.venue)
        #
        # t1.start()
        # t2.start()
        # t3.start()
        # t4.start()
        # t5.start()
        # t6.start()
        #
        # t1.join()
        # t2.join()
        # t3.join()
        # t4.join()
        # t5.join()
        # t6.join()

        with thread.ThreadPoolExecutor() as executor:
            f1 = executor.submit(self.consistency)
            f2 = executor.submit(self.form)
            f3 = executor.submit(self.recent_form)
            f4 = executor.submit(self.total_consistency)
            f5 = executor.submit(self.opposition)
            f6 = executor.submit(self.venue)
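            # Leaving the with-block calls shutdown(wait=True), so all six futures
            # are complete before their results are read below.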

        con = f1.result()
        form = f2.result()
        rf = f3.result()
        tc = f4.result()
        opp = f5.result()
        ven = f6.result()

        return con, form, rf, tc, opp, ven
Code Example #16
def run_trigger(input_file, output_file, plugin, expect_timeout=False):

    input_message = json.load(open(input_file))
    expected_output = json.load(open(output_file))

    trigger_name = input_message["body"]["trigger"]
    capture = CaptureDispatcher()
    plugin.triggers[trigger_name].dispatcher = capture

    executor = thread.ThreadPoolExecutor()
    executor.submit(plugin.handle_step, input_message)
    future = executor.submit(capture.wait_for_caught_message)
    out = futures.wait([future], timeout=10)
    done = out.done

    # Non-graceful shutdown
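    # Clearing these private concurrent.futures internals stops the worker threads
    # from being joined at interpreter exit; it is a hack, not public API.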
    executor._threads.clear()
    futures.thread._threads_queues.clear()

    if len(done) <= 0:
        if expect_timeout:
            return
        raise Exception("Timeout")

    output = capture.caught_message

    if "body" in output and "log" in output["body"]:
        output["body"]["log"] = ""

    if "body" in expected_output and "log" in expected_output["body"]:
        expected_output["body"]["log"] = ""

    if output != expected_output:
        raise Exception(
            "Actual output differs from expected output.{} != {}".format(
                output, expected_output))
Code Example #17
File: checkpoints.py Project: joaogui1/flax
def restore_checkpoint(ckpt_dir,
                       target,
                       step=None,
                       prefix='checkpoint_',
                       parallel=True):
    """Restore last/best checkpoint from checkpoints in path.

  Sorts the checkpoint files naturally, returning the highest-valued
  file, e.g.:
    ckpt_1, ckpt_2, ckpt_3 --> ckpt_3
    ckpt_0.01, ckpt_0.1, ckpt_0.001 --> ckpt_0.1
    ckpt_-1.0, ckpt_1.0, ckpt_1e5 --> ckpt_1e5

  Args:
    ckpt_dir: str: checkpoint file or directory of checkpoints to restore from.
    target: matching object to rebuild via deserialized state-dict. If None,
      the deserialized state-dict is returned as-is.
    step: int: step number to load or None to load latest. If specified,
      ckpt_dir must be a directory.
    prefix: str: name prefix of checkpoint files.
    parallel: bool: whether to load seekable checkpoints in parallel, for speed.

  Returns:
    Restored `target` updated from checkpoint file, or if no step specified and
    no checkpoint files present, returns the passed-in `target` unchanged.
    If a file path is specified and is not found, the passed-in `target` will be
    returned. This is to match the behavior of the case where a directory path
    is specified but the directory has not yet been created.
  """
    if step:
        ckpt_path = _checkpoint_path(ckpt_dir, step, prefix)
        if not gfile.exists(ckpt_path):
            raise ValueError(f'Matching checkpoint not found: {ckpt_path}')
    else:
        if gfile.isdir(ckpt_dir):
            ckpt_path = latest_checkpoint(ckpt_dir, prefix)
            if not ckpt_path:
                logging.info(f'Found no checkpoint files in {ckpt_dir}')
                return target
        else:
            ckpt_path = ckpt_dir
            if not gfile.exists(ckpt_path):
                logging.info(f'Found no checkpoint file at {ckpt_path}')
                return target

    logging.info('Restoring checkpoint from %s', ckpt_path)
    with gfile.GFile(ckpt_path, 'rb') as fp:
        if parallel and fp.seekable():
            buf_size = 128 << 20  # 128M buffer.
            num_bufs = fp.size() / buf_size
            logging.debug('num_bufs: %d', num_bufs)
            checkpoint_contents = bytearray(fp.size())

            def read_chunk(i):
                # NOTE: We have to re-open the file to read each chunk, otherwise the
                # parallelism has no effect. But we could reuse the file pointers
                # within each thread.
                with gfile.GFile(ckpt_path, 'rb') as f:
                    f.seek(i * buf_size)
                    buf = f.read(buf_size)
                    if buf:
                        checkpoint_contents[i * buf_size:i * buf_size +
                                            len(buf)] = buf
                    return len(buf) / buf_size

            pool_size = 32
            pool = thread.ThreadPoolExecutor(pool_size)
            results = pool.map(read_chunk, range(int(num_bufs) + 1))
            results = list(results)
            pool.shutdown(wait=False)
            logging.debug('results: %s', results)
        else:
            checkpoint_contents = fp.read()

        if target is None:
            return serialization.msgpack_restore(checkpoint_contents)
        else:
            return serialization.from_bytes(target, checkpoint_contents)
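
A minimal local-filesystem variant of the chunked parallel read above, assuming an ordinary seekable file instead of gfile (the path, buffer size and pool size below are illustrative):

import os
from concurrent.futures import ThreadPoolExecutor

def parallel_read(path: str, buf_size: int = 1 << 20, pool_size: int = 32) -> bytes:
    size = os.path.getsize(path)
    contents = bytearray(size)

    def read_chunk(i: int) -> int:
        # Each worker opens its own handle so seeks do not interfere.
        with open(path, 'rb') as f:
            f.seek(i * buf_size)
            buf = f.read(buf_size)
            contents[i * buf_size:i * buf_size + len(buf)] = buf
            return len(buf)

    num_bufs = size // buf_size + 1
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        list(pool.map(read_chunk, range(num_bufs)))
    return bytes(contents)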
Code Example #18
def run_pretrain(optimizer):
  """Run bert pretraining.

  Args:
    optimizer: BERT model with pretraining layer

  Returns:
    optimizer: trained model
  """
  result_stats = {}
  def get_input_context():

    class InputContext():

      def __init__(self):
        self.input_pipeline_id = jax.host_id()
        self.num_input_pipelines = jax.host_count()
    return InputContext()

  summary_thread = thread.ThreadPoolExecutor(1, 'summary')
  host_id = jax.host_id()
  # Get input dataset
  input_files = []
  for input_pattern in FLAGS.input_files.split(','):
    input_files.extend(tf.io.gfile.glob(input_pattern))
  logging.info('*** Input Files ***')
  for input_file in input_files:
    logging.info('  %s', input_file)

  eval_input_files = []
  for input_pattern in FLAGS.eval_input_files.split(','):
    eval_input_files.extend(tf.io.gfile.glob(input_pattern))
  logging.info('*** Eval Input Files ***')
  for input_file in eval_input_files:
    logging.info('  %s', input_file)

  train_input_fn = input_pipeline.input_fn_builder(
      input_files=input_files,
      max_seq_length=FLAGS.max_seq_length,
      max_predictions_per_seq=FLAGS.max_predictions_per_seq,
      is_training=True,
      num_cpu_threads=8)

  host_train_batch_size = FLAGS.train_batch_size // jax.host_count()
  host_eval_batch_size = FLAGS.eval_batch_size // jax.host_count()

  params = {'batch_size': host_train_batch_size}
  input_context = get_input_context()
  train_dataset = train_input_fn(params, input_context)
  train_iterator = iter(train_dataset)

  eval_input_fn = input_pipeline.input_fn_builder(
      input_files=eval_input_files,
      max_seq_length=FLAGS.max_seq_length,
      max_predictions_per_seq=FLAGS.max_predictions_per_seq,
      is_training=False,
      num_cpu_threads=8,
      global_input_size=FLAGS.eval_sample_size)
  eval_params = {'batch_size': host_eval_batch_size}
  eval_dataset = eval_input_fn(eval_params, input_context)
  eval_iterator = iter(eval_dataset)

  # train step
  total_training_steps = FLAGS.total_training_steps
  learning_rate_fn = create_learning_rate_scheduler(
      base_learning_rate=FLAGS.learning_rate,
      warmup_steps=FLAGS.warmup_steps,
      total_training_steps=FLAGS.total_training_steps,
      poly_power=FLAGS.poly_power,
      start_warmup_step=FLAGS.start_warmup_step)

  # Device training loop cond.
  def device_train_loop_cond(args):
    _, _, _, _, _, _, step, epoch, num_steps_per_epoch = args
    return step // num_steps_per_epoch == epoch

  # Device training loop body.
  def device_train_loop_body(args):
    """Device training loop body."""
    (optimizer, total_loss, lm_loss, sentence_loss, new_dropout_rng, token,
     step, epoch, num_steps_per_epoch) = args
    device_batch_size = FLAGS.train_batch_size // jax.device_count()
    input_shape = [device_batch_size, FLAGS.max_seq_length]
    input_shape_pred = [device_batch_size, FLAGS.max_predictions_per_seq]
    (input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids,
     masked_lm_weights, next_sentence_labels), token = lax.infeed(
         token,
         shape=(jax.ShapedArray(input_shape, jnp.int32),
                jax.ShapedArray(input_shape, jnp.int32),
                jax.ShapedArray(input_shape, jnp.int32),
                jax.ShapedArray(input_shape_pred, jnp.int32),
                jax.ShapedArray(input_shape_pred, jnp.int32),
                jax.ShapedArray(input_shape_pred, jnp.float32),
                jax.ShapedArray([device_batch_size, 1], jnp.int32)))
    inputs = [input_ids, input_mask, segment_ids, masked_lm_positions]
    labels = [masked_lm_ids, masked_lm_weights, next_sentence_labels]
    optimizer, total_loss, lm_loss, sentence_loss, new_dropout_rng = train_step(
        optimizer,
        inputs,
        labels,
        learning_rate_fn,
        dropout_rng=new_dropout_rng)
    step += 1
    return (optimizer, total_loss, lm_loss, sentence_loss,
            new_dropout_rng, token, step, epoch, num_steps_per_epoch)

  # Device training loop.
  def device_train_loop(optimizer, dropout_rng, total_loss, lm_loss,
                        sentence_loss, step, epoch, num_steps_per_epoch):
    """Device training loop."""
    token = lax.create_token(step)
    (optimizer, total_loss, lm_loss, sentence_loss, dropout_rng,
     _, step, epoch, num_steps_per_epoch) = lax.while_loop(
         device_train_loop_cond, device_train_loop_body,
         (optimizer, total_loss, lm_loss, sentence_loss, dropout_rng, token,
          step, epoch, num_steps_per_epoch))
    return optimizer, total_loss, lm_loss, sentence_loss, dropout_rng, step

  if FLAGS.infeed:
    pmap_fn = jax.pmap
    if FLAGS.enable_buffer_donation:
      pmap_fn = functools.partial(pmap_fn, donate_argnums=(0, 1))
    if FLAGS.enable_wus:
      pmap_fn = functools.partial(
          pmap_fn, in_axes=(None, 0, None, None, None, None, None, None))

    p_train_epoch = pmap_fn(device_train_loop, axis_name='batch')
  else:
    # without infeed.
    p_train_step = jax.pmap(
        functools.partial(train_step, learning_rate_fn=learning_rate_fn),
        axis_name='batch')

  if FLAGS.infeed:
    # Infeed is currently synchronous, so do it in a background thread too
    infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(), 'infeed')

  pmap_fn = jax.pmap
  # Weight update sharding is not implemented yet for host train loop.
  # Enable wus on eval only if device loop is used.
  if FLAGS.enable_wus and FLAGS.infeed:
    pmap_fn = functools.partial(pmap_fn, in_axes=(None, 0, 0))
  p_eval_step = pmap_fn(eval_step, axis_name='batch')

  rng = random.PRNGKey(0)
  device_count = jax.local_device_count()
  dropout_rngs = random.split(rng, device_count)
  num_steps_per_epoch = np.int32(FLAGS.num_steps_per_epoch)
  if FLAGS.precompile:
    if FLAGS.infeed:
      if FLAGS.enable_wus:
        total_loss = np.float32(0.0)
        lm_loss = np.float32(0.0)
        sentence_loss = np.float32(0.0)
        host_step = 0
        host_epoch = 1
        optimizer = unbroadcast(optimizer)
        # the device training loop condition will immediately be false
        optimizer, total_loss, lm_loss, sentence_loss, _, _ = p_train_epoch(
            optimizer, dropout_rngs, total_loss, lm_loss, sentence_loss,
            host_step, host_epoch, num_steps_per_epoch)
      else:
        total_loss = jax_utils.replicate(np.float32(0.0))
        lm_loss = jax_utils.replicate(np.float32(0.0))
        sentence_loss = jax_utils.replicate(np.float32(0.0))
        device_step = jax_utils.replicate(0)
        device_epoch = jax_utils.replicate(1)
        # the device training loop condition will immediately be false
        optimizer, total_loss, lm_loss, sentence_loss, _, _ = p_train_epoch(
            optimizer, dropout_rngs, total_loss, lm_loss, sentence_loss,
            device_step, device_epoch, jax_utils.replicate(num_steps_per_epoch))

    else:
      train_input_shape = (host_train_batch_size, FLAGS.max_seq_length)
      train_input_shape_pred = (host_train_batch_size,
                                FLAGS.max_predictions_per_seq)
      word_id_data = jax.random.randint(rng, train_input_shape, 0, 10)
      mask_data = jax.random.randint(rng, train_input_shape, 0, 1)
      type_id_data = jax.random.randint(rng, train_input_shape, 0, 3)
      lm_mask = jax.random.randint(rng, train_input_shape_pred, 0, 5)
      masked_lm_ids = jax.random.randint(rng, train_input_shape_pred, 0, 2)
      masked_lm_weights = jax.random.randint(rng, train_input_shape_pred, 1,
                                             1).astype(np.float32)
      next_sentence_labels = jax.random.randint(rng, (host_train_batch_size, 1),
                                                0, 1)

      labels = [masked_lm_ids, masked_lm_weights, next_sentence_labels]
      train_inputs = [word_id_data, mask_data, type_id_data, lm_mask]
      train_inputs = common_utils.shard(train_inputs)
      labels = common_utils.shard(labels)
      p_train_step(optimizer, train_inputs, labels, dropout_rng=dropout_rngs)

    eval_input_shape = (host_eval_batch_size, FLAGS.max_seq_length)
    eval_input_shape_pred = (host_eval_batch_size,
                             FLAGS.max_predictions_per_seq)
    word_id_data = jax.random.randint(rng, eval_input_shape, 0, 10)
    mask_data = jax.random.randint(rng, eval_input_shape, 0, 1)
    type_id_data = jax.random.randint(rng, eval_input_shape, 0, 3)
    lm_mask = jax.random.randint(rng, eval_input_shape_pred, 0, 5)
    masked_lm_ids = jax.random.randint(rng, eval_input_shape_pred, 0, 2)
    masked_lm_weights = jax.random.randint(
        rng, eval_input_shape_pred, 1, 1).astype(np.float32)
    next_sentence_labels = jax.random.randint(rng, (host_eval_batch_size, 1), 0,
                                              1)

    eval_inputs = {
        'input_ids': word_id_data,
        'input_mask': mask_data,
        'segment_ids': type_id_data,
        'masked_lm_positions': lm_mask,
        'masked_lm_ids': masked_lm_ids,
        'masked_lm_weights': masked_lm_weights,
        'next_sentence_labels': next_sentence_labels
    }

    eval_inputs = common_utils.shard(eval_inputs)
    metrics = empty_metrics()
    optimizer_target = optimizer.target
    # Weight update sharding is not implemented yet for host train loop.
    # Enable wus on eval only if device loop is used.
    if FLAGS.enable_wus and FLAGS.infeed:
      optimizer_target = unbroadcast(optimizer_target)
    metrics = p_eval_step(optimizer_target, eval_inputs, metrics)
    metrics = allreduce_metrics(metrics)
  metrics = empty_metrics()
  time.sleep(FLAGS.init_sleep)
  allreduce_metrics(metrics)['masked_lm_weighted_correct'].block_until_ready()
  mlp_log.mlperf_print('init_stop', None)
  mlp_log.mlperf_print('run_start', None)
  # To make the logging consistent with other mlperf models,
  # in all the mlp_log, epochs are steps, and examples are sequences.
  mlp_log.mlperf_print('train_samples',
                       FLAGS.total_training_steps * FLAGS.train_batch_size)
  mlp_log.mlperf_print('eval_samples', FLAGS.eval_sample_size)
  xprof = None
  run_start = time.time()
  global RUN_STOP
  global TOTAL_STEPS
  RUN_STOP = False
  TOTAL_STEPS = False

  if host_id == 0:
    if FLAGS.end_to_end_profile:
      xprof = xprof_session.XprofSession()
      xprof.start_session(device_name='REDACTED',
                          enable_python_tracer=True,
                          host_trace_level=2)
    elif FLAGS.profile:
      profile_with_xprof_on_background(start_after_sec=FLAGS.profile_latency,
                                       profile_time_sec=FLAGS.profile_duration)

  if FLAGS.infeed:
    h_total_loss = np.float32(0.0)
    h_lm_loss = np.float32(0.0)
    h_sentence_loss = np.float32(0.0)

    d_total_loss = jax_utils.replicate(np.float32(0.0))
    d_lm_loss = jax_utils.replicate(np.float32(0.0))
    d_sentence_loss = jax_utils.replicate(np.float32(0.0))

  host_step, device_step = 0, jax_utils.replicate(0)
  device_epoch = jax_utils.replicate(0)
  num_train_epochs = FLAGS.total_training_steps // FLAGS.num_steps_per_epoch
  steps_per_epoch = num_steps_per_epoch
  if num_train_epochs >= 6:
    # Merge the first 6 epochs, as we do not have to do eval.
    steps_per_epoch = np.int32(num_steps_per_epoch * 6)
  for host_epoch in range(num_train_epochs):
    block_step = host_step
    # While BERT pretraining does not have epochs,
    # to make the logging consistent with other mlperf models,
    # in all the mlp_log, epochs are steps, and examples are sequences.
    mlp_log.mlperf_print(
        'block_start',
        None,
        metadata={
            'first_epoch_num': block_step,
            'epoch_count': FLAGS.num_steps_per_epoch
        })

    if not (num_train_epochs >= 6 and
            host_epoch in (1, 2, 3, 4, 5)) and FLAGS.infeed:
      if FLAGS.enable_wus:
        optimizer = unbroadcast(optimizer)
        (optimizer, total_loss, lm_loss, sentence_loss, dropout_rngs,
         device_step) = p_train_epoch(optimizer, dropout_rngs,
                                      h_total_loss, h_lm_loss, h_sentence_loss,
                                      host_step, host_epoch, steps_per_epoch)
      else:
        device_epoch = jax_utils.replicate(host_epoch)
        device_steps_per_epoch = jax_utils.replicate(steps_per_epoch)

        (optimizer, total_loss, lm_loss, sentence_loss, dropout_rngs,
         device_step) = p_train_epoch(optimizer, dropout_rngs,
                                      d_total_loss, d_lm_loss, d_sentence_loss,
                                      device_step, device_epoch,
                                      device_steps_per_epoch)
    # After first epoch, reduce the steps per epoch back to normal number.
    steps_per_epoch = num_steps_per_epoch

    # Training for one epoch.
    while int(host_step // FLAGS.num_steps_per_epoch) == host_epoch:
      input_data = next(train_iterator)
      input_data = jax.tree_map(lambda x: x.numpy(), input_data)
      input_data = jax.tree_map(common_utils.shard, input_data)
      input_ids = input_data['input_ids']
      input_mask = input_data['input_mask']
      segment_ids = input_data['segment_ids']
      masked_lm_positions = input_data['masked_lm_positions']
      masked_lm_ids = input_data['masked_lm_ids']
      masked_lm_weights = input_data['masked_lm_weights']
      next_sentence_labels = input_data['next_sentence_labels']

      # Infeed data to infeed queue.
      if FLAGS.infeed:
        for i, device in enumerate(jax.local_devices()):
          infeed_pool.submit(
              partial(device.transfer_to_infeed,
                      (input_ids[i], input_mask[i], segment_ids[i],
                       masked_lm_positions[i], masked_lm_ids[i],
                       masked_lm_weights[i], next_sentence_labels[i])))
      else:
        inputs = [input_ids, input_mask, segment_ids, masked_lm_positions]
        labels = [masked_lm_ids, masked_lm_weights, next_sentence_labels]
        (optimizer, total_loss, lm_loss, sentence_loss, dropout_rngs
         ) = p_train_step(optimizer, inputs, labels, dropout_rng=dropout_rngs)
      host_step += 1

    mlp_log.mlperf_print('block_stop', None, metadata={
        'first_epoch_num': block_step,
        'epoch_count': FLAGS.num_steps_per_epoch
    })
    # No need to do eval in the first 5 epochs as it has to traverse min 3M
    # samples.
    if host_epoch < 5:
      continue
    if host_step % FLAGS.num_steps_per_epoch == 0:
      mlp_log.mlperf_print(
          'eval_start', None, metadata={'epoch_num': host_step})
      optimizer_target = optimizer.target
      if FLAGS.enable_wus and FLAGS.infeed:
        optimizer_target = unbroadcast(optimizer_target)
      metrics = empty_metrics()
      for _ in range(FLAGS.max_eval_steps):
        inputs = jax.tree_map(lambda x: x.numpy(), next(eval_iterator))
        inputs = jax.tree_map(common_utils.shard, inputs)
        # Weight update sharding is not implemented yet for host train loop.
        # Enable wus on eval only if device loop is used.
        metrics = p_eval_step(optimizer_target, inputs, metrics)
      metrics = allreduce_metrics(metrics)
      train_metrics = {'total_loss': total_loss, 'lm_loss': lm_loss,
                       'sentence_loss': sentence_loss}
      # masked_lm_accuracy = get_masked_lm_accuracy(metrics)
      summary_thread.submit(partial(
          _write_metrics, metrics, train_metrics,
          host_step, total_training_steps, host_id))
    if host_step % FLAGS.num_steps_per_epoch == 0 and FLAGS.save_checkpoint:
      if host_id == 0:
        checkpoints.save_checkpoint(
            FLAGS.model_dir, optimizer, host_step, prefix='checkpoint', keep=1)
  allreduce_metrics(metrics)['masked_lm_weighted_correct'].block_until_ready()
  summary_thread.shutdown()
  if not RUN_STOP:
    mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})
  mlp_log.mlperf_print('run_final', None)

  if host_id == 0:
    if FLAGS.end_to_end_profile:
      xprof_url = xprof.end_session_and_get_url(tag='')
      logging.info('Xprof profile is at %s', xprof_url)


  if RUN_STOP:
    result_stats['total_time'] = RUN_STOP - run_start
    result_stats['total_steps'] = TOTAL_STEPS
  return optimizer, result_stats
Code Example #19
File: mysql.py Project: Zhouhao12345/tenantpy
 def test_thread_local(self):
     with thread.ThreadPoolExecutor(max_workers=2) as t:
         for _ in range(100):
             t.submit(increase)
Code Example #20
File: mysql.py Project: Zhouhao12345/tenantpy
 def test_mutil_thread_connect(self):
     with thread.ThreadPoolExecutor(max_workers=100) as t:
         for _ in range(1000):
             t.submit(request)
Code Example #21
def main(argv):
    global BLEU_THRESHOLD_REACHED
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    init_mllogger()
    mllogger.event('cache_clear')
    mllogger.start('init_start')
    mllogger.event('submission_org', 'Google')
    mllogger.event('submission_platform',
                   'TPUv3-{}'.format(jax.device_count()))
    mllogger.event('submission_division', 'closed')
    mllogger.event('submission_status', 'research')
    mllogger.event('submission_benchmark', 'transformer')
    mllogger.event('train_samples', input_pipeline.N_TRAIN)
    mllogger.event('eval_samples', input_pipeline.N_EVAL)

    tf.enable_v2_behavior()

    # Use hardware RNG for bernoulli randoms in dropout mask creation.
    if FLAGS.hardware_rng:
        models.set_hardware_bernoulli()

    num_partitions = FLAGS.num_partitions
    batch_size = FLAGS.batch_size
    if batch_size is None:
        batch_size = min(16 * jax.device_count() // num_partitions, 2048)
    mllogger.event('global_batch_size', batch_size)

    num_eval_steps = FLAGS.num_eval_steps
    max_target_length = FLAGS.max_target_length
    max_eval_target_length = FLAGS.max_eval_target_length
    max_length = max(max_target_length, max_eval_target_length)
    mllogger.event('max_sequence_length',
                   max_length,
                   metadata={'method': 'discard'})
    if FLAGS.random_seed is not None:
        seed = FLAGS.random_seed
    else:
        seed = np.uint32(time.time() if jax.host_id() == 0 else 0)
        seed = per_host_sum_pmap(seed)
    mllogger.event('seed', int(seed))
    steps_per_epoch = int(math.ceil(input_pipeline.N_TRAIN / batch_size))
    logging.info('steps per epoch: %d', steps_per_epoch)
    num_replicas = jax.local_device_count() // num_partitions
    device_train_input_shape = (batch_size //
                                (num_replicas * jax.host_count()),
                                max_target_length)
    # This is per-host; in principle 64/replica or more should fit
    eval_batch_size = min(
        32 * num_replicas,
        int(
            math.ceil(input_pipeline.N_EVAL /
                      (num_replicas * jax.host_count()))) * num_replicas)
    logging.info('eval batch size: %d', eval_batch_size)
    pred_batches = int(
        math.ceil(input_pipeline.N_EVAL /
                  (jax.host_count() * eval_batch_size)))
    logging.info('pred batches: %d', pred_batches)
    broadcast = functools.partial(_broadcast,
                                  num_replicas=num_replicas,
                                  num_partitions=num_partitions)

    if jax.host_id() == 0:
        train_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'train'))
        eval_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'eval'))
    else:
        train_summary_writer = None
        eval_summary_writer = None
    # Write summaries in background thread to avoid blocking on device sync
    summary_thread = thread.ThreadPoolExecutor(1, 'summary')
    if FLAGS.infeed:
        # Infeed is currently synchronous, so do it in a background thread too
        infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(),
                                                'infeed')

    # MLPerf 2020 WMT en-de dataset uses a custom T2T dataset:
    #   Shared 32K subword tokenization
    #   256-length packed training examples from WMT17
    #   97-length unpacked evaluation examples from WMT14
    train_keys = [
        'inputs', 'targets', 'inputs_position', 'targets_position',
        'inputs_segmentation', 'targets_segmentation'
    ]
    encoder = mlperf_encoder.SubwordTextEncoder(filename=FLAGS.vocab_path)
    input_encoder = encoder
    target_encoder = encoder
    vocab_size = input_encoder.vocab_size
    output_vocab_size = target_encoder.vocab_size

    input_shape = (batch_size, max_target_length)
    target_shape = (batch_size, max_target_length)

    transformer_kwargs = flax.core.FrozenDict({
        'vocab_size': vocab_size,
        'output_vocab_size': output_vocab_size,
        'emb_dim': 1024,
        'num_heads': 16,
        'num_layers': 6,
        'qkv_dim': 1024,
        'mlp_dim': 4096,
        'max_len': max_length,
        'share_embeddings': FLAGS.share_embeddings,
        'logits_via_embedding': FLAGS.logits_via_embedding,
        'num_partitions': num_partitions,
    })

    rng = random.PRNGKey(seed)
    rng, init_rng = random.split(rng)
    model, cache_def = create_model(init_rng, tuple(input_shape),
                                    tuple(target_shape), transformer_kwargs)
    mllogger.event('opt_name', 'adam')
    if batch_size < 1024:
        learning_rate = 4.0  # 0.0625
        warmup_steps = 1000
        beta1 = 0.9
        beta2 = 0.98
    if batch_size < 2048:
        learning_rate = 2.0
        warmup_steps = 500  # ??
        beta1 = 0.9  # ??
        beta2 = 0.98  # ??
    else:
        learning_rate = 3.3092157691415953
        warmup_steps = 664
        beta1 = 0.9086575725261137
        beta2 = 0.9198719118104947
    epsilon = 1e-9
    if FLAGS.learning_rate is not None:
        learning_rate = FLAGS.learning_rate
    mllogger.event('opt_adam_beta_1', beta1)
    mllogger.event('opt_adam_beta_2', beta2)
    mllogger.event('opt_adam_epsilon', epsilon)
    optimizer_def = optim.Adam(learning_rate,
                               beta1=beta1,
                               beta2=beta2,
                               eps=epsilon,
                               weight_decay=FLAGS.weight_decay)
    optimizer = optimizer_def.create(model)
    del model  # don't keep a copy of the initial model

    # Build parameter partition annotations for preserving partitions from train
    # to eval.
    partition_rules = [
        (('encoder', 'posembed_input'), partitions.empty_dict),
        (('decoder', 'posembed_targets'), partitions.empty_dict),
        (('embedding', ), partitions.spec(num_partitions, 1)),
        ((r'LayerNorm_\d+', '(bias|scale)'), None),
        ((r'encoder(decoder)?_norm', '(bias|scale)'), None),
        ((r'MultiHeadDotProductAttention_\d+', '(query|key|value)', 'kernel'),
         partitions.spec(1, num_partitions, 1)),
        ((r'MultiHeadDotProductAttention_\d+', 'out', 'kernel'),
         partitions.spec(num_partitions, 1, 1)),
        ((r'MlpBlock_\d+', r'Dense_\d+', 'bias'), None),
        ((r'MlpBlock_\d+', 'Dense_0', 'kernel'),
         partitions.spec(1, num_partitions)),
        ((r'MlpBlock_\d+', 'Dense_1', 'kernel'),
         partitions.spec(num_partitions, 1)),
        (('state', 'step'), None),
    ]
    optimizer_partitions = optimizer.restore_state(
        partitions.set_partitions(partition_rules, optimizer.state_dict()))

    optimizer = broadcast(optimizer)
    empty_metrics = broadcast({'loss': 0.0, 'accuracy': 0, 'denominator': 0})

    learning_rate_fn = create_learning_rate_scheduler(
        base_learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        hidden_size=transformer_kwargs['qkv_dim'])

    p_train_step = jax.pmap(functools.partial(
        train_step, learning_rate_fn=learning_rate_fn),
                            axis_name='batch',
                            in_axes=(None, 0, 0, 0))
    if num_partitions > 1:
        sharded_predict_step = sharded_jit(
            predict_step,
            in_parts=(None, optimizer_partitions.target, None),
            out_parts=None)
    else:
        sharded_predict_step = predict_step
    if FLAGS.extra_eval_metrics:
        p_eval_step = jax.pmap(eval_step, axis_name='batch', in_axes=(None, 0))
    p_pred_step = jax.pmap(sharded_predict_step,
                           axis_name='batch',
                           in_axes=(0, None, None))
    p_allreduce_metrics = jax.pmap(functools.partial(lax.psum,
                                                     axis_name='batch'),
                                   axis_name='batch')

    def device_train_loop_cond(args):
        _, _, _, _, step, epoch = args
        return step // steps_per_epoch == epoch

    def device_train_loop_body(args):
        optimizer, dropout_rngs, metrics, token, step, epoch = args
        input_data, token = lax.infeed(token,
                                       shape=tuple([
                                           jax.ShapedArray(
                                               device_train_input_shape,
                                               jnp.int32) for _ in train_keys
                                       ]))
        batch = {k: v for k, v in zip(train_keys, input_data)}
        optimizer, metrics, dropout_rngs = train_step(optimizer,
                                                      batch,
                                                      metrics,
                                                      learning_rate_fn,
                                                      dropout_rng=dropout_rngs)
        step += 1
        return optimizer, dropout_rngs, metrics, token, step, epoch

    def device_train_loop(optimizer, dropout_rngs, metrics, step, epoch):
        token = lax.create_token(step)
        optimizer, dropout_rngs, metrics, _, step, _ = lax.while_loop(
            device_train_loop_cond, device_train_loop_body,
            (optimizer, dropout_rngs, metrics, token, step, epoch))
        return optimizer, dropout_rngs, metrics, step

    if num_partitions > 1:
        device_train_loop = sharded_jit(device_train_loop,
                                        in_parts=(optimizer_partitions, None,
                                                  None, None, None),
                                        out_parts=(optimizer_partitions, None,
                                                   None, None))
    p_train_epoch = jax.pmap(device_train_loop,
                             axis_name='batch',
                             in_axes=(None, 0, 0, None, None))

    p_allreduce_metrics_train = functools.partial(lax.psum, axis_name='batch')
    if num_partitions > 1:
        p_allreduce_metrics_train = sharded_jit(p_allreduce_metrics_train,
                                                in_parts=None,
                                                out_parts=None,
                                                num_partitions=num_partitions)
    p_allreduce_metrics_train = jax.pmap(p_allreduce_metrics_train,
                                         axis_name='batch')

    # Precompile all needed computations with fake data so as not to include
    # compilation time in MLPerf metrics.
    if FLAGS.precompile:
        logging.info('precompiling step/epoch functions')
        if FLAGS.infeed:
            # the device training loop condition will immediately be false, but
            # the optimizer tree will be resharded here
            optimizer, *_ = p_train_epoch(unbroadcast(optimizer),
                                          random.split(rng, num_replicas),
                                          empty_metrics,
                                          jnp.array(0, dtype=jnp.int32), 1)
        else:
            metrics = empty_metrics
            train_input_shape = (num_replicas, batch_size // num_replicas,
                                 input_pipeline.MAX_TRAIN_LEN)
            fake_batch = {
                k: jnp.ones(train_input_shape, jnp.int32)
                for k in train_keys
            }
            p_train_step(unbroadcast(optimizer),
                         fake_batch,
                         metrics,
                         dropout_rng=random.split(rng, num_replicas))
        eval_input_shape = (num_replicas, eval_batch_size // num_replicas,
                            input_pipeline.MAX_EVAL_LEN)
        fake_eval_batch = {
            'inputs': jnp.ones(eval_input_shape, jnp.int32),
            'targets': jnp.ones(eval_input_shape, jnp.int32),
        }
        if FLAGS.extra_eval_metrics:
            p_eval_step(unbroadcast(optimizer.target), fake_eval_batch)
        fake_cache = cache_def.initialize_cache(
            (eval_input_shape[1], FLAGS.max_predict_length))
        p_pred_step(fake_eval_batch['inputs'], unbroadcast(optimizer.target),
                    fake_cache)
        time.sleep(20)
        sync_devices()
        fake_bleu_1 = np.zeros((4, ), dtype=np.int32)
        fake_bleu_2 = np.zeros((), dtype=np.int32)
        per_host_sum_pmap((fake_bleu_1, fake_bleu_1, fake_bleu_2, fake_bleu_2))
        sync_devices()
        p_allreduce_metrics_train(empty_metrics)
        sync_devices()
        logging.info('finished precompiling step/epoch functions')

    # We init the first set of dropout PRNG keys, but update it afterwards inside
    # the main pmap'd training update for performance.
    dropout_rngs = random.split(rng, num_replicas)

    # Record time-0 metrics for proper tensorboard plot x-axis scaling.
    if jax.host_id() == 0:
        if FLAGS.compute_train_metrics:
            train_summary_writer.scalar('loss', 9.999, 0)
            train_summary_writer.scalar('accuracy', 0.0, 0)
            train_summary_writer.flush()
        eval_summary_writer.scalar('bleu', 0.0, 0)
        eval_summary_writer.flush()

    train_ds = input_pipeline.get_wmt_dataset(batch_size=batch_size //
                                              jax.host_count(),
                                              train=True)
    eval_ds = input_pipeline.get_wmt_dataset(batch_size=eval_batch_size,
                                             train=False)
    train_iter = iter(train_ds)
    eval_iter = iter(eval_ds)
    local_devices = jax.local_devices()
    host_step, device_step = 0, broadcast(0)
    gc.disable()
    mllogger.end('init_stop')
    if jax.host_id() == 0:
        mllogger.start('run_start')
    for epoch in range(FLAGS.num_epochs):
        if jax.host_id() == 0 and not BLEU_THRESHOLD_REACHED:
            mllogger.start('block_start',
                           metadata={
                               'first_epoch_num': epoch + 1,
                               'epoch_count': 1
                           })
        metrics = empty_metrics
        if FLAGS.infeed:
            optimizer, dropout_rngs, metrics, device_step = p_train_epoch(
                unbroadcast(optimizer), dropout_rngs, metrics,
                unbroadcast(device_step), epoch)
        while int(host_step // steps_per_epoch) == epoch:
            # pylint: disable=protected-access
            batch = jax.tree_map(lambda x: x._numpy(), next(train_iter))
            # Shard data to devices and do a training step.
            batch = jax.tree_map(
                lambda x: x.reshape((num_replicas, -1) + x.shape[1:]), batch)
            if FLAGS.infeed:
                for i, device in enumerate(local_devices):
                    replica_id = i // num_partitions
                    input_tuple = tuple(
                        [batch[k][replica_id] for k in train_keys])
                    assert input_tuple[0].shape == device_train_input_shape, (
                        'infeed shape error %s != %s' %
                        (input_tuple[0].shape, device_train_input_shape))
                    assert input_tuple[0].dtype == jnp.int32, (
                        'infeed dtype error %s != %s' %
                        (input_tuple[0].dtype, jnp.int32))
                    infeed_pool.submit(
                        functools.partial(device.transfer_to_infeed,
                                          input_tuple))
            else:
                optimizer, metrics, dropout_rngs = p_train_step(
                    unbroadcast(optimizer),
                    batch,
                    metrics,
                    dropout_rng=dropout_rngs)
            host_step += 1

        if FLAGS.compute_train_metrics:
            metrics = p_allreduce_metrics_train(metrics)
            # Schedule training metric handling.
            summary_thread.submit(
                functools.partial(write_train_summary, metrics,
                                  train_summary_writer, host_step))

        # Optional, extra evaluation metrics.
        if FLAGS.extra_eval_metrics:
            eval_metrics = []
            eval_iter = iter(eval_ds)
            for _, eval_batch in zip(range(num_eval_steps), eval_iter):
                eval_batch = common_utils.shard(eval_batch)
                metrics = p_eval_step(unbroadcast(optimizer.target),
                                      eval_batch)
                eval_metrics.append(metrics)
            eval_metrics = p_allreduce_metrics(eval_metrics)
            # Schedule metric summarization/logging.
            summary_thread.submit(
                functools.partial(write_eval_summary, eval_metrics,
                                  eval_summary_writer, host_step))

        # Translation and BLEU Score.
        all_predicted, all_targets, all_bs = [], [], []
        for i in range(pred_batches):
            # pylint: disable=protected-access
            pred_batch = jax.tree_map(lambda x: x._numpy(), next(eval_iter))
            # Handle final odd-sized batch by padding instead of dropping it.
            cur_pred_batch_size = pred_batch['inputs'].shape[0]
            if cur_pred_batch_size != eval_batch_size:
                pred_batch = jax.tree_map(
                    lambda x: pad_examples(x, eval_batch_size), pred_batch)
            pred_batch = jax.tree_map(
                lambda x: x.reshape((num_replicas, -1) + x.shape[1:]),
                pred_batch)
            per_device_batchsize = pred_batch['inputs'].shape[1]
            cache = cache_def.initialize_cache(
                (per_device_batchsize, FLAGS.max_predict_length))
            all_predicted.append(
                p_pred_step(pred_batch['inputs'],
                            unbroadcast(optimizer.target), cache))
            all_targets.append(pred_batch['targets'])
            all_bs.append(cur_pred_batch_size)
        # Schedule BLEU calculation and summarization/logging.
        # We use the ICI as part of BLEU score computation, so we call this from the
        # main thread so the BLEU pmap runs before the next train epoch pmap
        write_predict_summary(all_predicted, all_targets, all_bs,
                              target_encoder, eval_summary_writer, epoch,
                              host_step, summary_thread)

    # Wait until computations are done before exiting
    sync_devices()
    if jax.host_id() == 0:
        summary_thread.shutdown()
        if not BLEU_THRESHOLD_REACHED:
            mllogger.end('run_stop', metadata={'status': 'aborted'})
Code Example #22
def profile_with_xprof_on_background(start_after_sec=30, profile_time_sec=1,
                                     device='REDACTED'):
  profiler_thread = thread.ThreadPoolExecutor(jax.local_device_count(), 'xprof')
  profiler_thread.submit(partial(xprof_profile, start_after_sec,
                                 profile_time_sec, device))
Code Example #23

if __name__ == "__main__":
    # run_experiment(0, 0, "placeholder")
    hotncold = """bp.registerBThread("HotBt", function() {
        bp.sync({request:bp.Event("hotEvent")});
        bp.sync({request:bp.Event("hotEvent")});
        bp.sync({request:bp.Event("hotEvent")});
    });

    bp.registerBThread("ColdBt", function() {
        bp.sync({request:bp.Event("coldEvent")});
        bp.sync({request:bp.Event("coldEvent")});
        bp.sync({request:bp.Event("coldEvent")});
    });

    bp.registerBThread("AlternatorBt", function() {
        for(var i = 0; i < 3; i++) {
            bp.sync({waitFor:bp.Event("coldEvent"), block:bp.Event("hotEvent")});
            bp.sync({waitFor:bp.Event("hotEvent"), block:bp.Event("coldEvent")});
        }
        bp.sync({request:bp.Event("allDone")});
    });"""
    print("start")
    with thread.ThreadPoolExecutor(max_workers=4) as e:
        e.submit(thread_check, hotncold)
        e.submit(thread_check, hotncold)
        e.submit(thread_check, hotncold)
        e.submit(thread_check, hotncold)
    print("finish")
Code Example #24
def main(argv):
    del argv
    # BEGIN GOOGLE-INTERNAL
    xm.setup_work_unit()
    # END GOOGLE-INTERNAL

    tf.enable_v2_behavior()
    init_mllogger()

    mllogger.event('cache_clear')
    mllogger.start('init_start')
    mllogger.event('submission_org', 'Google')
    mllogger.event('submission_platform',
                   'TPUv3-{}'.format(jax.device_count()))
    mllogger.event('submission_division', 'closed')
    mllogger.event('submission_status', 'research')
    mllogger.event('submission_benchmark', 'resnet')
    mllogger.event('train_samples', input_pipeline.TRAIN_IMAGES)
    mllogger.event('eval_samples', input_pipeline.EVAL_IMAGES)

    if jax.host_id() == 0:
        summary_writer = tensorboard.SummaryWriter(FLAGS.output_dir)
        # Write summaries in background thread to avoid blocking on device sync
        summary_thread = thread.ThreadPoolExecutor(1, 'summary')
    # Infeed is currently synchronous, so do it in a background thread too
    infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(), 'infeed')

    if FLAGS.seed is not None:
        seed = FLAGS.seed
    else:
        seed = np.uint32(time.time() if jax.host_id() == 0 else 0)
        seed = per_host_sum_pmap(seed)

    mllogger.event('seed', int(seed))
    key = random.PRNGKey(seed)

    batch_size = FLAGS.batch_size
    if batch_size == -1:
        if jax.device_count() > 4096:
            batch_size = 65536
        else:
            batch_size = min(128 * jax.device_count(), 32768)
    mllogger.event('global_batch_size', batch_size)
    eval_batch_size = min(input_pipeline.EVAL_IMAGES, 256 * jax.device_count())
    device_batch_size = batch_size // jax.device_count()
    device_eval_batch_size = int(
        math.ceil(eval_batch_size / jax.device_count()))

    model_dtype = jnp.bfloat16 if FLAGS.bfloat16 else jnp.float32
    input_dtype = tf.bfloat16 if FLAGS.bfloat16 else tf.float32

    num_epochs = FLAGS.num_epochs
    if num_epochs is None:
        if batch_size < 32768:
            num_epochs = 56
        elif batch_size < 65536:
            num_epochs = 64
        else:
            num_epochs = 92

    steps_per_epoch = input_pipeline.TRAIN_IMAGES / batch_size
    # match TF submission behavior (round steps per loop up)
    steps_per_loop = int(math.ceil(steps_per_epoch * FLAGS.epochs_per_loop))
    # also apply rounding loop up to next step to "epochs" in LR schedule
    steps_per_epoch *= steps_per_loop / (steps_per_epoch *
                                         FLAGS.epochs_per_loop)

    steps_per_eval = int(
        math.ceil(input_pipeline.EVAL_IMAGES / eval_batch_size))

    base_learning_rate = FLAGS.learning_rate * batch_size / 256.
    beta = FLAGS.momentum
    if beta is None:
        if batch_size < 32768:
            beta = 0.9
        elif batch_size < 65536:
            beta = 0.929
        else:
            beta = 0.9537213777059405
    weight_decay = FLAGS.weight_decay
    if weight_decay is None:
        weight_decay = 2e-4 if batch_size < 32768 else 1e-4

    space_to_depth = FLAGS.space_to_depth
    if space_to_depth is None:
        space_to_depth = device_batch_size <= 8

    image_format = FLAGS.image_format
    if image_format is None:
        if space_to_depth and device_batch_size <= 8:
            image_format = 'HWNC'
        else:
            image_format = 'HWCN'

    image_size = input_pipeline.IMAGE_SIZE
    if space_to_depth:
        train_input_shape = (device_batch_size, image_size // 2,
                             image_size // 2, 12)
        eval_input_shape = (device_eval_batch_size, image_size // 2,
                            image_size // 2, 12)
    else:
        train_input_shape = (device_batch_size, image_size, image_size, 3)
        eval_input_shape = (device_eval_batch_size, image_size, image_size, 3)
    if image_format == 'HWCN':
        train_input_shape = tuple(train_input_shape[i] for i in [1, 2, 3, 0])
        eval_input_shape = tuple(eval_input_shape[i] for i in [1, 2, 3, 0])
    elif image_format == 'HWNC':
        train_input_shape = tuple(train_input_shape[i] for i in [1, 2, 0, 3])
        eval_input_shape = tuple(eval_input_shape[i] for i in [1, 2, 0, 3])

    model, state = create_model(key, device_batch_size, image_size,
                                model_dtype, space_to_depth)

    if FLAGS.lars:
        mllogger.event('opt_name', 'lars')
        mllogger.event('lars_opt_weight_decay', weight_decay)
        mllogger.event('lars_opt_momentum', beta)
        mllogger.event('lars_epsilon', 0)
        weight_opt_def = optim.LARS(base_learning_rate,
                                    beta,
                                    weight_decay=weight_decay)
        other_opt_def = optim.Momentum(base_learning_rate,
                                       beta,
                                       weight_decay=0,
                                       nesterov=False)
        learning_rate_fn = polynomial_learning_rate_fn(batch_size,
                                                       steps_per_epoch,
                                                       num_epochs)
    else:
        mllogger.event('opt_name', 'sgd')
        mllogger.event('sgd_opt_momentum', beta)
        weight_opt_def = optim.Momentum(base_learning_rate,
                                        beta,
                                        weight_decay=weight_decay,
                                        nesterov=True)
        other_opt_def = optim.Momentum(base_learning_rate,
                                       beta,
                                       weight_decay=0,
                                       nesterov=True)
        learning_rate_fn = piecewise_learning_rate_fn(base_learning_rate,
                                                      steps_per_epoch,
                                                      num_epochs)

    def filter_weights(key, _):
        return 'bias' not in key and 'scale' not in key

    def filter_other(key, _):
        return 'bias' in key or 'scale' in key

    weight_traversal = optim.ModelParamTraversal(filter_weights)
    other_traversal = optim.ModelParamTraversal(filter_other)
    optimizer_def = optim.MultiOptimizer((weight_traversal, weight_opt_def),
                                         (other_traversal, other_opt_def))
    optimizer = optimizer_def.create(model)
    del model  # do not keep a copy of the initial model

    optimizer = broadcast(optimizer)
    state = broadcast(state)
    empty_metrics = broadcast({'samples': 0, 'loss': 0., 'accuracy': 0})

    p_allreduce_metrics = jax.pmap(allreduce_metrics, axis_name='batch')

    p_sync_batchnorm_stats = jax.pmap(sync_batchnorm_stats, axis_name='batch')

    def host_loop_train_step(optimizer, state, metrics):
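        # A symbolic token orders the infeed receive relative to this training step.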
        token = lax.create_token(optimizer.state[0].step)
        batch, token = lax.infeed(token,
                                  shape=(jax.ShapedArray(
                                      train_input_shape, model_dtype),
                                         jax.ShapedArray((device_batch_size, ),
                                                         jnp.int32)))
        optimizer, state, metrics = train_step(optimizer, state, batch,
                                               metrics, learning_rate_fn,
                                               image_format, space_to_depth)
        return optimizer, state, metrics

    p_host_loop_train_step = jax.pmap(host_loop_train_step,
                                      axis_name='batch',
                                      in_axes=(None, 0, 0))

    def host_loop_eval_step(model, state, metrics):
        token = lax.create_token(metrics['samples'])
        batch, token = lax.infeed(
            token,
            shape=(jax.ShapedArray(eval_input_shape, model_dtype),
                   jax.ShapedArray((device_eval_batch_size, ), jnp.int32)))
        metrics = eval_step(model, state, batch, metrics, image_format,
                            space_to_depth)
        return metrics

    p_host_loop_eval_step = jax.pmap(host_loop_eval_step,
                                     axis_name='batch',
                                     in_axes=(None, None, 0))

    def device_train_loop_cond(args):
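        # Keep looping while the global step still falls inside the current loop window.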
        _, _, _, _, step, loop = args
        return step // steps_per_loop == loop

    def device_train_loop_body(args):
        optimizer, state, metrics, token, step, loop = args
        batch, token = lax.infeed(token,
                                  shape=(jax.ShapedArray(
                                      train_input_shape, model_dtype),
                                         jax.ShapedArray((device_batch_size, ),
                                                         jnp.int32)))
        optimizer, state, metrics = train_step(optimizer, state, batch,
                                               metrics, learning_rate_fn,
                                               image_format, space_to_depth)
        step += 1
        return optimizer, state, metrics, token, step, loop

    def device_train_loop(optimizer, state, metrics, step, loop):
        token = lax.create_token(step)
        optimizer, state, metrics, _, step, _ = lax.while_loop(
            device_train_loop_cond, device_train_loop_body,
            (optimizer, state, metrics, token, step, loop))
        state = sync_batchnorm_stats(state)
        metrics = allreduce_metrics(metrics)
        return optimizer, state, metrics, step

    p_train_loop = jax.pmap(device_train_loop,
                            axis_name='batch',
                            in_axes=(None, None, 0, None, None))

    # BEGIN GOOGLE-INTERNAL
    def maybe_start_xprof(seconds):
        if jax.host_id() == 0 and FLAGS.xprof:
            xprof = xprof_session.XprofSession()
            xprof.start_session('REDACTED', True, 2)

            def sleep_and_end_xprof():
                time.sleep(seconds)
                logging.info(
                    'Xprof URL: %s',
                    xprof.end_session_and_get_url(
                        tag='flax resnet, {} devices, batch {} per device'.
                        format(jax.device_count(), device_batch_size)))

            thread.ThreadPoolExecutor(1, 'xprof').submit(sleep_and_end_xprof)

    # END GOOGLE-INTERNAL

    if FLAGS.precompile:
        logging.info('precompiling step/loop functions')
        if FLAGS.device_loop:
            # the device training loop condition will immediately be false
            p_train_loop(unbroadcast(optimizer), unbroadcast(state),
                         empty_metrics, jnp.array(0, dtype=jnp.int32), 1)
        else:
            for device in jax.local_devices():
                images = np.zeros(train_input_shape, model_dtype)
                labels = np.zeros((device_batch_size, ), np.int32)
                infeed_pool.submit(
                    partial(device.transfer_to_infeed, (images, labels)))
            p_host_loop_train_step(unbroadcast(optimizer), state,
                                   empty_metrics)
            p_sync_batchnorm_stats(state)
        for device in jax.local_devices():
            images = np.zeros(eval_input_shape, model_dtype)
            labels = np.zeros((device_eval_batch_size, ), np.int32)
            infeed_pool.submit(
                partial(device.transfer_to_infeed, (images, labels)))
        p_host_loop_eval_step(unbroadcast(optimizer.target),
                              unbroadcast(state), empty_metrics)
        p_allreduce_metrics(empty_metrics)['accuracy'].block_until_ready()
        logging.info('finished precompiling')

    # BEGIN GOOGLE-INTERNAL
    maybe_start_xprof(20)
    # END GOOGLE-INTERNAL
    if not FLAGS.fake_data:
        logging.info('constructing datasets')
        # pylint: disable=g-complex-comprehension
        train_ds, eval_ds = [
            input_pipeline.load_split(
                device_batch_size if train else device_eval_batch_size,
                dtype=input_dtype,
                train=train,
                image_format=image_format,
                space_to_depth=space_to_depth,
                cache_uncompressed=jax.device_count() > 64)
            for train in (True, False)
        ]
        logging.info('constructing dataset iterators')
        train_iter = iter(train_ds)
        eval_iter = iter(eval_ds)

    local_devices = jax.local_devices()
    host_step, device_step = 0, broadcast(0)
    mllogger.end('init_stop')
    mllogger.start('run_start')
    mllogger.start('block_start',
                   metadata={
                       'first_epoch_num': 1,
                       'epoch_count': FLAGS.epochs_per_loop
                   })
    for loop in range(int(math.ceil(num_epochs / FLAGS.epochs_per_loop)) + 2):
        # BEGIN GOOGLE-INTERNAL
        if loop == 10: maybe_start_xprof(1)
        # END GOOGLE-INTERNAL
        metrics = empty_metrics
        if FLAGS.device_loop:
            optimizer, state, metrics, device_step = p_train_loop(
                unbroadcast(optimizer), unbroadcast(state), metrics,
                unbroadcast(device_step), loop)
        while int(host_step // steps_per_loop) == loop:
            if not FLAGS.device_loop:
                optimizer, state, metrics = p_host_loop_train_step(
                    unbroadcast(optimizer), state, metrics)
            # pylint: disable=protected-access
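            # Throttle feeding so at most ~100 pending transfers sit in the pool's queue.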
            while infeed_pool._work_queue.qsize() > 100:
                time.sleep(0.01)
            for device in local_devices:
                if FLAGS.fake_data:
                    images = np.zeros(train_input_shape, model_dtype)
                    labels = np.zeros((device_batch_size, ), np.int32)
                else:
                    # pylint: disable=protected-access
                    images, labels = jax.tree_map(lambda x: x._numpy(),
                                                  next(train_iter))
                assert images.shape == train_input_shape and labels.dtype == jnp.int32
                infeed_pool.submit(
                    partial(device.transfer_to_infeed, (images, labels)))
            host_step += 1
        epoch = (loop + 1) * FLAGS.epochs_per_loop
        if FLAGS.train_metrics:
            if not FLAGS.device_loop:
                metrics = p_allreduce_metrics(metrics)
            if jax.host_id() == 0:
                summary_thread.submit(
                    partial(write_summary, summary_writer, metrics, 'train',
                            epoch))
        if not FLAGS.device_loop:
            state = p_sync_batchnorm_stats(state)
        metrics = empty_metrics
        for _ in range(steps_per_eval):
            metrics = p_host_loop_eval_step(unbroadcast(optimizer.target),
                                            unbroadcast(state), metrics)
            for device in local_devices:
                if FLAGS.fake_data:
                    images = np.zeros(eval_input_shape, model_dtype)
                    labels = np.zeros((device_eval_batch_size, ), np.int32)
                else:
                    # pylint: disable=protected-access
                    images, labels = jax.tree_map(lambda x: x._numpy(),
                                                  next(eval_iter))
                assert images.shape == eval_input_shape and labels.dtype == jnp.int32, \
                    'images.shape={}'.format(images.shape)
                infeed_pool.submit(
                    partial(device.transfer_to_infeed, (images, labels)))
        metrics = p_allreduce_metrics(metrics)
        if jax.host_id() == 0:
            summary_thread.submit(
                partial(write_summary, summary_writer, metrics, 'eval', epoch))
    # Wait until computations are done before exiting
    p_allreduce_metrics(metrics)['accuracy'].block_until_ready()
    if jax.host_id() == 0:
        summary_thread.shutdown()
        if not DONE:
            mllogger.end('run_stop', metadata={'status': 'aborted'})
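The pattern worth extracting from the example above is the single-worker summary pool: metric writing (which forces a device sync) is pushed onto a one-thread executor so the host loop can keep feeding the devices, and shutdown() drains the queued writes at the end. A stripped-down, JAX-free sketch of that shape (write_metrics is an invented stand-in for write_summary):

from concurrent.futures import thread
from functools import partial

summary_pool = thread.ThreadPoolExecutor(1, 'summary')

def write_metrics(step, metrics):
    # Hypothetical sink; a real run would push these to a SummaryWriter instead.
    print('step', step, metrics)

for step in range(3):
    metrics = {'loss': 1.0 / (step + 1)}
    summary_pool.submit(partial(write_metrics, step, metrics))

summary_pool.shutdown()  # joins the worker so queued writes finish before exit
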
コード例 #25
0
ファイル: core.py プロジェクト: longwosion/aiocrontab
    def executor(self) -> thread.ThreadPoolExecutor:
        if self._executor is None:
            self._executor = thread.ThreadPoolExecutor()
        return self._executor
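Read on its own, the method lazily creates the pool on first access and caches it; in the original class it is presumably exposed as a property. A self-contained sketch of the same idea (the Scheduler class is invented for illustration):

from concurrent.futures import thread
from typing import Optional

class Scheduler:
    def __init__(self) -> None:
        self._executor: Optional[thread.ThreadPoolExecutor] = None

    @property
    def executor(self) -> thread.ThreadPoolExecutor:
        # Built on first access, reused on every later call.
        if self._executor is None:
            self._executor = thread.ThreadPoolExecutor()
        return self._executor

scheduler = Scheduler()
scheduler.executor.submit(print, 'hello from the pool').result()
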
コード例 #26
0
ファイル: train.py プロジェクト: zqhfpjlswsqy/google-research
def main(argv):
    global CFG
    CFG = FLAGS.config

    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Guarantee that the JAX bfloat16 extension is used rather than TF bfloat16.
    _ = np.array(jnp.array([1.0], dtype=jnp.bfloat16))

    # Use hardware RNG for bernoulli randoms in dropout mask creation.
    if CFG.hardware_rng:
        models.set_hardware_bernoulli()

    if 'module_import' in CFG and CFG.module_import:
        for module in CFG.module_import:
            importlib.import_module(module)

    if 'additional_task_cache_dirs' in CFG and CFG.additional_task_cache_dirs:
        t5.data.add_global_cache_dirs(CFG.additional_task_cache_dirs)

    num_partitions = CFG.num_partitions
    topology = train_lib.compute_multihost_topology(num_partitions)
    batch_size = CFG.batch_size
    eval_batch_size = CFG.eval_batch_size
    per_replica_set_eval_batch_size = eval_batch_size // topology.num_replica_sets
    if batch_size % topology.num_replicas:
        raise ValueError(
            'Batch size must be divisible by the number of replicas.')

    steps_per_epoch = CFG.steps_per_epoch
    logging.info('steps per epoch: %d', steps_per_epoch)

    broadcast = functools.partial(
        train_lib.broadcast,
        num_replicas=topology.per_replica_set_num_replicas,
        num_partitions=topology.per_host_num_partitions,
        devices=topology.this_host_device_assignment)

    if jax.host_id() == 0:
        tf.io.gfile.makedirs(FLAGS.model_dir)
        tf.io.gfile.copy(FLAGS['config'].config_filename,
                         os.path.join(FLAGS.model_dir, 'config.py'),
                         overwrite=True)
        train_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'train'))
        eval_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'eval'))
    else:
        train_summary_writer = None
        eval_summary_writer = None

    if CFG.infeed:
        # Infeed is currently synchronous, so feed it from a background thread pool.
        infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(),
                                                'infeed')

    (train_ds, eval_ds), eval_cache = input_pipeline.get_datasets_and_cache(
        CFG, topology.num_replica_sets, topology.replica_set_id,
        topology.per_replica_set_host_id)

    vocab = input_pipeline.get_vocabulary(CFG.mixture_or_task_name)
    encoder = vocab.tf_tokenizer
    eos_id = vocab.tokenizer.eos_id()

    def decode_tokens(toks, eos_id=eos_id, max_id=32000):
        """Decode tokens back to unicode."""
        del eos_id
        # TODO(levskaya): T5 doesn't seem to emit EOS tokens?  double check this
        # is the best decoding function or just switch to using tf_decode.
        # valid_toks = toks[:np.argmax(toks == eos_id) + 1].astype(np.int32)
        valid_toks = toks.astype(np.int32)
        valid_toks[valid_toks >= max_id] = 3
        return encoder.detokenize(valid_toks).numpy().decode('utf-8')

    logging.info('Initializing model, optimizer, and step functions.')

    train_config, eval_config, predict_config = get_configs(CFG)

    rng = random.PRNGKey(CFG.random_seed)
    rng, init_rng = random.split(rng)
    # This is used for infeed conversion from feature dict <--> tuple
    train_keys = [
        'inputs', 'targets', 'inputs_position', 'targets_position',
        'inputs_segmentation', 'targets_segmentation'
    ]
    device_train_input_shape = tuple([
        (batch_size // topology.num_replicas,
         CFG.max_input_length if 'inputs' in k else CFG.max_target_length)
        for k in train_keys
    ])

    learning_rate_fn = train_lib.create_learning_rate_scheduler(
        factors=CFG.schedule,
        base_learning_rate=CFG.learning_rate,
        warmup_steps=CFG.warmup_steps)

    # First, we only abstractly initialize the optimizer and model parameters,
    # since the parameters may not even fit in device memory!
    # TODO(jekbradbury): make optimizer_defs compare by value so it can be created
    # in get_initial_params without causing pytree incompatibility
    optimizer_def = optim.Adafactor(CFG.learning_rate,
                                    decay_rate=0.8,
                                    step_offset=CFG.step_offset)
    initialize_params_fn = functools.partial(get_initial_params,
                                             config=CFG,
                                             transformer_config=eval_config,
                                             optimizer_def=optimizer_def)
    optimizer = jax.eval_shape(initialize_params_fn, init_rng)
    # tuple-like pytree leaves for global_arg_shapes
    optimizer_shapes = jax.tree_map(lambda x: partitions.Spec(*x.shape),
                                    optimizer)

    # Build parameter partition annotations for preserving partitions from train
    # to eval.
    if num_partitions > 1:
        optimizer_partitions = optimizer.restore_state(
            partitions.set_partitions(num_partitions, optimizer.state_dict()))
        per_host_optimizer_partitions = optimizer.restore_state(
            partitions.set_partitions(topology.per_host_num_partitions,
                                      optimizer.state_dict()))

    # Restore unreplicated optimizer + model state from last checkpoint.
    # TODO(jekbradbury,levskaya): implement sharded native checkpoint/restore
    existing_checkpoint_found = False
    if CFG.restore_checkpoints:
        existing_checkpoint_found = train_lib.checkpoint_exists(
            FLAGS.model_dir)
        optimizer = checkpoints.restore_checkpoint(FLAGS.model_dir, optimizer)

    # Import a pretrained-T5 checkpoint only if we didn't import a local
    # "native" checkpoint (e.g. due to resuming a pre-empted finetuning run.)
    # TODO(jekbradbury,levskaya): implement sharded T5 checkpoint/restore
    if CFG.restore_t5_checkpoint and not existing_checkpoint_found:
        optimizer = checkpoint_importer.restore_from_t5_checkpoint(
            optimizer, CFG.restore_t5_checkpoint)

    if CFG.restore_t5_checkpoint or existing_checkpoint_found:
        if num_partitions > 1:
            # Until checkpoint/restore is sharded, the restored checkpoint is global
            # and we need to slice each sharded parameter into the chunk containing
            # only the partitions that are present on this host.
            def per_host_chunk(x, spec):
                if spec is None or spec is x:  # unsharded or not a parameter
                    return x
                if spec[0] == 1:
                    dim_size = x.shape[1]
                elif spec[1] == 1:
                    dim_size = x.shape[0]
                else:
                    raise NotImplementedError()
                chunk_size = (dim_size * topology.per_host_num_partitions //
                              num_partitions)
                lower = topology.per_replica_set_host_id * chunk_size
                upper = (topology.per_replica_set_host_id + 1) * chunk_size
                if spec[0] == 1:
                    return x[:, lower:upper]
                else:
                    return x[lower:upper]

            optimizer = jax.tree_multimap(per_host_chunk, optimizer,
                                          optimizer_partitions)
    else:
        # If pretraining and no checkpoint imported, we jit the (sharded-) init
        # function to minimize fragmentation. We use the same pmap(sharded_jit)
        # setup as the training step/loop to initialize everything "in-place" and
        # avoid communication or OOM.
        if num_partitions > 1:
            initialize_params_fn = sharded_jit(
                initialize_params_fn,
                in_parts=None,
                local_in_parts=None,
                out_parts=optimizer_partitions,
                local_out_parts=per_host_optimizer_partitions,
                # devices=one_replica_device_assignment,
            )
            initialize_params_fn = jax.pmap(initialize_params_fn,
                                            'batch',
                                            in_axes=0,
                                            axis_size=topology.num_replicas,
                                            devices=topology.device_assignment)
            init_rng = broadcast(init_rng)
            optimizer = initialize_params_fn(init_rng)
            # We maintain the optimizer in unbroadcasted form (i.e. with no leading
            # replica axis). This is equivalent to the as-yet-nonexistent pmap kwarg
            # out_axes=None.
            optimizer = train_lib.unbroadcast(optimizer)
        else:
            optimizer = jax.jit(initialize_params_fn)(init_rng)

    # ---------------------------------------------------------------------------
    # Compile multidevice versions of train/eval/predict step and cache init fn.
    # ---------------------------------------------------------------------------

    # We can use either a single train-step for a host training loop:

    # train_step(optimizer, batch, prev_metrics, dropout_rng, **kwargs)
    #  --> new_optimizer, metrics, new_dropout_rng
    def p_train_step(optimizer, batch, prev_metrics, dropout_rng):
        return train_lib.train_step(optimizer,
                                    batch,
                                    prev_metrics,
                                    dropout_rng,
                                    config=train_config,
                                    learning_rate_fn=learning_rate_fn,
                                    num_microbatches=CFG.microbatches,
                                    label_smoothing=CFG.label_smoothing,
                                    z_loss=CFG.z_loss,
                                    use_bfloat16=CFG.use_bfloat16)

    if num_partitions > 1:
        p_train_step = sharded_jit(
            p_train_step,
            in_parts=(optimizer_partitions, None, None, None),
            local_in_parts=(per_host_optimizer_partitions, None, None, None),
            out_parts=(optimizer_partitions, None, None),
            local_out_parts=(per_host_optimizer_partitions, None, None))
    # TODO(levskaya): the in_axes spec below might be wrong, double-check.
    p_train_step = jax.pmap(p_train_step,
                            axis_name='batch',
                            in_axes=(None, 0, 0, 0),
                            donate_argnums=(0, ),
                            global_arg_shapes=(optimizer_shapes, None, None,
                                               None),
                            axis_size=topology.num_replicas,
                            devices=topology.device_assignment)  # pytype: disable=wrong-arg-types

    # OR, we use an on-device loop that feeds the training step via infeed queue.
    def device_train_loop_cond(args):
        """Stopping criterion for on-device loop."""
        _, _, _, _, step, epoch = args
        return step // steps_per_epoch == epoch

    def device_train_loop_body(args):
        """On-device loop body."""
        optimizer, dropout_rngs, metrics, token, step, epoch = args
        # Ordering input data from infeed requires threading a symbolic token
        # through the computation.
        input_data, token = lax.infeed(token,
                                       shape=tuple([
                                           jax.ShapedArray(s, jnp.int32)
                                           for s in device_train_input_shape
                                       ]))
        # Rebuild input dict from infeed data tuple.
        batch = {k: v for k, v in zip(train_keys, input_data)}
        # Run the train_step function and return the loop state.
        optimizer, metrics, dropout_rngs = train_lib.train_step(
            optimizer,
            batch,
            metrics,
            dropout_rngs,
            train_config,
            learning_rate_fn,
            num_microbatches=CFG.microbatches,
            label_smoothing=CFG.label_smoothing,
            z_loss=CFG.z_loss)
        step += 1
        return optimizer, dropout_rngs, metrics, token, step, epoch

    def device_train_loop(optimizer, dropout_rngs, metrics, step, epoch):
        # Create symbolic token for threading infeed data.
        token = lax.create_token(step)
        # Run on-device loop.
        optimizer, dropout_rngs, metrics, _, step, _ = lax.while_loop(
            device_train_loop_cond, device_train_loop_body,
            (optimizer, dropout_rngs, metrics, token, step, epoch))
        return optimizer, dropout_rngs, metrics, step

    if num_partitions > 1:
        device_train_loop = sharded_jit(
            device_train_loop,
            in_parts=(optimizer_partitions, None, None, None, None),
            local_in_parts=(per_host_optimizer_partitions, None, None, None,
                            None),
            out_parts=(optimizer_partitions, None, None, None),
            local_out_parts=(per_host_optimizer_partitions, None, None, None))
    p_train_epoch = jax.pmap(device_train_loop,
                             axis_name='batch',
                             in_axes=(None, 0, 0, None, None),
                             donate_argnums=(0, ),
                             global_arg_shapes=(optimizer_shapes, None, None,
                                                None, None),
                             axis_size=topology.num_replicas,
                             devices=topology.device_assignment)  # pytype: disable=wrong-arg-types

    # Reduction psum for metric data.

    def p_allreduce_metrics(x):
        return lax.psum(x, axis_name='batch')

    if num_partitions > 1:
        p_allreduce_metrics = sharded_jit(
            p_allreduce_metrics,
            in_parts=None,
            local_in_parts=None,
            out_parts=None,
            local_out_parts=None,
            num_partitions=num_partitions,
            local_num_partitions=topology.per_host_num_partitions)
    p_allreduce_metrics = jax.pmap(p_allreduce_metrics,
                                   axis_name='batch',
                                   global_arg_shapes=None,
                                   axis_size=topology.num_replicas,
                                   devices=topology.device_assignment)

    # Training evaluation computation.

    # eval_step(params, batch, config, label_smoothing=0.0) --> metrics
    def p_eval_step(params, batch):
        return train_lib.eval_step(params,
                                   batch,
                                   config=eval_config,
                                   label_smoothing=CFG.label_smoothing)

    if num_partitions > 1:
        p_eval_step = sharded_jit(
            p_eval_step,
            in_parts=(optimizer_partitions.target, None),
            local_in_parts=(per_host_optimizer_partitions.target, None),
            out_parts=None,
            local_out_parts=None)
    p_eval_step = jax.pmap(p_eval_step,
                           axis_name='batch',
                           in_axes=(None, 0),
                           global_arg_shapes=(optimizer_shapes.target, None),
                           axis_size=topology.num_replicas,
                           devices=topology.device_assignment)  # pytype: disable=wrong-arg-types

    # Fast autoregressive decoding loop.
    # For inference and model evaluation.

    # predict_step(inputs, params,
    #              eos_id, max_decode_len, config, beam_size=4) --> beam_seqs
    def p_pred_step(inputs, params):
        return train_lib.predict_step(inputs, params, eos_id,
                                      CFG.max_eval_target_length,
                                      predict_config, CFG.beam_size)

    if num_partitions > 1:
        p_pred_step = sharded_jit(
            p_pred_step,
            in_parts=(None, optimizer_partitions.target),
            local_in_parts=(None, per_host_optimizer_partitions.target),
            out_parts=None,
            local_out_parts=None)
    p_pred_step = jax.pmap(p_pred_step,
                           axis_name='batch',
                           in_axes=(0, None),
                           global_arg_shapes=(None, optimizer_shapes.target),
                           axis_size=topology.num_replicas,
                           devices=topology.device_assignment)  # pytype: disable=wrong-arg-types

    # ---------------------------------------------------------------------------
    # Main Train Loop
    # ---------------------------------------------------------------------------

    # We init the first set of dropout PRNG keys, but update it afterwards inside
    # the main pmap'd training update for performance.
    # There should be a unique dropout key for each replica represented on this
    # host, but the key should be the same for the same replica on other hosts.
    # Again, this is what the replica set abstraction is for.
    dropout_rngs = random.split(random.fold_in(rng, topology.replica_set_id),
                                topology.per_replica_set_num_replicas)
    # restore step from last checkpoint
    host_step = int(optimizer.state.step)
    empty_metrics = broadcast({
        'loss': 0.0,
        'accuracy': 0.0,
        'learning_rate': 0.0,
        'denominator': 0.0
    })
    if CFG.infeed:
        # TODO(jekbradbury): support something like this for the Python-loop case
        logging.info(
            'Precompiling training loop and moving optimizer to device.')
        optimizer, _, metrics, _ = p_train_epoch(optimizer, dropout_rngs,
                                                 empty_metrics,
                                                 jnp.array(0,
                                                           dtype=jnp.int32), 1)
        optimizer = train_lib.unbroadcast(optimizer)
        metrics['loss'].block_until_ready()

    logging.info('Starting training loop.')

    local_devices = jax.local_devices()
    device_step = broadcast(host_step)
    first_epoch = host_step // steps_per_epoch

    # Main Loop over "epochs".
    train_iter = train_ds.as_numpy_iterator()
    for epoch in range(first_epoch, first_epoch + CFG.num_epochs):
        metrics = empty_metrics

        # NOTE: 'optimizer' is unbroadcast by construction at initialization or
        # when loading a checkpoint. It is maintained in 'unbroadcast' state to
        # enable the XLA cross-replica sharding optimization.  The broadcasting is
        # handled automatically by the pmap'd functions that use it.

        # Gather all task evaluation metrics.
        logging.info('Evaluating tasks.')
        if epoch == first_epoch + 1:
            train_lib.sync_devices()
        for task in eval_cache.tasks:
            logging.info('Evaluating task %s', task.name)
            all_predicted, all_bs = [], []
            for pred_batch in eval_cache.preprocessed_examples[task.name]:
                # Handle final odd-sized batch by padding instead of dropping it.
                input_batch, unpadded_batch_size = train_lib.pad_batch_to_size(
                    pred_batch['inputs'], per_replica_set_eval_batch_size)
                all_bs.append(unpadded_batch_size)
                # Split batch dimensions for pmap.
                input_batch = jax.tree_map(
                    lambda x: x.reshape((topology.per_replica_set_num_replicas,
                                         -1) + x.shape[1:]), input_batch)
                # Run fast inference on batch.
                all_predicted.append(p_pred_step(input_batch,
                                                 optimizer.target))

            # Pad out the number of batches so each host has the same number.
            max_host_batch_number = np.max(
                eval_cache.preprocessed_batch_sizes[task.name])
            batch_shortfall = max_host_batch_number - len(all_predicted)
            if batch_shortfall > 0:
                # TODO(levskaya): Fix for case of entirely empty all_predicted.
                # To make sure the cross-host barriers work, we run the program the same
                # number of times on all hosts. The results of this call is ignored, and
                # the predictions are populated with zeros instead.
                p_pred_step(input_batch, optimizer.target)  # Dummy call.
                all_predicted.extend([jnp.zeros_like(all_predicted[0])] *
                                     batch_shortfall)
                all_bs.extend([0] * batch_shortfall)
            all_predicted = jnp.concatenate(all_predicted)
            all_bs = jnp.array(all_bs)

            # Collect all batches from across hosts and reverse sharding.
            all_predicted = train_lib.host_allgather(
                all_predicted, topology.num_replica_sets,
                topology.replica_set_id, topology.per_replica_set_host_id == 0)
            seqlength = all_predicted.shape[-1]
            total_examples = np.sum(
                train_lib.host_allgather(
                    all_bs, topology.num_replica_sets, topology.replica_set_id,
                    topology.per_replica_set_host_id == 0))
            del all_bs
            assert total_examples == len(eval_cache.examples[task.name]), (
                'Total number of batches incorrect for task %s.' % task.name)
            # De-shard the collected predicted tokens and remove padding.
            all_predicted = np.transpose(all_predicted, (1, 2, 0, 3)).reshape(
                -1, seqlength)[:total_examples]

            # We now run the post-processing and metric-fns on a single host.
            if jax.host_id() == 0:
                assert eval_summary_writer
                raw_predictions = []
                for tokens in all_predicted:
                    raw_predictions.append(decode_tokens(tokens))

                # post-process predictions for metric fns
                predictions = [
                    task.postprocess_fn(p, example=ex) for p, ex in zip(
                        raw_predictions, eval_cache.examples[task.name])
                ]

                for metric_fn in task.metric_fns:
                    scores = metric_fn(eval_cache.targets[task.name],
                                       predictions)
                    for metric_name, metric_value in scores.items():
                        tag = f'eval/{task.name}/{metric_name}'
                        eval_summary_writer.scalar(tag, metric_value,
                                                   host_step)
                        logging.info('EVAL %s at step %d: %.3f', tag,
                                     host_step, metric_value)
                    eval_summary_writer.flush()

                # Save text samples for tensorboard.
                exemplars = ''
                for n in np.random.choice(np.arange(len(predictions)), 8):
                    tgt_txt = tf.compat.as_text(
                        eval_cache.examples[task.name][n]['targets_plaintext'])
                    pred_txt = raw_predictions[n]
                    exemplars += (f'{eval_cache.inputs[task.name][n]}\n\n'
                                  f'target: {tgt_txt}\n\n'
                                  f'prediction: {pred_txt}\n\n')
                eval_summary_writer.text(f'{task.name} samples', exemplars,
                                         host_step)
                eval_summary_writer.flush()

        # Take an Xprof trace after the first loop has compiled everything.
        if epoch == first_epoch + 1:
            train_lib.sync_devices()

        # For on-device loop, we launch the computation before feeding data.
        logging.info('BEGIN Train loop.')
        if CFG.infeed:
            optimizer, dropout_rngs, metrics, device_step = p_train_epoch(
                optimizer, dropout_rngs, metrics,
                train_lib.unbroadcast(device_step), epoch)
            optimizer = train_lib.unbroadcast(optimizer)

        # Epoch loop.
        while int(host_step // steps_per_epoch) == epoch:
            batch = next(train_iter)
            batch = jax.tree_map(
                lambda x: x.reshape(
                    (topology.per_replica_set_num_replicas, -1) + x.shape[1:]),
                batch)
            # Feed the on-device training loop.
            if CFG.infeed:
                for i, device in enumerate(local_devices):
                    # When using infeed to provide data to the computation, we're on our
                    # own for feeding the right values to the right devices. Each device
                    # should get the minibatch corresponding to its replica, a slice of
                    # the larger batch corresponding to the host's replica set.
                    if device.platform == 'tpu':
                        device_coords = (*device.coords, device.id % 2)
                    else:
                        device_coords = (device.host_id, i)
                    per_replica_set_device_coords = tuple(
                        dc % prsm for dc, prsm in zip(
                            device_coords, topology.per_replica_set_mesh))
                    per_replica_set_replica_coords = tuple(
                        prsdc // prm
                        for prsdc, prm in zip(per_replica_set_device_coords,
                                              topology.per_replica_mesh))
                    per_replica_set_replica_id = 0
                    for prsm, prm, prsrc in zip(
                            topology.per_replica_set_mesh,
                            topology.per_replica_mesh,
                            per_replica_set_replica_coords):
                        per_replica_set_replica_id = (
                            per_replica_set_replica_id * prsm // prm + prsrc)
                    input_tuple = tuple([
                        batch[k][per_replica_set_replica_id]
                        for k in train_keys
                    ])
                    # Safety check: infeed does not check shape or types but requires
                    # them to agree with on-device spec, otherwise the queue and program
                    # stalls.
                    tuple_shapes = jax.tree_map(jnp.shape, input_tuple)
                    tuple_dtypes = jax.tree_map(lambda x: x.dtype, input_tuple)
                    assert tuple_shapes == device_train_input_shape, (
                        'infeed shape error %s != %s' %
                        (tuple_shapes, device_train_input_shape))
                    assert tuple(set(tuple_dtypes)) == (jnp.int32,), \
                        ('infeed dtype error %s not all of type %s' % (
                            tuple_dtypes, jnp.int32))
                    infeed_pool.submit(
                        functools.partial(device.transfer_to_infeed,
                                          input_tuple))
            # Host training loop.
            else:
                optimizer, metrics, dropout_rngs = p_train_step(
                    optimizer, batch, metrics, dropout_rngs)
                optimizer = train_lib.unbroadcast(optimizer)
            host_step += 1
        logging.info('END Train loop.')

        # Maybe save a checkpoint on one host.
        if (CFG.save_checkpoints
                and epoch % CFG.checkpoint_freq == CFG.checkpoint_freq - 1
                and jax.host_id() == 0):
            checkpoints.save_checkpoint(FLAGS.model_dir, optimizer, host_step)

        # Gather training metrics.
        metrics = p_allreduce_metrics(metrics)
        metrics = jax.tree_map(lambda x: jax.device_get(x[0]), metrics)
        denominator = metrics.pop('denominator')
        summary = jax.tree_map(lambda x: x / denominator, metrics)  # pylint: disable=cell-var-from-loop
        logging.info('train in step: %s, %s', host_step, summary)
        if jax.host_id() == 0:
            assert train_summary_writer
            for key, val in summary.items():
                train_summary_writer.scalar(key, val, host_step)
            train_summary_writer.flush()

        # Gather training evaluation metrics.
        logging.info('Gathering training evaluation metrics.')
        eval_metrics = []
        eval_iter = eval_ds.as_numpy_iterator()
        for _, eval_batch in zip(range(CFG.num_eval_steps), eval_iter):
            eval_batch = jax.tree_map(
                lambda x: x.reshape(
                    (topology.per_replica_set_num_replicas, -1) + x.shape[1:]),
                eval_batch)
            metrics = p_eval_step(optimizer.target, eval_batch)
            eval_metrics.append(metrics)
        # average metrics across devices
        eval_metrics = p_allreduce_metrics(eval_metrics)
        eval_metrics = common_utils.get_metrics(eval_metrics)
        # average metrics across steps
        eval_metrics = jax.tree_map(np.sum, eval_metrics)
        eval_denominator = eval_metrics.pop('denominator')
        eval_summary = jax.tree_map(
            lambda x: x / eval_denominator,  # pylint: disable=cell-var-from-loop
            eval_metrics)
        logging.info('eval in step: %s, %s', host_step, eval_summary)
        if jax.host_id() == 0:
            assert eval_summary_writer
            for key, val in eval_summary.items():
                eval_summary_writer.scalar(key, val, host_step)
            eval_summary_writer.flush()

    # Wait until computations are done before exiting
    logging.info('Finished.')
    train_lib.sync_devices()
    # Shut down the infeed threadpool.
    if CFG.infeed:
        infeed_pool.shutdown()
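Both this example and #24 size the infeed pool to jax.local_device_count(), giving each local device its own feeder thread so transfer_to_infeed calls for different devices can overlap. A toy, JAX-free sketch of that shape (FakeDevice is invented for illustration):

from concurrent.futures import thread
from functools import partial

class FakeDevice:
    # Stand-in for a JAX device; it only records what it was fed.
    def __init__(self, idx):
        self.idx = idx

    def transfer_to_infeed(self, data):
        print('device', self.idx, 'received', data)

devices = [FakeDevice(i) for i in range(4)]
# One worker per device, mirroring ThreadPoolExecutor(jax.local_device_count(), 'infeed').
infeed_pool = thread.ThreadPoolExecutor(len(devices), 'infeed')
for step in range(2):
    for d in devices:
        infeed_pool.submit(partial(d.transfer_to_infeed, (step,)))
infeed_pool.shutdown()  # wait for every queued transfer
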
コード例 #27
0
def main(argv):
    del argv
    # BEGIN GOOGLE-INTERNAL
    xm.setup_work_unit()
    # END GOOGLE-INTERNAL

    tf.enable_v2_behavior()

    if jax.host_id() == 0:
        summary_writer = tensorboard.SummaryWriter(FLAGS.output_dir)
        # Write summaries in background thread to avoid blocking on device sync
        summary_thread = thread.ThreadPoolExecutor(1, 'summary')
    if FLAGS.infeed:
        # Infeed is currently synchronous, so do it in a background thread too
        infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(),
                                                'infeed')

    rng = random.PRNGKey(0)

    image_size = 224

    batch_size = FLAGS.batch_size
    if batch_size is None:
        batch_size = min(128 * jax.device_count(), 32768)
    eval_batch_size = 128 * jax.device_count()
    local_batch_size = batch_size // jax.host_count()
    local_eval_batch_size = eval_batch_size // jax.host_count()
    device_batch_size = batch_size // jax.device_count()
    device_eval_batch_size = eval_batch_size // jax.device_count()
    device_last_eval_batch_size = (input_pipeline.EVAL_IMAGES %
                                   eval_batch_size) // jax.device_count()

    model_dtype = jnp.bfloat16 if FLAGS.bfloat16 else jnp.float32
    input_dtype = tf.bfloat16 if FLAGS.bfloat16 else tf.float32
    if FLAGS.transpose_images:
        train_input_shape = (224, 224, 3, device_batch_size)
        eval_input_shapes = [(224, 224, 3, bs)
                             for bs in (device_eval_batch_size,
                                        device_last_eval_batch_size)]
    else:
        train_input_shape = (device_batch_size, 224, 224, 3)
        eval_input_shapes = [(bs, 224, 224, 3)
                             for bs in (device_eval_batch_size,
                                        device_last_eval_batch_size)]

    num_epochs = FLAGS.num_epochs
    steps_per_epoch = input_pipeline.TRAIN_IMAGES / batch_size
    logging.info('steps_per_epoch: %f', steps_per_epoch)
    steps_per_eval = int(np.ceil(input_pipeline.EVAL_IMAGES / eval_batch_size))
    logging.info('steps_per_eval: %d', steps_per_eval)

    base_learning_rate = FLAGS.learning_rate * batch_size / 256.
    beta = FLAGS.momentum
    weight_decay = FLAGS.weight_decay

    logging.info('creating and initializing model and optimizer')
    model, state = create_model(rng, device_batch_size, image_size,
                                model_dtype)
    state = jax_utils.replicate(state)
    if FLAGS.lars:
        weight_opt_def = optim.LARS(base_learning_rate,
                                    beta,
                                    weight_decay=weight_decay)
        other_opt_def = optim.Momentum(base_learning_rate,
                                       beta,
                                       weight_decay=0,
                                       nesterov=False)
        learning_rate_fn = polynomial_learning_rate_fn(batch_size,
                                                       steps_per_epoch,
                                                       num_epochs)
    else:
        weight_opt_def = optim.Momentum(base_learning_rate,
                                        beta,
                                        weight_decay=weight_decay,
                                        nesterov=True)
        other_opt_def = optim.Momentum(base_learning_rate,
                                       beta,
                                       weight_decay=0,
                                       nesterov=True)
        learning_rate_fn = piecewise_learning_rate_fn(base_learning_rate,
                                                      steps_per_epoch,
                                                      num_epochs)

    def filter_weights(key, _):
        return 'bias' not in key and 'scale' not in key

    def filter_other(key, _):
        return 'bias' in key or 'scale' in key

    weight_traversal = optim.ModelParamTraversal(filter_weights)
    other_traversal = optim.ModelParamTraversal(filter_other)
    optimizer_def = optim.MultiOptimizer((weight_traversal, weight_opt_def),
                                         (other_traversal, other_opt_def))
    optimizer = optimizer_def.create(model)
    optimizer = optimizer.replicate()
    del model  # do not keep a copy of the initial model

    p_train_step = jax.pmap(partial(train_step,
                                    learning_rate_fn=learning_rate_fn),
                            axis_name='batch')
    p_eval_step = jax.pmap(eval_step, axis_name='batch')

    def device_train_loop_cond(args):
        _, _, _, _, step, epoch = args
        return step // steps_per_epoch == epoch

    def device_train_loop_body(args):
        optimizer, state, metrics, token, step, epoch = args
        (images, labels), token = lax.infeed(
            token,
            shape=(jax.ShapedArray(train_input_shape, model_dtype),
                   jax.ShapedArray((device_batch_size, ), jnp.int32)))
        batch = {'image': images, 'label': labels}
        optimizer, state, metrics = train_step(optimizer, state, batch,
                                               metrics, learning_rate_fn)
        step += 1
        return optimizer, state, metrics, token, step, epoch

    def device_train_loop(optimizer, state, metrics, step, epoch):
        token = lax.create_token(step)
        optimizer, state, metrics, _, step, _ = lax.while_loop(
            device_train_loop_cond, device_train_loop_body,
            (optimizer, state, metrics, token, step, epoch))
        return optimizer, state, metrics, step

    p_train_epoch = jax.pmap(device_train_loop, axis_name='batch')

    if FLAGS.precompile:
        logging.info('precompiling step/epoch functions')
        if FLAGS.infeed:
            # the device training loop condition will immediately be false
            p_train_epoch(optimizer, state, empty_metrics(),
                          jax_utils.replicate(0), jax_utils.replicate(1))
        else:
            batch = {
                'image':
                jnp.zeros((jax.local_device_count(), ) + train_input_shape,
                          model_dtype),
                'label':
                jnp.zeros((jax.local_device_count(), ) + (device_batch_size, ),
                          jnp.int32)
            }
            p_train_step(optimizer, state, batch, empty_metrics())
        for dbs, eis in zip(
            [device_eval_batch_size, device_last_eval_batch_size],
                eval_input_shapes):
            batch = {
                'image':
                jnp.zeros((jax.local_device_count(), ) + eis, model_dtype),
                'label':
                jnp.zeros((jax.local_device_count(), ) + (dbs, ), jnp.int32)
            }
            p_eval_step(optimizer.target, state, batch, empty_metrics())
        allreduce_metrics(empty_metrics())
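        # Also pre-compile the cross-replica mean used below to sync batch-norm state.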
        pmean = functools.partial(jax.lax.pmean, axis_name='batch')
        jax.pmap(pmean, axis_name='batch')(state)

    logging.info('constructing datasets')
    # pylint: disable=g-complex-comprehension
    train_ds, eval_ds = [
        input_pipeline.load_split(
            local_batch_size if train else local_eval_batch_size,
            image_size=image_size,
            dtype=input_dtype,
            train=train,
            transpose_images=FLAGS.transpose_images) for train in (True, False)
    ]
    # pylint: enable=g-complex-comprehension
    logging.info('constructing dataset iterators')
    train_iter = iter(train_ds)
    eval_iter = iter(eval_ds)

    logging.info('beginning training')
    host_step, device_step = 0, jax_utils.replicate(0)
    for epoch in range(num_epochs):
        device_epoch = jax_utils.replicate(epoch)
        metrics = empty_metrics()
        if FLAGS.infeed:
            optimizer, state, metrics, device_step = p_train_epoch(
                optimizer, state, metrics, device_step, device_epoch)
        while int(host_step // steps_per_epoch) == epoch:
            batch = jax.tree_map(lambda x: x._numpy(), next(train_iter))  # pylint: disable=protected-access
            if FLAGS.infeed:
                for i, device in enumerate(jax.local_devices()):
                    images, labels = batch['image'][i], batch['label'][i]
                    assert images.shape == train_input_shape and labels.dtype == jnp.int32
                    infeed_pool.submit(
                        partial(device.transfer_to_infeed, (images, labels)))
            else:
                optimizer, state, metrics = p_train_step(
                    optimizer, state, batch, metrics)
            host_step += 1
        if FLAGS.train_metrics:
            metrics = allreduce_metrics(metrics)
            if jax.host_id() == 0:
                summary_thread.submit(
                    partial(write_summary, summary_writer, metrics, 'train',
                            epoch + 1))
        if not FLAGS.distributed_batchnorm:  # otherwise it's already synced
            pmean = functools.partial(jax.lax.pmean, axis_name='batch')
            state = jax.pmap(pmean, axis_name='batch')(state)
        metrics = empty_metrics()
        for _ in range(steps_per_eval):
            batch = jax.tree_map(lambda x: x._numpy(), next(eval_iter))  # pylint: disable=protected-access
            metrics = p_eval_step(optimizer.target, state, batch, metrics)
        metrics = allreduce_metrics(metrics)
        if jax.host_id() == 0:
            summary_thread.submit(
                partial(write_summary, summary_writer, metrics, 'eval',
                        epoch + 1))
        # TODO(deveci): do something like this from the summary thread:
        # if summary['accuracy'] > TARGET_ACCURACY:
        #   break
    if jax.host_id() == 0:
        summary_thread.shutdown()
    # Wait until computations are done before exiting
    jax.random.normal(jax.random.PRNGKey(0), ()).block_until_ready()
コード例 #28
0
# -*- coding: utf-8 -*-
import functools
from concurrent.futures import thread

executor_pool = thread.ThreadPoolExecutor()


def run_on_executor(fn):
    """
    Decorator to run a synchronous method asynchronously
    """
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        future = executor_pool.submit(fn, *args, **kwargs)
        return future

    return wrapper
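A quick usage example for the decorator above (slow_add is invented for illustration):

@run_on_executor
def slow_add(a, b):
    return a + b

future = slow_add(1, 2)   # returns a concurrent.futures.Future immediately
print(future.result())    # blocks until the worker thread finishes -> 3
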
コード例 #29
0
def predict(argv):
    import redisclient
    from load_config import load_config
    args = ctparser.parse_args(argv)
    conf = load_config('../conf/servers.yaml' if len(args.config) ==
                       0 else os.path.join(cur_dir, args.config))
    rc = redisclient.predictClient(conf)

    def predAndDraw(target, args):
        try:
            result = rc.sendRequest(args.model,
                                    target[0],
                                    priority=args.priority,
                                    thresh=args.printthresh)
            assert result.status == 0, 'Invalid response!'
            if target[1] is not None:
                print_boxes(
                    target[0],
                    os.path.join(target[1], os.path.basename(target[0])),
                    result.dets, args.printthresh)
        except Exception:
            return (None, target)
        return (result, target)

    csvfile = open(os.path.join(cur_dir, args.csv),
                   'w') if len(args.csv) > 0 else None
    xsfile = open(os.path.join(cur_dir, args.xsize),
                  'w') if len(args.xsize) > 0 else None
    outpath = os.path.join(cur_dir,
                           args.output) if len(args.output) > 0 else None
    if outpath is not None and not os.path.exists(outpath):
        os.makedirs(outpath)

    imagelist = []
    videolist = []
    args.input = os.path.join(cur_dir, args.input)

    if args.action == 'image':
        imagelist.append((args.input, outpath))
    elif args.action == 'images':
        for name in os.listdir(args.input):
            rl = mimetypes.guess_type(name)
            if rl[0] is not None and rl[0].startswith('image'):
                imagelist.append((os.path.join(args.input, name), outpath))
    elif args.action == 'video':
        videolist.append(
            videoprocess(
                os.path.join(cur_dir, args.input),
                os.path.join(cur_dir, args.output,
                             os.path.basename(args.input)), args.fps))
    else:
        raise NotImplementedError

    for video in videolist:
        imagelist.extend(video.getFrames())

    imagelist.sort()

    with thread.ThreadPoolExecutor(8) as pool:
        for i in range(args.retry + 1):
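            # Each retry pass re-submits only the targets that failed on the previous pass.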
            if len(imagelist) == 0:
                print('all files have been processed')
                break
            wklist = [
                pool.submit(predAndDraw, target, args) for target in imagelist
            ]
            imagelist = []
            for wk in wklist:
                result = wk.result()
                if result[0] is None:
                    imagelist.append(result[1])
                else:
                    print('{} processed'.format(result[1][0]))
                    if csvfile:
                        csvfile.write(
                            message2csv(
                                result[0].dets,
                                os.path.splitext(os.path.basename(
                                    result[1][0]))[0]))
                    if xsfile:
                        xsfile.write('{}:{},{},{}\n'.format(
                            os.path.splitext(os.path.basename(
                                result[1][0]))[0], result[0].xsize,
                            result[0].width, result[0].height))
    if csvfile:
        csvfile.close()
    if xsfile:
        xsfile.close()

    for target in imagelist:
        print('failed to process {}'.format(target[0]))
    for video in videolist:
        video.getVideo()
コード例 #30
0
def run_thread_by_pool(foo, argslist):
    # Submit every argument to the pool; leaving the with-block waits for all of them.
    with thread.ThreadPoolExecutor(max_workers=100) as executor:
        for arg in argslist:
            executor.submit(foo, arg)
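A small smoke test for the helper above (the worker function is invented for illustration):

def show(item):
    print('processing', item)

run_thread_by_pool(show, ['a', 'b', 'c'])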