Example #1
 def print_process(self):
     if not self.parent.interactive and not log.v[5]:
         return
     start_elapsed = time.time() - self.parent.start_time
     complete = self.parent.batches.completed_frac()
     assert complete > 0
     total_time_estimated = start_elapsed / complete
     remaining_estimated = total_time_estimated - start_elapsed
     if log.verbose[5]:
         mem_usage = self.device_mem_usage_str(self.alloc_devices)
         info = [
             self.parent.report_prefix,
             "batch %i" % self.run_start_batch_idx
         ]
         if self.eval_info:  # Such as score.
             info += [
                 "%s %s" % item
                 for item in sorted(self.eval_info.items())
             ]
         info += [
             "elapsed %s" % hms(start_elapsed),
             "exp. remaining %s" % hms(remaining_estimated),
             "complete %.02f%%" % (complete * 100)
         ]
         if mem_usage:
             info += ["memory %s" % mem_usage]
         print(", ".join(filter(None, info)), file=log.v5)
     if self.parent.interactive:
         progress_bar(complete, hms(remaining_estimated))
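The remaining-time figure above is a simple linear extrapolation: if a fraction "complete" of the batches took start_elapsed seconds, the whole run is projected to take start_elapsed / complete seconds. A minimal standalone sketch of that estimate (the hms helper here is a simplified stand-in for RETURNN's Util.hms):

import time

def hms(seconds):
    # Simplified stand-in for RETURNN's Util.hms.
    m, s = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    return "%d:%02d:%02d" % (h, m, s)

def estimate_remaining(start_time, complete):
    # Linear extrapolation, as in print_process above:
    # total ~ elapsed / fraction complete; remaining = total - elapsed.
    assert complete > 0
    start_elapsed = time.time() - start_time
    total_time_estimated = start_elapsed / complete
    return total_time_estimated - start_elapsed

# E.g. 30% done after 90 seconds -> about 0:03:30 remaining.
print(hms(estimate_remaining(time.time() - 90, 0.3)))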
Example #2
 def _print_process(self, report_prefix, step, step_duration, eval_info):
     if not self._show_interactive_process_bar and not log.v[5]:
         return
     start_elapsed = time.time() - self.start_time
     complete = self.data_provider.batches.completed_frac()
     assert complete > 0
     total_time_estimated = start_elapsed / complete
     remaining_estimated = total_time_estimated - start_elapsed
     if log.verbose[5]:
         info = [report_prefix, "step %i" % step]
         if eval_info:  # Such as score.
             info += ["%s %s" % item for item in sorted(eval_info.items())]
         info += [
             "%.3f sec/step" % step_duration,
             "elapsed %s" % hms(start_elapsed),
             "exp. remaining %s" % hms(remaining_estimated),
             "complete %.02f%%" % (complete * 100)
         ]
         print(", ".join(filter(None, info)), file=log.v5)
     elif self._show_interactive_process_bar:
         from Util import progress_bar
         progress_bar(complete, hms(remaining_estimated))
Example #3
 def print_process(self):
   if not self.parent.interactive and not log.v[5]:
     return
   start_elapsed = time.time() - self.parent.start_time
   complete = self.parent.batches.completed_frac()
   assert complete > 0
   total_time_estimated = start_elapsed / complete
   remaining_estimated = total_time_estimated - start_elapsed
   if log.verbose[5]:
     mem_usage = self.device_mem_usage_str(self.alloc_devices)
     info = [
       self.parent.report_prefix,
       "batch %i" % self.run_start_batch_idx]
     if self.eval_info:  # Such as score.
       info += ["%s %s" % item for item in sorted(self.eval_info.items())]
     info += [
       "elapsed %s" % hms(start_elapsed),
       "exp. remaining %s" % hms(remaining_estimated),
       "complete %.02f%%" % (complete * 100)]
     if mem_usage:
       info += ["memory %s" % mem_usage]
      print(", ".join(filter(None, info)), file=log.v5)
   if self.parent.interactive:
     progress_bar(complete, hms(remaining_estimated))
Example #4
    def run_inner(self):
        self.start_time = time.time()
        for device in self.devices:
            device.prepare(epoch=self.epoch, **self.get_device_prepare_args())
        self.initialize()
        terminal_width, _ = terminal_size()
        self.interactive = (log.v[3] and terminal_width >= 0)
        print("starting task", self.task, file=log.v5)

        for device in self.devices:
            device.eval_batch_idx = -1
            device.start_epoch_stats()
            device.num_frames = 0
            device.num_updates = 0
            device.tot = 0

        num_device_runs = 1 if self.share_batches else len(self.devices)
        deviceRuns = [
            self.DeviceBatchRun(
                self,
                [self.devices[i]] if not self.share_batches else self.devices)
            for i in range(num_device_runs)
        ]

        results = {'batchess': [], 'results': [], 'num_frames': NumbersDict(0)}
        run_frames = NumbersDict(0)
        cost_result_format = -1

        crashed = False
        assert num_device_runs > 0

        while True:
            if getattr(sys, "exited", False):
                # This happens when we exit Python.
                # Without this check, this thread would keep running until all exit handlers of Python are done.
                print("%s stopped" % self, file=log.v5)
                crashed = True
                break

            for i in range(num_device_runs):
                if deviceRuns[i].crashed or not deviceRuns[i].is_alive():
                    crashed = True
                    break
                if deviceRuns[i].finished:
                    results['batchess'] += deviceRuns[i].result['batchess'][:]
                    results['results'] += deviceRuns[i].result['results'][:]
                    results['result_format'] = deviceRuns[i].result['result_format']
                    deviceRuns[i].finished = False
            if crashed:
                break

            # Note: "i" is simply the last device-run index from the loop above.
            if cost_result_format < 0 and deviceRuns[i].result['result_format']:
                for idx, fmt in enumerate(deviceRuns[i].result['result_format']):
                    if fmt and fmt.startswith('cost:'):
                        cost_result_format = idx
            total_cost = 0
            if results['results'] and cost_result_format >= 0:
                total_cost = numpy.asarray(
                    results['results'])[:, cost_result_format].sum()
            if total_cost >= self.eval_batch_size or not self.batches.has_more():
                if all(not (dev.finished or dev.allocated or dev.processing)
                       for dev in deviceRuns):
                    results['num_frames'] = run_frames
                    self.num_frames += run_frames
                    if self.share_batches: run_frames *= len(self.devices)
                    self.reduce(run_frames)
                    self.eval_batch_idx += 1
                    run_frames = NumbersDict(0)
                    results['batchess'] = []
                    results['results'] = []
                    for device in self.devices:
                        device.num_frames = 0
                        device.num_updates = 0
                    if not self.batches.has_more():
                        break
                else:
                    time.sleep(0.01)

            match = True
            while self.batches.has_more() and total_cost < self.eval_batch_size and match:
                self.batch_idx = self.batches.get_current_batch_idx()
                if self.batch_idx < self.start_batch:
                    self.batches.advance(1)
                    break
                match = False
                for i in range(num_device_runs):
                    if not deviceRuns[i].allocated:
                        deviceRuns[i].allocate()
                        run_frames += deviceRuns[i].run_frames
                        match = True
                        break
            if not match:
                time.sleep(0.01)

        for run in deviceRuns:
            run.stop()
        if crashed: return
        for device in self.devices:
            device.finish_epoch_stats()
        self.finalize()
        if self.interactive: progress_bar()
        self.elapsed = (time.time() - self.start_time)
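Stripped of the device handling, the scheduling in run_inner above is a poll loop: collect results from finished runs, hand the next batch to the first idle run, and sleep briefly when nothing can make progress. A rough sketch of just that dispatch pattern, with Run as a hypothetical stand-in for DeviceBatchRun:

import time

class Run:
    # Hypothetical stand-in for DeviceBatchRun; the work "finishes" after a few polls.
    def __init__(self):
        self.allocated = False
        self._steps_left = 0

    def allocate(self, batch):
        self.allocated = True
        self._steps_left = 3  # pretend the batch takes three polls to complete

    def poll(self):
        if self.allocated:
            self._steps_left -= 1
            if self._steps_left <= 0:
                self.allocated = False  # result collected; run is idle again

def dispatch(batches, runs):
    # Poll loop as in run_inner: poll runs, feed the first idle one, sleep otherwise.
    while batches or any(run.allocated for run in runs):
        for run in runs:
            run.poll()
        idle = next((run for run in runs if not run.allocated), None)
        if idle is not None and batches:
            idle.allocate(batches.pop(0))
        else:
            time.sleep(0.01)  # everything busy; poll again shortly

dispatch(batches=list(range(5)), runs=[Run(), Run()])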
Example #5
 def _print_finish_process(self):
   if self._show_interactive_process_bar:
     from Util import progress_bar
     progress_bar()
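Judging from the call sites in these examples, progress_bar takes a completion fraction plus an optional prefix/suffix string and output file, and a call with no arguments finalizes the bar. A crude stand-in consistent with those call sites (not RETURNN's actual Util.progress_bar):

import sys

def progress_bar(complete=1.0, prefix="", file=None, width=40):
    # Crude stand-in consistent with the call sites above; RETURNN's real
    # Util.progress_bar is more elaborate. No arguments means "finished".
    file = file or sys.stdout
    filled = int(width * complete)
    end = "\n" if complete >= 1.0 else "\r"
    file.write("%s [%s] %5.1f%%%s"
               % (prefix, "#" * filled + "." * (width - filled), complete * 100, end))
    file.flush()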
Example #6
    def run_inner(self):
      self.start_time = time.time()
      for device in self.devices:
        device.prepare(epoch=self.epoch, **self.get_device_prepare_args())
      self.initialize()
      terminal_width, _ = terminal_size()
      self.interactive = (log.v[3] and terminal_width >= 0)
      print("starting task", self.task, file=log.v5)

      for device in self.devices:
        device.eval_batch_idx = -1
        device.start_epoch_stats()
        device.num_frames = 0
        device.num_updates = 0
        device.tot = 0

      num_device_runs = 1 if self.share_batches else len(self.devices)
      deviceRuns = [
        self.DeviceBatchRun(self, [self.devices[i]] if not self.share_batches else self.devices)
        for i in range(num_device_runs)]

      results = {'batchess': [], 'results': [], 'num_frames': NumbersDict(0)}
      run_frames = NumbersDict(0)

      crashed = False

      while True:
        if getattr(sys, "exited", False):
          # This happens when we exit Python.
          # Without this check, this thread would keep running until all exit handlers of Python are done.
          print("%s stopped" % self, file=log.v5)
          crashed = True
          break

        for i in range(num_device_runs):
          if deviceRuns[i].crashed:
            crashed = True
            break
          if deviceRuns[i].finished:
            results['batchess'] += deviceRuns[i].result['batchess'][:]
            results['results'] += deviceRuns[i].result['results'][:]
            results['result_format'] = deviceRuns[i].result['result_format']
            deviceRuns[i].finished = False
        if crashed:
          break

        if run_frames.max_value() >= self.eval_batch_size or not self.batches.has_more():
          if all(not (dev.finished or dev.allocated or dev.processing) for dev in deviceRuns):
            results['num_frames'] = run_frames
            self.num_frames += run_frames
            if self.share_batches: run_frames *= len(self.devices)
            self.reduce(run_frames)
            self.eval_batch_idx += 1
            run_frames = NumbersDict(0)
            results['batchess'] = []
            results['results'] = []
            for device in self.devices:
              device.num_frames = 0
              device.num_updates = 0
            if not self.batches.has_more():
              break
          else:
            time.sleep(0.01)

        match = True
        while self.batches.has_more() and run_frames.max_value() < self.eval_batch_size and match:
          self.batch_idx = self.batches.get_current_batch_idx()
          if self.batch_idx < self.start_batch:
            self.batches.advance(1)
            break
          match = False
          for i in range(num_device_runs):
            if not deviceRuns[i].allocated:
              deviceRuns[i].allocate()
              run_frames += deviceRuns[i].run_frames
              match = True
              break
        if not match:
          time.sleep(0.01)

      for run in deviceRuns:
        run.stop()
      if crashed: return
      for device in self.devices:
        device.finish_epoch_stats()
      self.finalize()
      if self.interactive: progress_bar()
      self.elapsed = (time.time() - self.start_time)
Example #7
  def work(self):
    print("Starting hyper param search. Using %i threads." % self.num_threads, file=log.v1)
    from TFUtil import get_available_gpu_devices
    from Log import wrap_log_streams, StreamDummy
    from threading import Thread, Condition
    from Util import progress_bar, hms, is_tty

    class Outstanding:
      cond = Condition()
      threads = []  # type: list[WorkerThread]
      population = []
      exit = False
      exception = None

    class WorkerThread(Thread):
      def __init__(self, gpu_ids):
        """
        :param set[int] gpu_ids:
        """
        super(WorkerThread, self).__init__(name="Hyper param tune train thread")
        self.gpu_ids = gpu_ids
        self.trainer = None  # type: _IndividualTrainer
        self.finished = False
        self.start()

      def cancel(self, join=False):
        with Outstanding.cond:
          if self.trainer:
            self.trainer.cancel_flag = True
            if self.trainer.runner:
              self.trainer.runner.cancel_flag = True
        if join:
          self.join()

      def get_complete_frac(self):
        with Outstanding.cond:
          if self.trainer and self.trainer.runner:
            return self.trainer.runner.data_provider.get_complete_frac()
        return 0.0

      def run(self_thread):
        # Named "self_thread" so that "self" still refers to the enclosing optimizer via the closure.
        try:
          while True:
            with Outstanding.cond:
              if Outstanding.exit or Outstanding.exception:
                return
              if not Outstanding.population:
                self_thread.finished = True
                Outstanding.cond.notify_all()
                return
              individual = Outstanding.population.pop(0)
              self_thread.trainer = _IndividualTrainer(optim=self, individual=individual, gpu_ids=self_thread.gpu_ids)
            self_thread.name = "Hyper param tune train thread on %r" % individual.name
            self_thread.trainer.run()
        except Exception as exc:
          with Outstanding.cond:
            if not Outstanding.exception:
              Outstanding.exception = exc or True
            Outstanding.cond.notify_all()
          for thread in Outstanding.threads:
            if thread is not self_thread:
              thread.cancel()
          if not isinstance(exc, CancelTrainingException):
            with Outstanding.cond:  # So that we don't mix up multiple on sys.stderr.
              # This would normally dump it on sys.stderr so it's fine.
              sys.excepthook(*sys.exc_info())

    best_individuals = []
    population = []
    canceled = False
    num_gpus = len(get_available_gpu_devices())
    print("Num available GPUs:", num_gpus)
    num_gpus = num_gpus or 1  # Would be ignored anyway.
    interactive = is_tty()
    try:
      print("Population of %i individuals (hyper param setting instances), running for %i evaluation iterations." % (
        self.num_individuals, self.num_iterations), file=log.v2)
      for cur_iteration_idx in range(1, self.num_iterations + 1):
        print("Starting iteration %i." % cur_iteration_idx, file=log.v2)
        if cur_iteration_idx == 1:
          population.append(Individual(
            {p: p.get_default_value() for p in self.hyper_params}, name="default"))
          population.append(Individual(
            {p: p.get_initial_value() for p in self.hyper_params}, name="canonical"))
        population.extend(self.get_population(
          iteration_idx=cur_iteration_idx, num_individuals=self.num_individuals - len(population)))
        if cur_iteration_idx > 1:
          self.cross_over(population=population, iteration_idx=cur_iteration_idx)
        if cur_iteration_idx == 1 and self.dry_run_first_individual:
          # Train first directly for testing and to see log output.
          # Later we will strip away all log output.
          print("Very first try with log output:", file=log.v2)
          _IndividualTrainer(optim=self, individual=population[0], gpu_ids={0}).run()
        print("Starting training with thread pool of %i threads." % self.num_threads)
        iteration_start_time = time.time()
        with wrap_log_streams(StreamDummy(), also_sys_stdout=True, tf_log_verbosity="WARN"):
          Outstanding.exit = False
          Outstanding.population = list(population)
          Outstanding.threads = [WorkerThread(gpu_ids={i % num_gpus}) for i in range(self.num_threads)]
          try:
            while True:
              with Outstanding.cond:
                if all([thread.finished for thread in Outstanding.threads]) or Outstanding.exception:
                  break
                complete_frac = max(len(population) - len(Outstanding.population) - len(Outstanding.threads), 0)
                complete_frac += sum([thread.get_complete_frac() for thread in Outstanding.threads])
                complete_frac /= float(len(population))
                remaining_str = ""
                if complete_frac > 0:
                  start_elapsed = time.time() - iteration_start_time
                  total_time_estimated = start_elapsed / complete_frac
                  remaining_estimated = total_time_estimated - start_elapsed
                  remaining_str = hms(remaining_estimated)
                if interactive:
                  progress_bar(complete_frac, prefix=remaining_str, file=sys.__stdout__)
                else:
                  print(
                    "Progress: %.02f%%" % (complete_frac * 100),
                    "remaining:", remaining_str or "unknown", file=sys.__stdout__)
                  sys.__stdout__.flush()
                Outstanding.cond.wait(1 if interactive else 10)
            for thread in Outstanding.threads:
              thread.join()
          finally:
            Outstanding.exit = True
            for thread in Outstanding.threads:
              thread.cancel(join=True)
        Outstanding.threads = []
        print("Training iteration elapsed time:", hms(time.time() - iteration_start_time))
        if Outstanding.exception:
          raise Outstanding.exception
        assert not Outstanding.population
        print("Training iteration finished.")
        population.sort(key=lambda p: p.cost)
        del population[-self.num_kill_individuals:]
        best_individuals.extend(population)
        best_individuals.sort(key=lambda p: p.cost)
        del best_individuals[self.num_best:]
        population = best_individuals[:self.num_kill_individuals // 4] + population
        print("Current best setting, individual %s" % best_individuals[0].name, "cost:", best_individuals[0].cost)
        for p in self.hyper_params:
          print(" %s -> %s" % (p.description(), best_individuals[0].hyper_param_mapping[p]))
    except KeyboardInterrupt:
      print("KeyboardInterrupt, canceled search.")
      canceled = True

    print("Best %i settings:" % len(best_individuals))
    for individual in best_individuals:
      print("Individual %s" % individual.name, "cost:", individual.cost)
      for p in self.hyper_params:
        print(" %s -> %s" % (p.description(), individual.hyper_param_mapping[p]))