Example #1
    def mp_search(self, graph, other_info, model_id, train_data, test_data):
        ctx = mp.get_context()
        q = ctx.Queue()
        p = ctx.Process(target=train,
                        args=(q, graph, train_data, test_data,
                              self.trainer_args, self.metric, self.loss,
                              self.verbose, self.path))
        try:
            # Train in the subprocess while searching in the current thread.
            p.start()
            search_results = self._search_common(q)
            # Block until the trainer posts its (metric_value, loss, graph) result.
            metric_value, loss, graph = q.get(block=True)
            if time.time() >= self._timeout:
                raise TimeoutError
            if self.verbose and search_results:
                for (generated_graph, generated_other_info,
                     new_model_id) in search_results:
                    verbose_print(generated_other_info, generated_graph,
                                  new_model_id)

            if metric_value is not None:
                self.add_model(metric_value, loss, graph, model_id)
                self.update(other_info, model_id, graph, metric_value)

        except (TimeoutError, queue.Empty) as e:
            raise TimeoutError from e
        finally:
            # terminate and join the subprocess to prevent any resource leak
            p.terminate()
            p.join()
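
Note: mp_search assumes a module-level train worker (it must be importable by the child process, hence picklable under a 'spawn' context) that puts a (metric_value, loss, graph) triple on the queue, which is exactly what q.get(block=True) above unpacks. A minimal sketch of such a worker, assuming hypothetical produce_model and fit_model helpers:

    def train(q, graph, train_data, test_data, trainer_args,
              metric, loss, verbose, path):
        # Hypothetical worker: train the graph's model and report via the queue.
        try:
            model = graph.produce_model()  # assumption: the graph can build a model
            metric_value, loss_value = fit_model(  # placeholder for the real trainer
                model, train_data, test_data, metric, loss, verbose, path,
                **trainer_args)
            q.put((metric_value, loss_value, graph))
        except Exception:
            # Report failure so the parent's q.get() does not block forever;
            # the parent's `if metric_value is not None` check handles this case.
            q.put((None, None, graph))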
Example #2
    def sp_search(self, graph, other_info, model_id, train_data, test_data):
        try:
            metric_value, loss, graph = train(None, graph, train_data, test_data, self.trainer_args,
                                              self.metric, self.loss, self.verbose, self.path)
            # Do the search in current thread.
            search_results = self._search_common()
            if self.verbose and search_results:
                for (generated_graph, generated_other_info, new_model_id) in search_results:
                    verbose_print(generated_other_info, generated_graph, new_model_id)

            if metric_value is not None:
                self.add_model(metric_value, loss, graph, model_id)
                self.update(other_info, model_id, graph, metric_value)

        except TimeoutError as e:
            raise TimeoutError from e
Example #3
    def search(self, train_data, test_data, timeout=60 * 60 * 24):
        """Run the search loop of training, generating and updating once.

        The function will run the training and the generation in parallel.
        Then it will update the controller.
        The training simply pops a graph from the training_queue and trains it.
        The generation calls the self.generate function.
        The update calls the self.update function.

        Args:
            train_data: An instance of DataLoader.
            test_data: An instance of DataLoader.
            timeout: An integer, time limit in seconds.
        """
        start_time = time.time()
        torch.cuda.empty_cache()
        if not self.history:
            self.init_search()

        # Start the new process for training.
        graph, other_info, model_id = self.training_queue.pop(0)
        if self.verbose:
            print('\n')
            print('+' + '-' * 46 + '+')
            print('|' + 'Training model {}'.format(model_id).center(46) + '|')
            print('+' + '-' * 46 + '+')
        # Temporary solution to support GOOGLE Colab
        if get_system() == Constant.SYS_GOOGLE_COLAB:
            ctx = mp.get_context('fork')
        else:
            ctx = mp.get_context('spawn')
        q = ctx.Queue()
        p = ctx.Process(target=train,
                        args=(q, graph, train_data, test_data,
                              self.trainer_args, self.metric, self.loss,
                              self.verbose, self.path))
        try:
            p.start()
            # Do the search in current thread.
            searched = False
            generated_graph = None
            generated_other_info = None
            if not self.training_queue:
                searched = True

                remaining_time = timeout - (time.time() - start_time)
                generated_other_info, generated_graph = self.generate(
                    remaining_time, q)
                new_model_id = self.model_count
                self.model_count += 1
                self.training_queue.append(
                    (generated_graph, generated_other_info, new_model_id))
                self.descriptors.append(generated_graph.extract_descriptor())

            remaining_time = timeout - (time.time() - start_time)
            if remaining_time <= 0:
                raise TimeoutError
            metric_value, loss, graph = q.get(timeout=remaining_time)

            if self.verbose and searched:
                verbose_print(generated_other_info, generated_graph)

            self.add_model(metric_value, loss, graph, model_id)
            self.update(other_info, graph, metric_value, model_id)

            self.export_json(os.path.join(self.path, 'history.json'))

        except (TimeoutError, queue.Empty) as e:
            raise TimeoutError from e
        except RuntimeError as e:
            if not re.search('out of memory', str(e)):
                raise e
            if self.verbose:
                print(
                    '\nCurrent model size is too big. Discontinuing training this model to search for other models.'
                )
            Constant.MAX_MODEL_SIZE = graph.size() - 1
            return
        finally:
            # terminate and join the subprocess to prevent any resource leak
            p.terminate()
            p.join()
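
Since the docstring says search runs the train-generate-update loop once, the caller is expected to invoke it repeatedly. A minimal driver sketch under that assumption (run_search and its budget handling are illustrative, not part of the library):

    import time

    def run_search(searcher, train_data, test_data, budget=60 * 60 * 24):
        deadline = time.time() + budget
        while time.time() < deadline:
            remaining = int(deadline - time.time())
            try:
                # Each call trains one model and queues the next candidate.
                searcher.search(train_data, test_data, timeout=remaining)
            except TimeoutError:
                break  # search raises TimeoutError when the limit is hit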
Example #4
    def search(self, train_data, test_data, timeout=60 * 60 * 24):
        start_time = time.time()
        torch.cuda.empty_cache()
        if not self.history:
            self.init_search()

        # Start the new process for training.
        graph, father_id, model_id = self.training_queue.pop(0)
        if self.verbose:
            print('\n')
            print('+' + '-' * 46 + '+')
            print('|' + 'Training model {}'.format(model_id).center(46) + '|')
            print('+' + '-' * 46 + '+')
        ctx = mp.get_context('fork')
        q = ctx.Queue()
        p = ctx.Process(target=train,
                        args=(q, (graph, train_data, test_data,
                                  self.trainer_args, self.metric, self.loss,
                                  self.verbose, self.path)))
        try:
            p.start()
            # Do the search in current thread.
            searched = False
            new_graph = None
            new_father_id = None
            if not self.training_queue:
                searched = True

                while new_father_id is None:
                    remaining_time = timeout - (time.time() - start_time)
                    new_graph, new_father_id = self.bo.optimize_acq(
                        self.search_tree.adj_list.keys(), self.descriptors,
                        remaining_time)
                new_model_id = self.model_count
                self.model_count += 1
                self.training_queue.append(
                    (new_graph, new_father_id, new_model_id))
                self.descriptors.append(new_graph.extract_descriptor())

            remaining_time = timeout - (time.time() - start_time)
            if remaining_time <= 0:
                raise TimeoutError
            metric_value, loss, graph = q.get(timeout=remaining_time)

            if self.verbose and searched:
                verbose_print(new_father_id, new_graph)

            self.add_model(metric_value, loss, graph, model_id)
            self.search_tree.add_child(father_id, model_id)
            self.bo.fit(self.x_queue, self.y_queue)
            self.x_queue = []
            self.y_queue = []

            pickle_to_file(self, os.path.join(self.path, 'searcher'))
            self.export_json(os.path.join(self.path, 'history.json'))

        except TimeoutError as e:
            raise TimeoutError from e
        except RuntimeError as e:
            if not re.search('out of memory', str(e)):
                raise e
            if self.verbose:
                print(
                    '\nCurrent model size is too big. Discontinuing training this model to search for other models.'
                )
            Constant.MAX_MODEL_SIZE = graph.size() - 1
            return
        finally:
            # terminate and join the subprocess to prevent any resource leak
            p.terminate()
            p.join()
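
Unlike the previous example, this variant requests the 'fork' start method unconditionally. Forking after CUDA has been initialized in the parent is unsafe (the child cannot re-initialize the CUDA runtime), which is presumably why the variant above falls back to 'spawn' outside Google Colab. A small sketch of a guarded choice; the condition is an illustration, not the project's actual logic:

    import torch
    import torch.multiprocessing as mp

    # 'fork' is cheap but breaks once CUDA is initialized in the parent;
    # 'spawn' starts a fresh interpreter and is safe with CUDA (and is the
    # only start method available on Windows).
    method = 'spawn' if torch.cuda.is_initialized() else 'fork'
    ctx = mp.get_context(method)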
Example #5
    def search(self, train_data, test_data, timeout=60 * 60 * 24):
        start_time = time.time()
        torch.cuda.empty_cache()
        if not self.history:
            self.init_search()

        # Start the new process for training.
        graph, father_id, model_id = self.training_queue.pop(0)
        if self.verbose:
            print('\n')
            print('+' + '-' * 46 + '+')
            print('|' + 'Training model {}'.format(model_id).center(46) + '|')
            print('+' + '-' * 46 + '+')
        mp.set_start_method('spawn', force=True)
        pool = mp.Pool(1)
        try:
            train_results = pool.map_async(
                train, [(graph, train_data, test_data, self.trainer_args,
                         os.path.join(self.path,
                                      str(model_id) + '.png'), self.metric,
                         self.loss, self.verbose)])

            # Do the search in current thread.
            searched = False
            new_graph = None
            new_father_id = None
            if not self.training_queue:
                searched = True

                while new_father_id is None:
                    remaining_time = timeout - (time.time() - start_time)
                    new_graph, new_father_id = self.bo.optimize_acq(
                        self.search_tree.adj_list.keys(), self.descriptors,
                        remaining_time)
                new_model_id = self.model_count
                self.model_count += 1
                self.training_queue.append(
                    (new_graph, new_father_id, new_model_id))
                self.descriptors.append(new_graph.extract_descriptor())

            remaining_time = timeout - (time.time() - start_time)
            if remaining_time <= 0:
                raise TimeoutError
            metric_value, loss, graph = train_results.get(
                timeout=remaining_time)[0]

            if self.verbose and searched:
                verbose_print(new_father_id, new_graph)

            self.add_model(metric_value, loss, graph, model_id)
            self.search_tree.add_child(father_id, model_id)
            self.bo.fit(self.x_queue, self.y_queue)
            self.x_queue = []
            self.y_queue = []

            pickle_to_file(self, os.path.join(self.path, 'searcher'))
            self.export_json(os.path.join(self.path, 'history.json'))

        except (mp.TimeoutError, TimeoutError) as e:
            raise TimeoutError from e
        except RuntimeError as e:
            if not re.search('out of memory', str(e)):
                raise e
            if self.verbose:
                print('out of memory')
            Constant.MAX_MODEL_SIZE = graph.size() - 1
            return
        finally:
            # terminate and join the subprocess to prevent any resource leak
            pool.close()
            pool.join()
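
This variant swaps the bare Process for a one-worker Pool. map_async returns an AsyncResult, and AsyncResult.get(timeout=...) raises multiprocessing.TimeoutError, a different class from the built-in TimeoutError, which is why the except clause above catches both. A standalone sketch of that behavior:

    import multiprocessing as mp
    import time

    def slow_square(x):
        time.sleep(5)
        return x * x

    if __name__ == '__main__':
        with mp.Pool(1) as pool:
            result = pool.map_async(slow_square, [3])
            try:
                print(result.get(timeout=1))
            except mp.TimeoutError:  # not the same class as built-in TimeoutError
                print('worker exceeded the time limit')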
Example #6
    def search(self, train_data, test_data, timeout=60 * 60 * 24):
        """Run the search loop of training, generating and updating once.

        The function will run the training and the generation in parallel.
        Then it will update the controller.
        The training simply pops a graph from the training_queue and trains it.
        The generation calls the self.generate function.
        The update calls the self.update function.

        Args:
            train_data: An instance of DataLoader.
            test_data: An instance of DataLoader.
            timeout: An integer, time limit in seconds.
        """
        ctx = torch.multiprocessing.get_context("spawn")
        torch.cuda.empty_cache()
        if not self.history:
            self.init_search()
        mpq = ctx.Queue(self.n_parralel * 85)
        self._timeout = time.time() + timeout if timeout is not None else sys.maxsize
        self.trainer_args['timeout'] = timeout
        # If the training queue is empty, generate new candidates to train.
        if len(self.training_queue) < 1:
            search_results = self._search_common()
            if self.verbose and search_results:
                for (generated_graph, generated_other_info,
                     new_model_id) in search_results:
                    verbose_print(generated_other_info, generated_graph,
                                  new_model_id)

        if self.n_parralel > 1:
            print("TRAINING", len(self.training_queue), "MODELS IN PARALLEL")

        # Start one training process per queued model.
        processes = []

        for i in range(min(self.n_parralel, len(self.training_queue))):
            graph, other_info, model_id = self.training_queue.pop(0)
            if self.verbose:
                print('\n')
                print('+' + '-' * 46 + '+')
                print('|' + 'Training model {}'.format(model_id).center(46) +
                      '|')
                print('+' + '-' * 46 + '+')

            p = ctx.Process(target=self.sp_search,
                            args=(graph, other_info, model_id, train_data,
                                  test_data, mpq, i))
            p.start()
            processes.append(p)

        # Wait for every training process to finish.
        for proc in processes:
            proc.join()

        # Collect the results each subprocess posted on the queue.
        while not mpq.empty():
            metric_value, loss, model_id, other_info = mpq.get()
            self.add_model(metric_value, loss, model_id)
            graph = pickle_from_file(
                os.path.join(self.path, str(model_id) + '.graph'))
            self.update(other_info, model_id, graph, metric_value)
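
In this parallel variant, sp_search takes two extra parameters (mpq, a shared result queue, and i, a worker index), and the collection loop above dictates its contract: put a (metric_value, loss, model_id, other_info) tuple on mpq and pickle the trained graph to <path>/<model_id>.graph. A hedged sketch of a worker satisfying that contract; anything not visible above is an assumption:

    def sp_search(self, graph, other_info, model_id, train_data, test_data,
                  mpq, i):
        # Train in this process; passing None as the queue makes train()
        # return its results directly, as in the sp_search example above.
        metric_value, loss, graph = train(None, graph, train_data, test_data,
                                          self.trainer_args, self.metric,
                                          self.loss, self.verbose, self.path)
        # Persist the trained graph; the parent reloads it by model_id.
        pickle_to_file(graph,
                       os.path.join(self.path, str(model_id) + '.graph'))
        # Post results in the order the parent unpacks them.
        mpq.put((metric_value, loss, model_id, other_info))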