Example #1
    def _solve(self, root_dir):
        """

        Args:
            :root_dir:

        Returns:

        """
        # initialise generation based on individual representation
        population, bounds = self._population_initialisation()
        # module-level HDFS handle and run id, shared with helper functions
        global fs_handle
        fs_handle = hdfs.get_fs()
        global run_id

        new_gen_best_param = None
        new_gen_best = None

        for _ in range(self.generations):
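            # one DE iteration: mutation -> recombination -> selection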

            donor_population = self._mutation(population, bounds)
            trial_population = self._recombination(population,
                                                   donor_population)

            population = self._selection(population, trial_population)

            new_gen_avg = sum(self._scores) / self.n

            if self.direction.upper() == Direction.MAX:
                new_gen_best = max(self._scores)
            elif self.direction.upper() == Direction.MIN:
                new_gen_best = min(self._scores)
            else:
                raise ValueError('invalid direction: ' + self.direction)

            new_gen_best_param = self._parse_back(
                population[self._scores.index(new_gen_best)])

            # prefix every value with its parameter name, e.g. "learning_rate=0.01"
            for index, name in enumerate(self._param_names):
                new_gen_best_param[index] = name + "=" + str(
                    new_gen_best_param[index])

            print("Generation " + str(self._generation) + " || " + "average metric: " + str(new_gen_avg) \
                  + ", best metric: " + str(new_gen_best) + ", best parameter combination: " + str(new_gen_best_param) + "\n")

            if cleanup:
                # remove the previous generation's working directory from HDFS
                pydoop.hdfs.rmr(root_dir + '/generation.' +
                                str(self._generation - 1))

        parsed_back_population = [self._parse_back(indiv) for indiv in population]

        return new_gen_best_param, new_gen_best
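
For reference, the loop above follows the standard differential-evolution pattern of mutation, recombination and selection. Below is a minimal, self-contained sketch of that pattern in plain Python; the objective function, bounds and the F/CR hyperparameters are illustrative assumptions, not part of the original optimizer.

    import random

    def differential_evolution(objective, bounds, n=10, generations=20, F=0.8, CR=0.9):
        # initialise the population uniformly within the given bounds
        dim = len(bounds)
        population = [[random.uniform(lo, hi) for (lo, hi) in bounds] for _ in range(n)]
        scores = [objective(ind) for ind in population]

        for _ in range(generations):
            for i in range(n):
                # mutation: combine three distinct individuals into a donor vector
                a, b, c = random.sample([p for j, p in enumerate(population) if j != i], 3)
                donor = [max(lo, min(hi, a[d] + F * (b[d] - c[d])))
                         for d, (lo, hi) in enumerate(bounds)]
                # recombination: crossover between target and donor
                trial = [donor[d] if random.random() < CR else population[i][d]
                         for d in range(dim)]
                # selection: keep the better of target and trial (minimisation)
                trial_score = objective(trial)
                if trial_score < scores[i]:
                    population[i], scores[i] = trial, trial_score

        best = min(range(n), key=lambda i: scores[i])
        return population[best], scores[best]

    # toy usage: minimise the sphere function in two dimensions
    best_params, best_score = differential_evolution(
        lambda x: sum(v * v for v in x), bounds=[(-5, 5), (-5, 5)])
    print(best_params, best_score)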
Example #2
    def _wrapper_fun(iter):

        # the single element in this executor's partition is its index
        for i in iter:
            executor_num = i

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'horovod')

        tb_pid = 0
        tb_hdfs_path = ''

        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs.init_logger()
        hopshdfs.log('Starting Spark executor with arguments')
        if executor_num == 0:
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                0,
                local_logdir=local_logdir)

        gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info(
        )
        hopshdfs.log(gpu_str)
        print(gpu_str)

        # 1. Download notebook file
        fs_handle = hopshdfs.get_fs()

        try:
            fd = fs_handle.open_file(nb_path, flags='r')
        except:
            # fall back for pydoop versions that take `mode` instead of `flags`
            fd = fs_handle.open_file(nb_path, mode='r')

        notebook = ''
        for line in fd:
            notebook += line

        path, filename = os.path.split(nb_path)
        f_nb = open(filename, "w+")
        f_nb.write(notebook)
        f_nb.flush()
        f_nb.close()

        # 2. Convert notebook to py file
        jupyter_runnable = os.path.abspath(
            os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
        conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
        conversion = subprocess.Popen(conversion_cmd,
                                      shell=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
        conversion.wait()
        stdout, stderr = conversion.communicate()
        print(stdout)
        print(stderr)

        # 3. Make py file runnable
        py_runnable = os.getcwd() + '/' + filename.split('.')[0] + '.py'
        st = os.stat(py_runnable)
        os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        mpi_logfile = open(mpi_logfile_path, 'w')

        # 4. Run allreduce
        mpi_np = os.environ['MPI_NP']
        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(mpi_np) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable
        mpi = subprocess.Popen(mpi_cmd,
                               shell=True,
                               stdout=mpi_logfile,
                               stderr=mpi_logfile,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        if return_code != 0:
            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
            raise Exception('mpirun FAILED, look in the logs for the error')

        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        hopshdfs.kill_logger()
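
For context, wrapper functions like the one above are typically shipped to the Spark executors through an RDD with one partition per executor. The sketch below shows only that dispatch pattern; the SparkSession setup, the executor count and the simplified wrapper body are illustrative assumptions, not the library's actual launch code.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("allreduce-launch-sketch").getOrCreate()
    sc = spark.sparkContext

    num_executors = 2  # assumed executor count

    def wrapper_fun(iter):
        # each partition holds exactly one element: the executor's index
        for i in iter:
            executor_num = i
        print("running on executor", executor_num)
        # ... the real wrapper would convert the notebook and launch mpirun here ...

    # one partition per executor, each carrying only its own index
    node_rdd = sc.parallelize(range(num_executors), num_executors)
    node_rdd.foreachPartition(wrapper_fun)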
from tempfile import TemporaryFile

# `hdfs` is assumed to be the same module exposing get_fs() as in the examples above


def save(data, path):
    with hdfs.get_fs().open_file(path, 'w') as data_file:
        data_file.write(data)


def savetf(temp_file, path, close=True):
    with hdfs.get_fs().open_file(path, 'w') as data_file:
        temp_file.seek(0)
        data_file.write(temp_file.read())
    if close:
        temp_file.close()


def open_hdfs(path):
    temp_file = TemporaryFile()
    with hdfs.get_fs().open_file(path, 'r') as data_file:
        temp_file.write(data_file.read())
        temp_file.seek(0)
    return temp_file
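
A possible round trip with these helpers is sketched below; the HDFS paths are made up for illustration, and the snippet assumes the same `hdfs.get_fs()` handle as the functions above.

    from tempfile import TemporaryFile

    # write a string to HDFS, then read it back through a local temporary file
    save("hello hdfs", "/Projects/demo/Resources/greeting.txt")
    tmp = open_hdfs("/Projects/demo/Resources/greeting.txt")
    print(tmp.read())
    tmp.close()

    # stream a locally-built temporary file up to HDFS (closes `scratch` by default)
    scratch = TemporaryFile()
    scratch.write(b"model bytes")
    savetf(scratch, "/Projects/demo/Resources/model.bin")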