Example 1
File: cli.py Project: sony/nnabla
def cli_main(parser, args):
    global return_value
    return_value = False

    if 'func' not in args:
        parser.print_help(sys.stderr)
        sys.exit(-1)

    if args.mpi:
        from nnabla.utils.communicator_util import create_communicator
        comm = create_communicator()
        try:
            return_value = args.func(args)
        except:
            import traceback
            print(traceback.format_exc())

            logger.log(99, "ABORTED")
            os.kill(os.getpid(), 9)
            # comm.abort()
    else:
        try:
            return_value = args.func(args)
        except:
            import traceback
            print(traceback.format_exc())
            return_value = False
            sys.exit(-1)
Example 2
    def _get_data(self, position):

        self._position = position
        if current_communicator():
            try:
                filename, index = self._order[position]
            except IndexError:
                logger.log(99, '_get_data() fails at worker {} retrying.'.format(
                    current_communicator().rank))
                sleep(0.01)
                return self._get_data(position)
        else:
            filename, index = self._order[position]

        if filename != self._current_filename:
            file_names_to_prefetch = None
            if self._cache_type == ".npy" and self._num_of_threads > 0:
                file_names_to_prefetch = [o[0] for o in self._order[position + self._max_length:position + self._max_length *
                                                                    self._num_of_threads:self._max_length]]

            self._current_data = self._get_next_data(
                filename, file_names_to_prefetch)
            self._current_filename = filename

        data = [self._current_data[v][index] for v in self.variables]

        if self._normalize:
            data = [d.astype(numpy.float32) * (1.0 / 255.0)
                    if d.dtype == numpy.uint8 else d for d in data]
        return data
Example 3
def _get_current_parameter(args, config):
    def convert_to_info(config):
        class Info:
            pass

        ret = Info()
        ret.optimizers = OrderedDict()
        for name, opt in config.optimizers.items():
            ret.optimizers[name] = opt.optimizer
        return ret

    best_error, best_epoch = callback.get_best_from_status(args)

    globname = os.path.join(args.outdir, 'results_current_*.nnp')
    exists = glob.glob(globname)

    if len(exists) > 0:
        ex_list = {}

        info = convert_to_info(config)
        for ex in exists:
            n = int(ex.rsplit('_', 1)[1].rsplit('.', 1)[0])
            ex_list[n] = ex

        last_epoch = sorted(ex_list.keys(), reverse=True)[0]
        last_parameter = ex_list[last_epoch]
        logger.log(
            99, "Load parameter from [{}]".format(
                os.path.basename(last_parameter)))
        #load.load([last_parameter], parameter_only=True)
        load_train_state(last_parameter, info)
        return last_epoch, best_epoch, best_error

    return 0, best_epoch, best_error
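
The checkpoint selection above keys each results_current_<epoch>.nnp file by the epoch number embedded in its name and picks the newest one. A minimal standalone sketch of that parsing (the file names below are made up; nothing is read from disk):

candidates = ['results_current_3.nnp', 'results_current_12.nnp',
              'results_current_7.nnp']
ex_list = {}
for ex in candidates:
    # 'results_current_12.nnp' -> '12' -> 12
    n = int(ex.rsplit('_', 1)[1].rsplit('.', 1)[0])
    ex_list[n] = ex
last_epoch = sorted(ex_list.keys(), reverse=True)[0]
print(last_epoch, ex_list[last_epoch])   # 12 results_current_12.nnp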
Example 4
def create_communicator(ignore_error=False,
                        extension_module='cudnn',
                        type_config='float'):
    global _current_communicator

    if os.environ.get('OMPI_COMM_WORLD_SIZE') is not None:
        from nnabla.ext_utils import get_extension_context
        context = get_extension_context(extension_module,
                                        type_config=type_config)
        try:
            logger.log(99,
                       'Create communicator with contexts {}'.format(context))
            _current_communicator = C.MultiProcessCommunicator(context)
            _current_communicator.init()
            context.device_id = str(_current_communicator.rank %
                                    _current_communicator.size)
            if _current_communicator.size == 1:
                _current_communicator = None
        except:
            if not ignore_error:
                raise
            logger.warning("Failed to initialize nnabla.communicators.")
            _current_communicator = None
    else:
        _current_communicator = None

    return _current_communicator
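
create_communicator() only builds a communicator when the process was started by mpirun/mpiexec, which it detects through the OMPI_COMM_WORLD_SIZE environment variable set by Open MPI. That detection can be tried on its own, without nnabla:

import os

world_size = os.environ.get('OMPI_COMM_WORLD_SIZE')
if world_size is None:
    print('not launched via mpirun; running single-process')
else:
    print('MPI launch detected, world size =', world_size)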
Example 5
def load_csv(file, shape=None, normalize=False):
    """
    Load CSV file.

    :param file: CSV file.
    :type file: file like object
    :param shape: data array is reshaped to this shape.
    :type shape: tuple of int

    :return: numpy array
    """
    value_list = []
    if six.PY2:
        for row in csv.reader(file):
            if len(row):
                value_list.append(list(map(float, row)))
    elif six.PY34:
        for row in csv.reader([l.decode('utf-8') for l in file.readlines()]):
            if len(row):
                value_list.append(list(map(float, row)))
    try:
        if shape is None:
            return numpy.array(value_list)
        else:
            return numpy.array(value_list).reshape(shape)
    except:
        logger.log(99, 'Failed to load array from "{}".'.format(file.name))
        raise
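
A self-contained sketch of the same CSV-to-array conversion, using an in-memory byte stream in place of a real file object (the data here is made up):

import csv
import io

import numpy

raw = io.BytesIO(b'1.0,2.0\n3.0,4.0\n')
value_list = [list(map(float, row))
              for row in csv.reader(l.decode('utf-8') for l in raw.readlines())
              if len(row)]
print(numpy.array(value_list).reshape((2, 2)))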
Example 6
    def read_cache(self, file_name, variables):
        retry = 1
        while True:
            if retry > 10:
                logger.log(99, 'read_cache() retry count over give up.')
                logger.log(99, 'Cache file {} not found.'.format(file_name))
                logger.log(99, 'Fatal Error! send SIGKILL to myself.')
                os.kill(os.getpid(), 9)

            result = {}
            try:
                with FileReader(file_name).open(textmode=False) as f:
                    for v in variables:
                        result[v] = numpy.load(f)
                if set(result.keys()) == set(variables):
                    break
                else:
                    logger.log(
                        99, 'read_cache() fails retrying count {}/10.'.format(
                            retry))
                    retry += 1
            except:
                logger.log(
                    99, 'Cache file {} not found, retry count {}.'.format(
                        file_name, retry))
                retry += 1

        return result
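
The retry pattern used by read_cache() (bounded attempts, short sleep, explicit give-up) can be reduced to a generic helper; this is an illustrative standalone sketch, not part of nnabla:

import time

def call_with_retry(read_once, retries=10, wait=0.01):
    for attempt in range(1, retries + 1):
        try:
            return read_once()
        except OSError:
            print('attempt {}/{} failed, retrying'.format(attempt, retries))
            time.sleep(wait)
    raise RuntimeError('gave up after {} attempts'.format(retries))

print(call_with_retry(lambda: 'ok'))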
Example 7
    def next(self):
        '''next

        It generates a tuple of data.

        For example,
        if :py:meth:`self._variables == ('x', 'y')`,
        this method returns :py:meth:` ( [[X] * batch_size], [[Y] * batch_size] )`.

        Returns:
            tuple: tuple of data for mini-batch in numpy.ndarray.
        '''
        if self._use_thread:
            # Wait for finish previous thread.
            self._next_thread.join()

            if self._current_data is None:
                logger.log(99, 'next() got None retrying.')
                self._next_thread = threading.Thread(target=self._next)
                self._next_thread.start()
                self._next_thread.join()
            self._current_epoch, data = self._current_data
            # Start next thread.
            self._next_thread = threading.Thread(target=self._next)
            self._next_thread.start()
        else:
            self._next()
            self._current_epoch, data = self._current_data

        return data
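
next() overlaps loading of the following batch with consumption of the current one by handing the work to a background thread. The same double-buffering idea, sketched standalone with a one-slot queue and toy data:

import queue
import threading
import time

def producer(q):
    for i in range(3):
        time.sleep(0.05)      # stand-in for an expensive batch load
        q.put(i)
    q.put(None)               # sentinel: no more batches

q = queue.Queue(maxsize=1)    # at most one prefetched batch in flight
threading.Thread(target=producer, args=(q,), daemon=True).start()
while True:
    batch = q.get()           # blocks only if nothing is prefetched yet
    if batch is None:
        break
    print('consumed batch', batch)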
Example 8
    def initialize_cache_files(self, filename):
        length = -1
        with self._filereader.open_cache(filename) as cache:

            # Check variables.
            if self._variables is None:
                self._variables = list(cache.keys())
            else:
                if current_communicator():
                    if not set(self._variables) == set(cache.keys()):
                        logger.log(
                            99, 'Error at worker {} {} {}'.format(
                                current_communicator().rank,
                                set(self._variables), set(cache.keys())))
                        # A bare `raise` has no active exception here; fail explicitly.
                        raise RuntimeError('Cache file variables do not match.')

            for k, v in cache.items():
                if length < 0:
                    length = len(v)
                else:
                    assert (length == len(v))
            self._cache_files.append((filename, length))
            logger.info('{} {}'.format(filename, length))
            if length > self._max_length:
                self._max_length = length
Example 9
def profile(config, name, func, result_dict):
    # for sync CPU/GPU
    identity = F.Identity(config.global_config.default_context)
    tmp_in = nn.Variable((1,))
    tmp_out = nn.Variable((1,))
    identity.setup([tmp_in], [tmp_out])

    tmp_in.d = [0.]
    identity.forward([tmp_in], [tmp_out])

    # Profile
    start = time.time()
    count = 0
    while time.time() < start + 1.0 or count < 100:
        func()
        count += 1

    # sync CPU/GPU
    identity.forward([tmp_in], [tmp_out])
    data = tmp_out.d

    t = (time.time() - start) * 1000 / count
    logger.log(99, '%s %f(ms)' % (name, t))
    result_dict[name] = t
    return result_dict
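
The measurement scheme in profile() keeps calling the target until at least one second and at least 100 calls have passed, then reports the mean wall-clock time per call in milliseconds. The same loop on an arbitrary callable:

import time

def time_call(func, min_seconds=1.0, min_count=100):
    start = time.time()
    count = 0
    while time.time() < start + min_seconds or count < min_count:
        func()
        count += 1
    return (time.time() - start) * 1000 / count

print('%f(ms)' % time_call(lambda: sum(range(1000))))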
Example 11
def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator):
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None
        if cache_dir and create_cache_explicitly:
            if not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0 or overwrite_cache:
                if not os.path.exists(cache_dir):
                    os.mkdir(cache_dir)
                logger.log(99, 'Creating cache data for "' + uri + '"')
                with data_iterator_csv_dataset(uri, batch_size, shuffle, normalize=False, cache_dir=cache_dir) as di:
                    index = 0
                    while index < di.size:
                        progress('', (1.0 * di.position) / di.size)
                        di.next()
                        index += batch_size
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            if cache_dir and not os.path.exists(cache_dir):
                os.mkdir(cache_dir)
            dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                uri, batch_size, shuffle, normalize=dataset.normalize, cache_dir=cache_dir))
        else:
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
    else:
        dataset.data_iterator = None
    return dataset
Example 12
def train_command(args):
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class TrainConfig:
        pass

    config = TrainConfig()
    info = load.load(files)

    logger.log(99, 'Train with contexts {}'.format(available_contexts))

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if max_iter > 0:

        data_iterators = {'optimizer': {}, 'monitor': {}}
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
            train(args, config)

    else:
        # save parameters without training (0 epoch learning)
        save_parameters(os.path.join(args.outdir, 'parameters.h5'))

    logger.log(99, 'Training Completed.')
    progress(None)
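
The ExitStack block above opens one data iterator per optimizer and monitor and guarantees that all of them are closed once train() returns. A minimal standalone illustration of that pattern, with toy context managers standing in for data iterators:

from contextlib import ExitStack, contextmanager

@contextmanager
def fake_iterator(name):
    print('open', name)
    yield name
    print('close', name)

with ExitStack() as stack:
    iterators = [stack.enter_context(fake_iterator(n))
                 for n in ('optimizer', 'monitor')]
    print('training with', iterators)
# every entered context manager is closed here, in reverse order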
Example 13
def profile_command(args):
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)

    class TrainConfig:
        pass

    config = TrainConfig()
    info = load.load(files)

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    ext_module = import_extension_module(
        config.global_config.default_context.backend[0].split(':')[0])

    def synchronize():
        return ext_module.synchronize(
            device_id=config.global_config.default_context.device_id)

    result_array = [['time in ms']]

    # Profile Optimizer
    with ExitStack() as stack:
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(o.optimizer.data_iterator())
        result_array = profile_optimizer(config, result_array, synchronize)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'profile.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
    return True
Example 14
def _wait():
    import time
    count = 0
    while not _finish:
        if count > 10000:
            logger.log(99, "STALLED MPI RANK {}".format(comm.rank))
            os.kill(os.getpid(), 9)
        time.sleep(0.01)
        count += 1
Example 15
def train_command(args):
    logger.log(99, 'Train with contexts {}'.format(available_contexts))

    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if max_iter > 0:

        data_iterators = {'optimizer': {}, 'monitor': {}}
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
            train(args, config)

    else:
        # save parameters without training (0 epoch learning)
        save_parameters(os.path.join(
            args.outdir, 'parameters.h5'))

    logger.log(99, 'Training Completed.')
    progress(None)
Example 16
def _wait():
    import time
    import sys
    count = 0
    while not _finish:
        if count > 10000:
            logger.log(99, "STALLED MPI RANK {}".format(comm.rank))
            sys.exit(-1)
        time.sleep(0.01)
        count += 1
Example 17
def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator):
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization

    comm = current_communicator()

    # use same random state for each process until slice is called
    rng = numpy.random.RandomState(0)
    use_memory_cache = comm.size == 1 if comm else True

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None

        # Disable implicit cache creation when MPI is available.
        if cache_dir and (create_cache_explicitly or comm):
            cache_index = os.path.join(cache_dir, "cache_index.csv")
            if not os.path.exists(cache_index) or overwrite_cache:
                if single_or_rankzero():
                    logger.log(99, 'Creating cache data for "' + uri + '"')

                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg

                    with data_iterator_csv_dataset(uri, batch_size, shuffle, rng=rng, normalize=False, cache_dir=cache_dir, with_memory_cache=False) as di:
                        pass

            rng = numpy.random.RandomState(0)
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            if comm:
                logger.critical(
                    'Implicit cache creation does not support with MPI')
                import sys
                sys.exit(-1)
            else:
                if cache_dir:
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg
                dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                    uri, batch_size, shuffle, rng=rng, normalize=dataset.normalize, cache_dir=cache_dir))
        else:
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
    else:
        dataset.data_iterator = None
    return dataset
Example 18
def create_data_csv(seed):
    path = os.path.abspath(os.path.dirname(__file__))
    base_dir = os.path.join(path, 'stl10')
    ensure_dir(base_dir)
    # Create original training set
    logger.log(99, 'Downloading STL10 dataset...')
    output_dir = os.path.join(path, 'download')
    train_di = data_iterator_stl10(5000,
                                   True,
                                   None,
                                   False,
                                   output_dir=output_dir)
    logger.log(99, 'Creating "stl10_training.csv"... ')
    train_csv = data_iterator_to_csv(base_dir, 'stl10_training.csv',
                                     'training', train_di)
    train_csv, val_csv = split_data_into_train_val(train_csv,
                                                   val_size=1000,
                                                   seed=seed)
    save_list_to_csv(train_csv, base_dir,
                     'stl10_training' + '_' + str(seed) + '.csv')
    save_list_to_csv(val_csv, base_dir,
                     'stl10_validation' + '_' + str(seed) + '.csv')

    # Validation
    validation_di = data_iterator_stl10(8000,
                                        False,
                                        None,
                                        False,
                                        output_dir=output_dir)
    logger.log(99, 'Creating "stl10_test.csv"... ')
    _ = data_iterator_to_csv(base_dir, 'stl10_test.csv', 'validation',
                             validation_di)
    logger.log(99, 'Dataset creation completed successfully.')
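
split_data_into_train_val() is not shown in this listing; a hypothetical stand-in for a seeded split would shuffle row indices with a fixed seed so that the same seed always carves off the same validation subset:

import numpy

rows = [['image_{}.png'.format(i), i % 10] for i in range(20)]
rng = numpy.random.RandomState(0)
order = rng.permutation(len(rows))
val_size = 5
val_rows = [rows[i] for i in order[:val_size]]
train_rows = [rows[i] for i in order[val_size:]]
print(len(train_rows), len(val_rows))   # 15 5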
Example 19
def profile_command(args):
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    result_array = [['time in ms']]

    # Profile Optimizer
    with ExitStack() as stack:
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        result_array = profile_optimizer(config, result_array)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'profile.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
Example 20
    def read_s3_object(self, key):
        retry = 1
        result = ''
        while True:
            if retry > 10:
                logger.log(99, 'read_s3_object() retry count over give up.')
                # A bare `raise` has no active exception here; fail explicitly.
                raise RuntimeError('read_s3_object() gave up after 10 retries.')
            try:
                result = self._s3_bucket.Object(key).get()['Body'].read()
                break
            except:
                logger.log(
                    99, 'read_s3_object() fails retrying count {}/10.'.format(retry))
                retry += 1

        return result
Example 21
def train(args, config):
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    logger.log(
        99, 'Training epoch 1 of {} begin'.format(
            config.training_config.max_epoch))

    class Cost:
        pass

    cost = Cost()
    cost.sum_epoch = 0.0
    cost.sum_iter = 0.0
    cost.variables = None

    best_error = None

    for iter in range(max_iter):
        cost = _update(iter, config, cost)

        if (iter + 1) % config.training_config.iter_per_epoch == 0:
            # End of epoch
            epoch = iter // config.training_config.iter_per_epoch + 1
            cost_avg_epoch = cost.sum_epoch / config.training_config.iter_per_epoch
            monitoring_report = []

            # Evaluation
            error_str = ''
            if epoch % 10 == 0 or epoch <= 5:
                best_error, error_str = _evaluate(args, config,
                                                  monitoring_report,
                                                  best_error)

            # Write to monitoring_report.yml
            f = open(os.path.join(args.outdir, 'monitoring_report.yml'), 'a')
            f.write('{}:\n'.format(epoch - 1))
            f.write('  cost: {}\n'.format(cost_avg_epoch))
            for str in monitoring_report:
                f.write(str)
            f.close()
            cost.sum_epoch = 0

            logger.log(
                99, 'epoch {} of {} cost={:.6f} {}'.format(
                    epoch, config.training_config.max_epoch, cost_avg_epoch,
                    error_str))
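
The end-of-epoch bookkeeping relies on integer arithmetic between the running iteration index and iter_per_epoch; in isolation:

iter_per_epoch = 100
for iteration in (99, 100, 199):
    if (iteration + 1) % iter_per_epoch == 0:
        epoch = iteration // iter_per_epoch + 1
        print('iteration', iteration, 'closes epoch', epoch)
# iteration 99 closes epoch 1
# iteration 199 closes epoch 2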
Example 22
        def get_data(args):
            pos = args[0]
            q = args[1]
            retry = 1
            while True:
                if retry > 10:
                    logger.log(
                        99, '_get_current_data() retry count over give up.')
                    # A bare `raise` has no active exception here; fail explicitly.
                    raise RuntimeError('_get_data() gave up after 10 retries.')
                d = self._data_source._get_data(pos)
                if d is not None:
                    break
                logger.log(99, '_get_data() fails. retrying count {}/10.'.format(
                           retry))
                retry += 1

            q.put((pos, d))
Example 23
def _get_current_parameter(args):

    globname = os.path.join(args.outdir, 'results_current_*.nnp')
    exists = glob.glob(globname)

    if len(exists) > 0:
        ex_list = {}

        for ex in exists:
            n = int(ex.rsplit('_', 1)[1].rsplit('.', 1)[0])
            ex_list[n] = ex

        last_epoch = sorted(ex_list.keys(), reverse=True)[0]
        last_parameter = ex_list[last_epoch]
        logger.log(99, "Load parameter from [{}]".format(
            os.path.basename(last_parameter)))
        load.load([last_parameter], parameter_only=True)
        return last_epoch

    return 0
Example 24
def train(args, config):
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    logger.log(99, 'Training epoch 1 of {} begin'.format(
        config.training_config.max_epoch))

    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.sum_iter = 0.0
    cost.variables = None

    best_error = None

    for iter in range(max_iter):
        cost = _update(iter, config, cost)

        if (iter + 1) % config.training_config.iter_per_epoch == 0:
            # End of epoch
            epoch = iter // config.training_config.iter_per_epoch + 1
            cost_avg_epoch = cost.sum_epoch / config.training_config.iter_per_epoch
            monitoring_report = []

            # Evaluation
            error_str = ''
            if epoch % 10 == 0 or epoch <= 5:
                best_error, error_str = _evaluate(
                    args, config, monitoring_report, best_error)

            # Write to monitoring_report.yml
            f = open(os.path.join(args.outdir, 'monitoring_report.yml'), 'a')
            f.write('{}:\n'.format(epoch - 1))
            f.write('  cost: {}\n'.format(cost_avg_epoch))
            for str in monitoring_report:
                f.write(str)
            f.close()
            cost.sum_epoch = 0

            logger.log(99, 'epoch {} of {} cost={:.6f} {}'.format(
                epoch, config.training_config.max_epoch, cost_avg_epoch, error_str))
Example 25
def create_communicator(ignore_error=False):
    global _current_communicator

    from nnabla.ext_utils import get_extension_context
    extension_module = "cudnn"
    context = get_extension_context(extension_module)
    try:
        logger.log(99, 'Create communicator with contexts {}'.format(context))
        _current_communicator = C.MultiProcessCommunicator(context)
        _current_communicator.init()
        context.device_id = str(_current_communicator.rank %
                                _current_communicator.size)
        if _current_communicator.size == 1:
            _current_communicator = None
    except:
        if not ignore_error:
            raise
        logger.warning("Failed to initialize nnabla.communicators.")
        _current_communicator = None

    return _current_communicator
Example 26
    def _get_next_data(self, filename, file_names_to_prefetch, retry=1):
        if retry > 10:
            logger.log(99, '_get_next_data() retry count over give up.')
            # A bare `raise` has no active exception here; fail explicitly.
            raise RuntimeError('_get_next_data() gave up after 10 retries.')
        if self._cache_type == '.npy':
            next_data = self._cache_reader_with_prefetch.open_and_prefetch_cache(
                filename, file_names_to_prefetch)
        else:
            # h5 format
            next_data = {}
            with self._filereader.open_cache(filename) as cache:
                for k, v in cache.items():
                    next_data[k] = v[()]

        if current_communicator():
            if set(self._variables) != set(next_data.keys()):
                logger.log(99, '_get_next_data() fails at worker {} retrying count {}/10.'.format(
                    current_communicator().rank, retry))
                sleep(0.01)
                return self._get_next_data(filename, file_names_to_prefetch, retry+1)
        return next_data
Example 27
def profile(config, name, func, result_dict, synchronize):
    # Warm-up
    func()
    synchronize()

    # Profile
    start_0 = time.time()
    result = 0
    count = 0
    while time.time() < start_0 + 1.0 or count < 100:
        start = time.time()
        func()
        synchronize()
        stop = time.time()
        result += stop - start
        count += 1

    t = result * 1000 / count

    logger.log(99, '%s %f(ms)' % (name, t))
    result_dict[name] = t
    return result_dict
Example 28
    def next(self):
        '''next

        It generates a tuple of data.

        For example,
        if :py:meth:`self._variables == ('x', 'y')`,
        this method returns :py:meth:` ( [[X] * batch_size], [[Y] * batch_size] )`.

        Returns:
            tuple: tuple of data for mini-batch in numpy.ndarray.
        '''
        if not self._use_thread:
            self._next()
        data, n_reset = self._queue.get()
        self._queue.task_done()
        if self._use_thread:
            self._next_thread.join()
        if data is None:
            if self._stop_exhausted and self._data_source.position + self._batch_size >= self._size:
                raise StopIteration
            if self._use_thread:
                logger.log(99, 'next() got None retrying...')
                self._next_thread = threading.Thread(target=self._next)
                self._next_thread.start()
                data, n_reset = self._queue.get()
                self._queue.task_done()
                self._next_thread.join()
        if self._use_thread:
            self._next_thread = threading.Thread(target=self._next)
            self._next_thread.start()
        for _ in range(n_reset):
            if self._current_epoch >= 0:
                self._callback_epoch_end()
            self._current_epoch += 1
            self._callback_epoch_begin()
        return data
Example 29
def compare_optimizer(config, parameters, config_cpu, parameters_cpu, result_array):
    loaded_datas = {}
    for opt, opt_cpu in zip(config.optimizers.values(), config_cpu.optimizers.values()):
        o = opt.optimizer
        o_cpu = opt_cpu.optimizer  # compare against the CPU-side optimizer, not the same object
        opts = [o, o_cpu]

        result_name = "optimizer '%s' with network '%s'" % (
            o.name, o.network.name)
        result_dict = OrderedDict()

        logger.log(99, 'Comparing ' + result_name + ' ...')
        logger.log(
            99, 'process(func, variable), norm_diff, current_context_std, cpu_std, diff_std')
        # Start comparison with same parameters
        for p, p_cpu in zip(parameters.values(), parameters_cpu.values()):
            p_cpu.d = p.d

        # Load dataset
        di = opt.data_iterator
        if di not in loaded_datas:
            loaded_datas[di] = di.next()
        datas = loaded_datas[di]

        for v, d in o.dataset_assign.items():
            let_data_to_variable(v.variable_instance, datas[
                                 di.variables.index(d)])
        for v, d in o_cpu.dataset_assign.items():
            let_data_to_variable(v.variable_instance, datas[
                                 di.variables.index(d)])

        # Generate data
        generated = {}
        for v, generator in o.generator_assign.items():
            generated[v.name] = generator(v.shape)
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generated[v.name], ctx=dest_context)
        for v, generator in o_cpu.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generated[v.name], ctx=dest_context)

        last_max_diff = 1e-5

        # Forward
        for func, func_cpu in zip(o.forward_sequence, o_cpu.forward_sequence):
            o.network.forward_function(func)
            o_cpu.network.forward_function(func_cpu)
            large_diff = False
            for v, v_cpu in zip(func.outputs, func_cpu.outputs):
                name = 'forward_function (%s, %s)' % (func.name, v.name)
                if v.variable_instance.d.shape != v_cpu.variable_instance.d.shape:
                    logger.log(99, 'Variable shape is different in %s (current_context=%s, cpu=%s)' % (
                        v.name, str(v.variable_instance.d.shape), str(v_cpu.variable_instance.d.shape)))
                norm_diff, std1, std2, diff_std = calc_norm_diff(
                    v.variable_instance.d, v_cpu.variable_instance.d)
                logger.log(99, '%s, %f, %f, %f, %f' %
                           (name, norm_diff, std1, std2, diff_std))
                result_dict[name] = norm_diff
                if norm_diff > last_max_diff:
                    if norm_diff > last_max_diff * 10:
                        logger.log(99, '  current_context(data)=' +
                                   str(v.variable_instance.d.flatten()))
                        logger.log(99, '  cpu(data)=' +
                                   str(v_cpu.variable_instance.d.flatten()))
                        large_diff = True
                    last_max_diff = norm_diff
            if large_diff:
                logger.log(99, '  x_data:')
                for v, v_cpu in zip(func.inputs, func_cpu.inputs):
                    logger.log(99, '    current_context(%s.d)=%s' %
                               (v.name, str(v.variable_instance.d.flatten())))
                    logger.log(99, '    cpu(%s.d)=%s' % (
                        v_cpu.name, str(v_cpu.variable_instance.d.flatten())))

        # Backward
        o.network.prepare_backward(o.backward_sequence)
        o_cpu.network.prepare_backward(o_cpu.backward_sequence)
        for seq, seq_cpu in zip(o.backward_sequence.sequence, o_cpu.backward_sequence.sequence):
            o.network.backward_function(seq)
            o_cpu.network.backward_function(seq_cpu)
            large_diff = False
            for v, v_cpu in zip(seq.func.inputs, seq_cpu.func.inputs):
                if v.variable_instance.need_grad:
                    name = 'backward_function (%s, %s)' % (
                        seq.func.name, v.name)
                    norm_diff, std1, std2, diff_std = calc_norm_diff(
                        v.variable_instance.g, v_cpu.variable_instance.g)
                    logger.log(99, '%s, %f, %f, %f, %f' %
                               (name, norm_diff, std1, std2, diff_std))
                    result_dict[name] = norm_diff
                    if norm_diff > last_max_diff:
                        if norm_diff > last_max_diff * 10:
                            logger.log(99, '  current_context(diff)=' + str(
                                v.variable_instance) + str(v.variable_instance.g.flatten()))
                            logger.log(99, '  cpu(diff)=' + str(v_cpu.variable_instance) +
                                       str(v_cpu.variable_instance.g.flatten()))
                            large_diff = True
                        last_max_diff = norm_diff
            if large_diff:
                logger.log(99, '  x_data:')
                for v, v_cpu in zip(seq.func.inputs, seq_cpu.func.inputs):
                    logger.log(99, '    current_context(%s.d)=%s' %
                               (v.name, str(v.variable_instance.d.flatten())))
                    logger.log(99, '    cpu(%s.d)=%s' % (
                        v_cpu.name, str(v_cpu.variable_instance.d.flatten())))
                logger.log(99, '  y_diff:')
                for v, v_cpu in zip(seq.func.outputs, seq_cpu.func.outputs):
                    logger.log(99, '    current_context(%s.g)=%s' %
                               (v.name, str(v.variable_instance.g.flatten())))
                    logger.log(99, '    cpu(%s.g)=%s' % (
                        v_cpu.name, str(v_cpu.variable_instance.g.flatten())))

        # Update (weight decay)
        if o.weight_decay > 0:
            o.solver.weight_decay(o.weight_decay)
            o_cpu.solver.weight_decay(o_cpu.weight_decay)

        # Update
        o.solver.update()
        o_cpu.solver.update()
        for i, (v, lr) in enumerate(o.parameter_learning_rate_multipliers.items()):
            v_cpu = list(o_cpu.parameter_learning_rate_multipliers.items())[i][0]
            if lr > 0:
                name = 'update (%s, %s)' % (o.solver.name, v.name)
                norm_diff, std1, std2, diff_std = calc_norm_diff(
                    v.variable_instance.d, v_cpu.variable_instance.d)
                logger.log(99, '%s, %f, %f, %f, %f' %
                           (name, norm_diff, std1, std2, diff_std))
                result_dict[name] = norm_diff

        result_array = add_result(result_name, result_dict, result_array)

    return result_array
Example 30
def train_command(args):
    callback.update_status(args)

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config],
                     prepare_data_iterator=None,
                     exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass

    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        load.load([args.param], parameter_only=True)

    config.timelimit = callback.get_timelimit(args)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    restart = False
    if max_iteration > 0:
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            # Create data_iterator instance only once for each dataset in optimizers
            optimizer_data_iterators = {}
            for name, o in config.optimizers.items():
                for di in o.optimizer.data_iterators.values():
                    if di not in optimizer_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        optimizer_data_iterators[di] = di_instance
                    else:
                        di_instance = optimizer_data_iterators[di]
                    o.data_iterators.append(di_instance)

            # Create data_iterator instance only once for each dataset in monitors
            monitor_data_iterators = {}
            for name, m in config.monitors.items():
                for di in m.monitor.data_iterators.values():
                    if di not in monitor_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        monitor_data_iterators[di] = di_instance
                    else:
                        di_instance = monitor_data_iterators[di]
                    m.data_iterators.append(di_instance)
            monitor_data_iterators.update(optimizer_data_iterators)

            result, restart = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, None, 0, config, True)
        result = True

    if single_or_rankzero() and not restart:
        if result:
            logger.log(99, 'Training Completed.')
            callback.update_status('finished')
        else:
            logger.log(99, 'Training Incompleted.')
            callback.update_status('failed')
    if single_or_rankzero():
        progress(None)
    return True
Example 31
def _train(args, config):
    global _save_parameter_info
    comm = current_communicator()
    _CGLOAD_LOG_INTERVAL = 20

    best_epoch = None
    best_error = None
    last_epoch = 0
    if args.resume:
        last_epoch, best_epoch, best_error = _get_current_parameter(args)
        if best_epoch is not None:
            logger.log(
                99, "Best error {} recorded at epoch {} in previous training.".
                format(best_error, best_epoch))
            if best_epoch > last_epoch:
                logger.log(
                    99,
                    "Resumed epoch is {} but this training keep this result.".
                    format(last_epoch))
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    callback.update_status(('epoch.max', config.training_config.max_epoch))
    callback.update_status(
        ('epoch.current',
         last_epoch + 1 if last_epoch < config.training_config.max_epoch else
         config.training_config.max_epoch))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    class Cost:
        pass

    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    class TimeInfo:
        pass

    timeinfo = TimeInfo()
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:

            timeinfo.start_time = time.time()
            timeinfo.last_epoch_start_time = timeinfo.start_time

            callback.update_status('processing', True, timeinfo.start_time)

            for iteration in range(last_iteration, max_iteration):

                # instant load measurement
                measure_cpu_gpu_instant_load()

                cost = _update(iteration, config, cost)

                if np.isnan(cost.sum_epoch) or np.isinf(cost.sum_epoch):
                    logger.log(99, 'Cost is Nan')
                    return False, False

                timeinfo = _calc_estimate_time(timeinfo, max_iteration,
                                               last_iteration, iteration + 1)
                callback.update_time_train(prediction=timeinfo.estimate_time)

                if 0 < config.timelimit < timeinfo.estimate_time:
                    logger.log(
                        99,
                        'Expected training time ({:.3f}s) will exceed time limit ({}s).'
                        .format(timeinfo.estimate_time, config.timelimit))
                    return False, False

                if (iteration +
                        1) % config.training_config.iter_per_epoch == 0:
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    # Cpu/Gpu average load
                    cg_load_str = ''
                    cgload_log = ''
                    cg_load = get_cpu_gpu_average_load()
                    if cg_load:
                        cg_load_str = 'epoch {} average_load_matrix: {}'.format(
                            epoch, cg_load)
                        span = _calc_epoch_span(timeinfo)
                        if span > _CGLOAD_LOG_INTERVAL:
                            cgload_log = _format_cgload_log(cg_load)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(
                            os.path.join(args.outdir, 'monitoring_report.yml'),
                            'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write('  cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()

                        callback.update_status(
                            (['monitoring_report', epoch,
                              'cost'], cost_avg_epoch))

                        _save_parameters(args, 'current', epoch, config)

                        callback.update_status(('epoch.current', epoch))
                        callback.update_status()

                        logger.log(
                            99,
                            'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s) {}'
                            .format(epoch, config.training_config.max_epoch,
                                    cost_avg_epoch, error_str,
                                    timeinfo.past_time, timeinfo.estimate_time,
                                    cgload_log))

                        if cg_load_str:
                            # cpu_gpu_average_load record at epoch level
                            callback.update_status(
                                (['cpu_gpu_epoch_load', epoch], cg_load))
                            progress(cg_load_str, 1)

                        if not callback.check_training_time(
                                args, config, timeinfo, epoch, last_epoch):
                            _save_parameters(args, 'current', epoch, config,
                                             True)
                            return False, True

            if single_or_rankzero():
                _save_parameters(args, 'current', epoch, config, True)
    return True, False
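
_calc_estimate_time() is not shown in this listing; a hypothetical linear estimator with the same role, extrapolating remaining time from elapsed time and progress, could look like this:

import time

def estimate_remaining(start_time, done, total):
    elapsed = time.time() - start_time
    return elapsed / done * (total - done) if done else float('inf')

start = time.time()
time.sleep(0.1)
print('about {:.2f}s left'.format(estimate_remaining(start, done=10, total=100)))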
Example 32
def _evaluate(args, config, monitoring_report, best_error, epoch):
    comm = current_communicator()
    error_str = ''
    valid_error = 0.0

    def _sum_error(sum, error):
        ret = None
        if comm:
            # logger.log(99, "Calc error with communicator")
            var = [nn.NdArray()]
            var[0].data = error
            _all_reduce(comm, var, division=False, inplace=True)
            ret = sum + var[0].data
        else:
            ret = sum + error
        return ret

    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        data_size = max([di.size for di in mon.data_iterators])
        batch_size = max([di.batch_size for di in mon.data_iterators])

        for i in range(data_size // batch_size):
            # Load dataset
            data = OrderedDict()
            for di in mon.data_iterators:
                data.update(zip(di.variables, di.next()))

            # Set data to variable
            for v, d in m.dataset_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data[d],
                                     ctx=dest_context,
                                     data_name=d,
                                     variable_name=v.name)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Sum error before forward to prepare input data while processing
            # on GPU
            if error_count > 0:
                error_sum = 0.0
                for v in m.monitor_variables:
                    error_sum += np.mean(v.variable_instance.d)
                    # v.variable_instance.data.zero()
                error_sum_monitor = _sum_error(error_sum_monitor, error_sum)
                if single_or_rankzero():
                    progress(
                        'Evaluating "{0}"'.format(name) +
                        ' : error={0:0.6f}'.format(
                            error_sum_monitor / error_count),
                        di.position * 1.0 / di.size)
            error_count += comm.size if comm else 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset
        error_sum = 0.0
        for v in m.monitor_variables:
            error_sum += np.mean(v.variable_instance.d)
            # v.variable_instance.data.zero()
        error_sum_monitor = _sum_error(error_sum_monitor, error_sum)

        if error_count == 0:
            error = 0
        else:
            error = error_sum_monitor / error_count

        if np.isnan(error) or np.isinf(error):
            logger.log(99, 'Validation error is Nan')
            error = 0.0

        monitoring_report.append('  {}: {}\n'.format(name, error))

        callback.update_status((['monitoring_report', epoch, name], error))
        callback.update_status((['last', name], error))  # save last value

        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    if single_or_rankzero():
        if (not config.training_config.save_best) or \
           (not best_error) or \
           (best_error is not None and valid_error <= best_error):
            best_error = valid_error
            callback.update_status(('best.valid_error', best_error))
            callback.update_status(('best.epoch', epoch))
            _save_parameters(args, 'best', epoch, config, True)

    return best_error, error_str
Example 33
def profile_optimizer(config, result_array):
    # Profile Training
    for opt in config.optimizers.values():
        o = opt.optimizer
        result_name = "optimizer '%s' with network '%s'" % (
            o.name, o.network.name)
        result_dict = OrderedDict()

        logger.log(99, 'Profiling ' + result_name + ' ...')

        # Load dataset
        def load_dataset():
            loaded_datas = {}
            di = opt.data_iterator
            loaded_datas[di] = di.next()
            return loaded_datas
        profile(config, 'load_dataset', load_dataset, result_dict)

        # Let data
        loaded_datas = load_dataset()
        for v, d in o.dataset_assign.items():
            def let_data():
                try:
                    data = loaded_datas[opt.data_iterator][
                        opt.data_iterator.variables.index(d)]
                except:
                    print(opt.data_iterator.variables)
                    raise ValueError(
                        'Data "' + d + '" is not found in dataset.')
                let_data_to_variable(v.variable_instance, data=data)
            profile(config, 'let_data (%s to %s)' %
                    (d, v.name), let_data, result_dict)

        # Generate data
        for v, generator in o.generator_assign.items():
            def generate_data():
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape))
            profile(config, 'generate_data (%s)' %
                    v.name, generate_data, result_dict)

        # Setup (detail)
        for func in o.forward_sequence:
            def setup():
                o.network.setup_function(func)
            profile(config, 'setup_function (%s : %s)' % (
                func.name, func.function_instance.name), setup, result_dict)

        # Forward (detail)
        for func in o.forward_sequence:
            def forward():
                o.network.forward_function(func)
            in_place_str = ' : in_place' if func.function_instance.inplace_data(
                0) > 0 else ''
            profile(config, 'forward_function (%s : %s%s)' % (
                func.name, func.function_instance.name, in_place_str), forward, result_dict)

        # Backward (detail)
        def prepare_backward():
            o.network.prepare_backward(o.backward_sequence)
        profile(config, 'prepare_backward', prepare_backward, result_dict)
        for seq in o.backward_sequence.sequence:
            o.network.prepare_backward(o.backward_sequence)

            def backward():
                o.network.backward_function(seq)
            in_place_str = ' : in_place' if seq.func.function_instance.inplace_grad(
                0) > 0 else ''
            profile(config, 'backward_function (%s : %s%s)' % (
                seq.func.name, seq.func.function_instance.name, in_place_str), backward, result_dict)

        # Forward (all)
        def forward_all():
            o.network.forward(o.forward_sequence)
        profile(config, 'forward_all', forward_all, result_dict)

        # Backward (all)
        def backward_all():
            o.network.backward(o.backward_sequence)
        profile(config, 'backward_all', backward_all, result_dict)

        # Backward (all)
        def backward_all_wo_zero_grad():
            o.network.backward(o.backward_sequence, parameter_zero_grad=False)
        profile(config, 'backward_all(wo param zero_grad)',
                backward_all_wo_zero_grad, result_dict)

        # Update (weight decay)
        if o.weight_decay > 0:
            def weight_decay():
                o.solver.weight_decay(o.weight_decay)
            profile(config, 'weight_decay (%s)' %
                    o.solver.name, weight_decay, result_dict)

        # Update
        def update():
            o.solver.update()
        profile(config, 'update (%s)' % o.solver.name, update, result_dict)

        # Monitor loss
        def monitor_loss():
            for l in o.loss_variables:
                np.mean(l.variable_instance.d)
        profile(config, 'monitor_loss', monitor_loss, result_dict)

        result_array = add_result(result_name, result_dict, result_array)

    return result_array
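
The helpers profile() and add_result() used above are not shown in this example. Below is a minimal sketch of what they could look like, assuming profile() simply times a callable and add_result() appends one column of timings per optimizer; the actual nnabla utilities may differ in detail.

import time


def profile(config, name, func, result_dict, synchronize=None, repeat=100):
    # `config` is accepted only to mirror the call sites above; this sketch
    # does not use it. One warm-up call keeps setup cost out of the timing.
    func()
    if synchronize is not None:
        synchronize()
    start = time.time()
    for _ in range(repeat):
        func()
    if synchronize is not None:
        synchronize()
    result_dict[name] = (time.time() - start) / repeat  # mean seconds per call


def add_result(result_name, result_dict, result_array):
    # Append one column per optimizer; the first row and first column hold
    # labels. Assumes every optimizer reports the same steps in the same order.
    if len(result_array) == 1:
        result_array += [[k] for k in result_dict]
    result_array[0].append(result_name)
    for i, v in enumerate(result_dict.values()):
        result_array[i + 1].append('%f' % v)
    return result_array
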
Example 34
def forward_command(args):
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class ForwardConfig:
        pass
    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            logger.critical('Network {} is not found.'.format(
                e.network.name))
            return

    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset:
            normalize = d.normalize
    data_iterator = (lambda: data_iterator_csv_dataset(
        args.dataset, config.networks[0].batch_size, False, normalize=normalize))

    # load dataset as csv
    with open(args.dataset, 'rt') as f:
        rows = [row for row in csv.reader(f)]
    row0 = rows.pop(0)
    root_path = os.path.dirname(args.dataset)
    root_path = os.path.abspath(root_path.replace('/|\\', os.path.sep))
    rows = list(map(lambda row: list(map(lambda x: x if is_float(
        x) else compute_full_path(root_path, x), row)), rows))

    with data_iterator() as di:
        index = 0
        while index < di.size:
            data = di.next()
            result, outputs = forward(args, index, config, data, di.variables)
            if index == 0:
                for name, dim in zip(result.names, result.dims):
                    if dim == 1:
                        row0.append(name)
                    else:
                        for d in range(dim):
                            row0.append(name + '__' + str(d))
            for i, output in enumerate(outputs):
                if index + i < len(rows):
                    rows[index + i].extend(output)
            index += len(outputs)
            logger.log(
                99, 'data {} / {}'.format(min([index, len(rows)]), len(rows)))

    with open(os.path.join(args.outdir, 'output_result.csv'), 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(row0)
        writer.writerows(rows)

    logger.log(99, 'Forward Completed.')
    progress(None)
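
The CSV pre-processing above keeps numeric cells as-is and turns every other cell into an absolute path via is_float() and compute_full_path(). A rough sketch of those two helpers, written here only to make the example self-contained (the real nnabla utilities may behave differently):

import os


def is_float(value):
    # A cell counts as numeric data if it parses as a float.
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False


def compute_full_path(root_path, file_path):
    # Non-numeric cells are treated as file references relative to the
    # directory that contains the dataset CSV.
    full_path = os.path.join(root_path, file_path)
    return full_path.replace('\\', os.path.sep).replace('/', os.path.sep)
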
Example 35
def forward_command(args):
    callback.update_status(args)

    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)
    batch_size = args.batch_size
    if batch_size < 1:
        batch_size = None

    class ForwardConfig:
        pass

    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False, batch_size=batch_size)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            logger.critical('Network {} is not found.'.format(
                e.network.name))
            return False

    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset or d.cache_dir == args.dataset:
            normalize = d.normalize
    for e in config.executors:
        normalize = normalize and not e.no_image_normalization

    orders = {}
    # With CSV
    if os.path.splitext(args.dataset)[1] == '.csv':
        data_iterator = (lambda: data_iterator_csv_dataset(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize,
            with_memory_cache=False,
            with_file_cache=False))

        # load dataset as csv
        filereader = FileReader(args.dataset)
        with filereader.open(textmode=True, encoding='utf-8-sig') as f:
            rows = [row for row in csv.reader(f)]
        row0 = rows.pop(0)
        if args.replace_path:
            root_path = os.path.dirname(args.dataset)
            root_path = os.path.abspath(root_path.replace('/|\\', os.path.sep))
        else:
            root_path = '.'
        rows = [row for row in rows if len(row)]
        rows = list(
            map(
                lambda row: list(
                    map(
                        lambda i, x: x if row0[i][0] == '#' or is_float(
                            x) else compute_full_path(root_path, x),
                        range(len(row)), row)), rows))
        for i in range(len(rows)):
            orders[i] = i
    # With Cache
    elif os.path.splitext(args.dataset)[1] == '.cache':
        data_iterator = (lambda: data_iterator_cache(uri=args.dataset,
                                                     batch_size=config.
                                                     networks[0].batch_size,
                                                     shuffle=False,
                                                     normalize=normalize))

        # Get original CSV
        original_csv = os.path.join(args.dataset, 'original.csv')
        try:
            # load dataset as csv
            filereader = FileReader(original_csv)
            with filereader.open(textmode=True, encoding='utf-8-sig') as f:
                rows = [row for row in csv.reader(f)]
            row0 = rows.pop(0)
            root_path = '.'
            rows = list(
                map(
                    lambda row: list(
                        map(
                            lambda x: x if is_float(x) else compute_full_path(
                                root_path, x), row)), rows))
        except:
            print('Cannot open', original_csv)
            pass

        # Get original Data order.
        order_csv = os.path.join(args.dataset, 'order.csv')
        try:
            filereader = FileReader(order_csv)
            with filereader.open(textmode=True) as f:
                for original, shuffled in [[int(x) for x in row]
                                           for row in csv.reader(f)]:
                    orders[original] = shuffled
        except:
            print('Cannot open', order_csv)
            for i in range(len(rows)):
                orders[i] = i
    else:
        print('Unsupported extension "{}" in "{}".'.format(
            os.path.splitext(args.dataset)[1], args.dataset))
        return False

    callback.update_status(('data.max', len(rows)))
    callback.update_status(('data.current', 0))
    callback.update_status('processing', True)

    result_csv_filename = os.path.join(args.outdir, args.outfile)
    with open(result_csv_filename, 'w', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        with data_iterator() as di:
            index = 0
            while index < di.size:
                data = di.next()
                result, outputs = _forward(args, index, config, data,
                                           di.variables)
                if index == 0:
                    for name, dim in zip(result.names, result.dims):
                        if dim == 1:
                            if e.repeat_evaluation_type == "std":
                                name = "Uncertainty(Std)"
                            row0.append(name)
                        else:
                            for d in range(dim):
                                row0.append(name + '__' + str(d))
                    writer.writerow(row0)
                for i, output in enumerate(outputs):
                    if index + i < len(rows):
                        import copy
                        row = copy.deepcopy(rows[orders[index + i]])
                        row.extend(output)
                        writer.writerow(row)
                index += len(outputs)

                callback.update_status(('data.current', min([index,
                                                             len(rows)])))
                callback.update_forward_time()
                callback.update_status()

                logger.log(
                    99, 'data {} / {}'.format(min([index, len(rows)]),
                                              len(rows)))

    callback.process_evaluation_result(args.outdir, result_csv_filename)

    logger.log(99, 'Forward Completed.')
    progress(None)

    callback.update_status(('output_result.csv_header', ','.join(row0)))
    callback.update_status(('output_result.column_num', len(row0)))
    callback.update_status(('output_result.data_num', len(rows)))
    callback.update_status('finished')

    return True
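
A note on the output format: columns produced by the forward pass are appended to the right of the original dataset columns, scalar outputs keep their name (or become "Uncertainty(Std)" for std-type repeat evaluation), and vector outputs are expanded as "<name>__<index>". A short, purely illustrative way to inspect the file written above ('output_result.csv' stands in for args.outfile):

import csv

with open('output_result.csv', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    header = next(reader)     # original columns followed by the output columns
    first_row = next(reader)  # one input row together with its forward outputs
print(header)
print(first_row)
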
Example 36
def profile_optimizer(config, result_array, synchronize):
    # Profile Training
    for opt in config.optimizers.values():
        o = opt.optimizer
        result_name = "optimizer '%s' with network '%s'" % (o.name,
                                                            o.network.name)
        result_dict = OrderedDict()

        logger.log(99, 'Profiling ' + result_name + ' ...')
        # Clear weight
        for name, p in o.parameters.items():
            if name[-2:] in ('/W', '/b'):
                p.data.zero()

        # Load dataset
        def load_dataset():
            loaded_data = {}
            di = opt.data_iterator
            loaded_data[di] = di.next()
            return loaded_data

        profile(config, 'load_dataset', load_dataset, result_dict, synchronize)

        # Let data
        loaded_data = load_dataset()
        for v, d in o.dataset_assign.items():

            def let_data():
                try:
                    data = loaded_data[opt.data_iterator][
                        opt.data_iterator.variables.index(d)]
                except:
                    print(opt.data_iterator.variables)
                    raise ValueError('Data "' + d +
                                     '" is not found in dataset.')
                let_data_to_variable(v.variable_instance,
                                     data=data,
                                     data_name=d,
                                     variable_name=v.name)

            profile(config, 'let_data (%s to %s)' % (d, v.name), let_data,
                    result_dict, synchronize)

        # Generate data
        for v, generator in o.generator_assign.items():

            def generate_data():
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     variable_name=v.name)

            profile(config, 'generate_data (%s)' % v.name, generate_data,
                    result_dict, synchronize)
        '''
        # Setup (detail)
        for func in o.forward_sequence:
            def setup():
                o.network.setup_function(func)
            profile(config, 'setup_function (%s : %s)' % (
                func.name, func.function_instance.name), setup, result_dict, synchronize)
        '''
        # Warm-up
        o.network.forward(o.forward_sequence)
        o.network.prepare_backward(o.backward_sequence)
        o.network.backward(o.backward_sequence)

        # Forward (detail)
        for func in o.forward_sequence:

            def forward():
                o.network.forward_function(func)

            in_place_str = ' : in_place' if func.function_instance.inplace_data(
                0) > 0 else ''
            profile(
                config, 'forward_function (%s : %s%s)' %
                (func.name, func.function_instance.name, in_place_str),
                forward, result_dict, synchronize)

        # Backward (detail)
        def prepare_backward():
            o.network.prepare_backward(o.backward_sequence)

        profile(config, 'prepare_backward', prepare_backward, result_dict,
                synchronize)
        for seq in o.backward_sequence.sequence:
            o.network.prepare_backward(o.backward_sequence)

            def backward():
                o.network.backward_function(seq)

            in_place_str = ' : in_place' if seq.func.function_instance.inplace_grad(
                0) > 0 else ''
            profile(
                config, 'backward_function (%s : %s%s)' %
                (seq.func.name, seq.func.function_instance.name, in_place_str),
                backward, result_dict, synchronize)

        # Forward (all)
        def forward_all():
            o.network.forward(o.forward_sequence)

        profile(config, 'forward_all', forward_all, result_dict, synchronize)

        # Backward (all)
        def backward_all():
            o.network.backward(o.backward_sequence)

        profile(config, 'backward_all', backward_all, result_dict, synchronize)

        # Backward (all, without parameter zero_grad)
        def backward_all_wo_zero_grad():
            o.network.backward(o.backward_sequence, parameter_zero_grad=False)

        profile(config, 'backward_all(wo param zero_grad)',
                backward_all_wo_zero_grad, result_dict, synchronize)

        # Update (weight decay)
        if o.weight_decay > 0:

            def weight_decay():
                o.solver.weight_decay(o.weight_decay)

            profile(config, 'weight_decay (%s)' % o.solver.name, weight_decay,
                    result_dict, synchronize)

        # Update
        def update():
            o.solver.update()

        profile(config, 'update (%s)' % o.solver.name, update, result_dict,
                synchronize)

        # Monitor loss
        def monitor_loss():
            for l in o.loss_variables:
                np.mean(l.variable_instance.d)

        profile(config, 'monitor_loss', monitor_loss, result_dict, synchronize)

        result_array = add_result(result_name, result_dict, result_array)

    return result_array
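
Compared with Example 33, every profile() call now receives a synchronize argument, presumably a callable that blocks until asynchronously queued device work has finished so GPU kernels are not under-measured. A hypothetical way to build such a callable; the accelerator call below is an assumption, not a documented nnabla API:

def make_synchronize(ext_module=None, device_id='0'):
    # CPU execution is synchronous, so a no-op is enough in that case.
    if ext_module is None:
        return lambda: None
    # For an accelerator extension, invoke its blocking-synchronization
    # primitive here; the attribute name is assumed for illustration.
    return lambda: ext_module.synchronize(device_id=device_id)
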
Example 37
def train_command(args):

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass

    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        load.load([args.param], parameter_only=True)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    if max_iteration > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
                if comm and comm.size > 1:
                    o.data_iterator = o.data_iterator.slice(
                        rng, comm.size, comm.rank)
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
                if comm and comm.size > 1:
                    m.data_iterator = m.data_iterator.slice(
                        rng, comm.size, comm.rank)
            result = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, 'current', 0, True)
        result = True

    if single_or_rankzero():
        if result:
            logger.log(99, 'Training Completed.')
        else:
            logger.log(99, 'Training Incomplete.')
    if single_or_rankzero():
        progress(None)

    return True
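
The multi-process branch above gives each MPI rank its own shard of the data and divides iter_per_epoch by comm.size. A small stand-alone sketch of that split using DataIterator.slice(); the dataset path, batch size, and rank values are illustrative only:

import numpy as np
from nnabla.utils.data_iterator import data_iterator_csv_dataset

comm_size, comm_rank = 4, 1   # normally taken from current_communicator()
rng = np.random.RandomState(comm_rank)

di = data_iterator_csv_dataset('train.csv', 64, shuffle=True)
if comm_size > 1:
    # Same call as in train_command(): keep roughly 1/comm_size of the data.
    di = di.slice(rng, comm_size, comm_rank)
batch = di.next()
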
Example 38
def compare_with_cpu_command(args):
    configure_progress(os.path.join(args.outdir, 'progress.txt'))

    class TrainConfig:
        pass

    class OptConfig:
        pass

    class MonConfig:
        pass

    # Load config with current context
    files = []
    files.append(args.config)

    with nn.parameter_scope('current'):
        info = load.load(files)
        parameters = get_parameters(grad_only=False)

    config = TrainConfig()
    config.global_config = info.global_config
    config.training_config = info.training_config

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Load config with cpu context
    files = []
    files.append(args.config2)

    with nn.parameter_scope('cpu'):
        info_cpu = load.load(files)
        cpu_parameters = get_parameters(grad_only=False)

    config_cpu = TrainConfig()
    config_cpu.global_config = info_cpu.global_config
    config_cpu.training_config = info_cpu.training_config

    config_cpu.optimizers = OrderedDict()
    for name, opt in info_cpu.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config_cpu.optimizers[name] = o

    config_cpu.monitors = OrderedDict()
    for name, mon in info_cpu.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config_cpu.monitors[name] = m

    result_array = [['1-Correl']]

    # Profile Optimizer
    with ExitStack() as stack:
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        for name, o in config_cpu.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        result_array = compare_optimizer(
            config, parameters, config_cpu, cpu_parameters, result_array)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'compare_with_cpu.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Compare with CPU Completed.')
    progress(None)
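
compare_optimizer() itself is not shown here; the '1-Correl' header suggests it reports one minus the correlation between corresponding tensors computed in the two contexts. A hypothetical version of that single metric, for illustration only:

import numpy as np


def one_minus_correlation(x, y):
    x = np.asarray(x, dtype=np.float64).ravel()
    y = np.asarray(y, dtype=np.float64).ravel()
    # np.corrcoef returns the 2x2 correlation matrix; take the off-diagonal.
    return 1.0 - np.corrcoef(x, y)[0, 1]


# Identical tensors give 0.0; independent noise gives a value close to 1.0.
a = np.random.randn(1000)
print(one_minus_correlation(a, a))
print(one_minus_correlation(a, np.random.randn(1000)))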