Example #1
0
def main(args=None):
    """Command-line entry point: build the model, then decode, evaluate, align or train.

    The effective configuration is assembled from three layers, by decreasing
    precedence: command-line arguments (only those that are not ``None``),
    the YAML file named by ``args.config``, and ``config/default.yaml``.

    Args:
        args: forwarded unchanged to ``parser.parse_args``.

    Raises:
        AssertionError: if ``steps_per_eval`` is not a multiple of
            ``steps_per_checkpoint``, or if none of the decode / eval /
            align / train actions was requested.
        SystemExit: through ``sys.exit()`` when training is interrupted by
            ``KeyboardInterrupt`` (the model is saved first).
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

    # giving a learning rate on the command line also switches on `reset_learning_rate`
    if args.learning_rate is not None:
        args.reset_learning_rate = True

    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v

    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert args.decode is not None or args.eval or args.train or args.align, (
        'you need to specify at least one action (decode, eval, align, or train)'
    )

    if args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    logger = utils.create_logger(config.log_file if args.train else None)
    logger.setLevel(logging_level)

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        # best effort only: not inside a git work tree, or git is not installed.
        # KeyboardInterrupt / SystemExit are deliberately NOT swallowed here.
        pass
    else:
        utils.log('commit hash {}'.format(commit_hash))

    # list of encoder and decoder parameter names (each encoder and decoder can have a different value
    # for those parameters)
    model_parameters = [
        'cell_size', 'layers', 'vocab_size', 'embedding_size',
        'attention_filters', 'attention_filter_length', 'use_lstm',
        'time_pooling', 'attention_window_size', 'dynamic', 'binary',
        'character_level', 'bidir', 'load_embeddings', 'pooling_avg',
        'swap_memory', 'parallel_iterations', 'input_layers',
        'residual_connections', 'attn_size'
    ]
    # TODO: independent model dir for each task
    task_parameters = [
        'data_dir', 'train_prefix', 'dev_prefix', 'vocab_prefix', 'ratio',
        'lm_file', 'learning_rate', 'learning_rate_decay_factor',
        'max_input_len', 'max_output_len', 'encoders', 'decoder'
    ]

    # in case no task is defined (standard mono-task settings), define a "main" task
    config.setdefault('tasks', [{
        'encoders': config.encoders,
        'decoder': config.decoder,
        'name': 'main',
        'ratio': 1.0
    }])
    config.tasks = [utils.AttrDict(task) for task in config.tasks]

    for task in config.tasks:
        # task-level parameters missing from the task fall back to the base-level value
        for parameter in task_parameters:
            task.setdefault(parameter, config.get(parameter))

        if isinstance(task.dev_prefix,
                      str):  # for back-compatibility with old config files
            task.dev_prefix = [task.dev_prefix]

        # convert dicts to AttrDicts for convenience
        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoder = utils.AttrDict(task.decoder)

        for encoder_or_decoder in task.encoders + [task.decoder]:
            # move parameters all the way up from base level to encoder/decoder level:
            # default values for encoder/decoder parameters can be defined at the task level and base level
            # default values for tasks can be defined at the base level
            for parameter in model_parameters:
                if parameter in encoder_or_decoder:
                    continue
                elif parameter in task:
                    encoder_or_decoder[parameter] = task[parameter]
                else:
                    encoder_or_decoder[parameter] = config.get(parameter)

    # log parameters (model/task parameters are only shown nested inside `tasks`)
    utils.log('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        if k == 'tasks':
            utils.log('  {:<20}\n{}'.format(k, pformat(v)))
        elif k not in model_parameters and k not in task_parameters:
            utils.log('  {:<20} {}'.format(k, pformat(v)))

    # `device` stays None when neither `no_gpu` nor `gpu_id` is set
    device = None
    if config.no_gpu:
        device = '/cpu:0'
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')
        # All parameters except recurrent connexions and attention parameters are initialized with this.
        # Recurrent connexions are initialized with orthogonal matrices, and the parameters of the attention model
        # with a standard deviation of 0.001
        if config.weight_scale:
            initializer = tf.random_normal_initializer(
                stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)
        decode_only = args.decode is not None or args.eval or args.align  # exempt from creating gradient ops
        model = MultiTaskModel(name='main',
                               checkpoint_dir=checkpoint_dir,
                               decode_only=decode_only,
                               **config)

    utils.log('model parameters ({})'.format(len(tf.global_variables())))
    parameter_count = 0
    for var in tf.global_variables():
        utils.log('  {} {}'.format(var.name, var.get_shape()))

        # number of scalars in this variable = product of its dimensions
        v = 1
        for d in var.get_shape():
            v *= d.value
        parameter_count += v
    utils.log('number of parameters: {}'.format(parameter_count))

    tf_config = tf.ConfigProto(log_device_placement=False,
                               allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(checkpoint_dir, 'best')

        if config.ensemble and (args.eval or args.decode is not None):
            # create one session for each model in the ensemble
            # (from here on `sess` is a list of sessions rather than a single session)
            sess = [tf.Session() for _ in config.checkpoints]
            for sess_, checkpoint in zip(sess, config.checkpoints):
                model.initialize(sess_, [checkpoint], reset=True)
        elif (not config.checkpoints
              and (args.eval or args.decode is not None or args.align)
              and os.path.isfile(best_checkpoint + '.index')):
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            # NOTE(review): only the '.index' file is probed; if another checkpoint
            # file layout must be recognised too, add that test here.
            model.initialize(sess, [best_checkpoint], reset=True)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess, **config)

        # Inspect variables:
        # tf.get_variable_scope().reuse_variables()
        # import pdb; pdb.set_trace()
        if args.decode is not None:
            model.decode(sess, **config)
        elif args.eval:
            model.evaluate(sess, on_dev=False, **config)
        elif args.align:
            model.align(sess, **config)
        elif args.train:
            eval_output = os.path.join(config.model_dir, 'eval')
            try:
                model.train(sess, eval_output=eval_output, **config)
            except KeyboardInterrupt:
                # save before exiting when training is interrupted by hand
                utils.log('exiting...')
                model.save(sess)
                sys.exit()
Example #2
0
def main(args=None):
    """Command-line entry point: build the model and run the requested action.

    The effective configuration is assembled from three layers, by decreasing
    precedence: command-line arguments (only those that are not ``None``),
    the YAML file named by ``args.config``, and ``config/default.yaml``.
    When training, the config file, the default config and the ``.py`` files
    of the ``translate`` directory are copied into the model directory if
    they are not already there.

    Exactly one action is run, chosen in this order: save, save embedding,
    decode, eval, align, train.

    Args:
        args: forwarded unchanged to ``parser.parse_args``.

    Raises:
        AssertionError: if ``steps_per_eval`` is not a multiple of
            ``steps_per_checkpoint``, if no action was requested, or if both
            ``average`` and ``ensemble`` are set.
        SystemExit: through ``sys.exit()`` when the action is interrupted by
            ``KeyboardInterrupt``.
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

    # giving a learning rate on the command line also switches on `reset_learning_rate`
    if args.learning_rate is not None:
        args.reset_learning_rate = True

    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v

    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)

    if config.score_function:
        config.score_functions = evaluation.name_mapping[config.score_function]

    if args.crash_test:
        config.max_train_size = 0

    if not config.debug:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable TensorFlow's debugging logs
    decoding_mode = any(arg is not None for arg in (args.decode, args.eval, args.align))

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert decoding_mode or args.train or args.save or args.save_embedding, (
        'you need to specify at least one action (decode, eval, align, or train)')
    assert not (args.average and args.ensemble)

    if args.train and args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if args.train and not os.path.exists(config_path):
        with open(args.config) as config_file, open(config_path, 'w') as dest_file:
            content = config_file.read()
            # rewrite the `model_dir:` line so the copy points at the directory actually used.
            # A callable replacement inserts the text literally: with a replacement *string*,
            # backslashes in the path would be interpreted as regex escape sequences.
            model_dir_line = 'model_dir: {}\n'.format(config.model_dir)
            content = re.sub(r'model_dir:.*?\n', lambda _match: model_dir_line, content,
                             flags=re.MULTILINE)
            dest_file.write(content)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if args.train and not os.path.exists(config_path):
        shutil.copy('config/default.yaml', config_path)

    # copy source code to model directory
    tar_path = os.path.join(config.model_dir, 'code.tar.gz')
    if args.train and not os.path.exists(tar_path):
        with tarfile.open(tar_path, "w:gz") as tar:
            for filename in os.listdir('translate'):
                if filename.endswith('.py'):
                    tar.add(os.path.join('translate', filename), arcname=filename)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n  {}'.format('\n  '.join(config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        # best effort only: not inside a git work tree, or git is not installed.
        # KeyboardInterrupt / SystemExit are deliberately NOT swallowed here.
        pass
    else:
        utils.log('commit hash {}'.format(commit_hash))

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug('  {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    # without an explicit task list, the base config itself acts as the single task
    if config.tasks is not None:
        config.tasks = [utils.AttrDict(task) for task in config.tasks]
        tasks = config.tasks
    else:
        tasks = [config]

    for task in tasks:
        # base-level values are the defaults for each task ...
        for parameter, value in config.items():
            task.setdefault(parameter, value)

        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoders = [utils.AttrDict(decoder) for decoder in task.decoders]

        # ... and task-level values are the defaults for each encoder/decoder
        for encoder_or_decoder in task.encoders + task.decoders:
            for parameter, value in task.items():
                encoder_or_decoder.setdefault(parameter, value)

        # command-line length limits override the first encoder / first decoder only
        if args.max_len:
            args.max_input_len = args.max_len
        if args.max_output_len:   # override decoder's max len
            task.decoders[0].max_len = args.max_output_len
        if args.max_input_len:    # override encoder's max len
            task.encoders[0].max_len = args.max_input_len

    config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

    # setting random seeds (drawn at random when not configured, and logged either way)
    if config.seed is None:
        config.seed = random.randrange(sys.maxsize)
    if config.tf_seed is None:
        config.tf_seed = random.randrange(sys.maxsize)
    utils.log('python random seed: {}'.format(config.seed))
    utils.log('tf random seed:     {}'.format(config.tf_seed))
    random.seed(config.seed)
    tf.set_random_seed(config.tf_seed)

    # `device` is the TensorFlow device string (None when nothing is requested);
    # `device_id` is what gets exported through CUDA_VISIBLE_DEVICES below
    device = None
    if config.no_gpu:
        device = '/cpu:0'
        device_id = None
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)
        device_id = config.gpu_id
    else:
        device_id = 0

    # hide other GPUs so that TensorFlow won't use memory on them
    # NOTE(review): `device` still names the original `gpu_id` after the other GPUs are
    # hidden; presumably `allow_soft_placement` below covers any mismatch - confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = '' if device_id is None else str(device_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        if config.weight_scale:
            if config.initializer == 'uniform':
                initializer = tf.random_uniform_initializer(minval=-config.weight_scale, maxval=config.weight_scale)
            else:
                initializer = tf.random_normal_initializer(stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)

        # exempt from creating gradient ops
        config.decode_only = decoding_mode

        if config.tasks is not None:
            model = MultiTaskModel(**config)
        else:
            model = TranslationModel(**config)

    # count parameters
    # not counting parameters created by training algorithm (e.g. Adam)
    variables = [var for var in tf.global_variables() if not var.name.startswith('gradients')]
    utils.log('model parameters ({})'.format(len(variables)))
    parameter_count = 0
    for var in sorted(variables, key=lambda var: var.name):
        utils.log('  {} {}'.format(var.name, var.get_shape()))
        # number of scalars in this variable = product of its dimensions
        v = 1
        for d in var.get_shape():
            v *= d.value
        parameter_count += v
    utils.log('number of parameters: {:.2f}M'.format(parameter_count / 1e6))

    tf_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    def average_checkpoints(main_sess, sessions):
        """Assign to each global variable in `main_sess` its mean value over `sessions`."""
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')

        params = {'variable_mapping': config.variable_mapping, 'reverse_mapping': config.reverse_mapping,
                  'rnn_lm_model_dir': None, 'rnn_mt_model_dir': None,
                  'rnn_lm_cell_name': None, 'origin_model_ckpt': None}
        if config.ensemble and len(config.checkpoints) > 1:
            model.initialize(config.checkpoints, **params)
        elif config.average and len(config.checkpoints) > 1:
            model.initialize(reset=True)
            # load each checkpoint in its own session, then average them into `sess`
            # NOTE(review): these extra sessions are never closed in this function.
            sessions = [tf.Session(config=tf_config) for _ in config.checkpoints]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess=sess_, checkpoints=[checkpoint], **params)
            average_checkpoints(sess, sessions)
        elif (not config.checkpoints and decoding_mode and
              os.path.isfile(best_checkpoint + '.index')):
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            # NOTE(review): only the '.index' file is probed; if another checkpoint
            # file layout must be recognised too, add that test here.
            model.initialize([best_checkpoint], **params)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(**config)

        # make sure the directory of the output file exists
        if config.output is not None:
            dirname = os.path.dirname(config.output)
            if dirname:
                os.makedirs(dirname, exist_ok=True)

        try:
            if args.save:
                model.save()
            elif args.save_embedding:
                if config.embedding_output_dir is None:
                    output_dir = "."
                else:
                    output_dir = config.embedding_output_dir
                model.save_embedding(output_dir)
            elif args.decode is not None:
                if config.align is not None:
                    config.align = True
                model.decode(**config)
            elif args.eval is not None:
                model.evaluate(on_dev=False, **config)
            elif args.align is not None:
                model.align(**config)
            elif args.train:
                model.train(**config)
        except KeyboardInterrupt:
            sys.exit()
Example #3
0
def main(args=None):
    """Command-line entry point: build the model, then decode, evaluate, align or train.

    The effective configuration is assembled from three layers, by decreasing
    precedence: command-line arguments (only those that are not ``None``),
    the YAML file named by ``args.config``, and ``config/default.yaml``.
    The config file, the default config and the ``.py`` files of the
    ``translate`` directory are copied into the model directory if they are
    not already there.

    Args:
        args: forwarded unchanged to ``parser.parse_args``.

    Raises:
        AssertionError: if ``steps_per_eval`` is not a multiple of
            ``steps_per_checkpoint``, if none of the decode / eval / align /
            train actions was requested, or if both ``avg_checkpoints`` and
            ``ensemble`` are set.
        SystemExit: through ``sys.exit()`` when training stops on
            ``KeyboardInterrupt`` or ``utils.FinishedTrainingException``
            (the model is saved first).
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

    # giving a learning rate on the command line also switches on `reset_learning_rate`
    if args.learning_rate is not None:
        args.reset_learning_rate = True

    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v

    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert args.decode is not None or args.eval or args.train or args.align, (
        'you need to specify at least one action (decode, eval, align, or train)'
    )
    assert not (args.avg_checkpoints and args.ensemble)

    if args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if not os.path.exists(config_path):
        shutil.copy(args.config, config_path)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if not os.path.exists(config_path):
        shutil.copy('config/default.yaml', config_path)

    # copy source code to model directory
    tar_path = os.path.join(config.model_dir, 'code.tar.gz')
    if not os.path.exists(tar_path):
        with tarfile.open(tar_path, "w:gz") as tar:
            for filename in os.listdir('translate'):
                if filename.endswith('.py'):
                    tar.add(os.path.join('translate', filename),
                            arcname=filename)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n  {}'.format('\n  '.join(
        config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        # best effort only: not inside a git work tree, or git is not installed.
        # KeyboardInterrupt / SystemExit are deliberately NOT swallowed here.
        pass
    else:
        utils.log('commit hash {}'.format(commit_hash))

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug('  {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    # without an explicit task list, the base config itself acts as the single task
    if config.tasks is not None:
        config.tasks = [utils.AttrDict(task) for task in config.tasks]
        tasks = config.tasks
    else:
        tasks = [config]

    for task in tasks:
        # base-level values are the defaults for each task ...
        for parameter, value in config.items():
            task.setdefault(parameter, value)

        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoders = [utils.AttrDict(decoder) for decoder in task.decoders]

        # ... and task-level values are the defaults for each encoder/decoder
        for encoder_or_decoder in task.encoders + task.decoders:
            for parameter, value in task.items():
                encoder_or_decoder.setdefault(parameter, value)

    # `device` stays None when neither `no_gpu` nor `gpu_id` is set
    device = None
    if config.no_gpu:
        device = '/cpu:0'
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

        if config.weight_scale:
            if config.initializer == 'uniform':
                initializer = tf.random_uniform_initializer(
                    minval=-config.weight_scale, maxval=config.weight_scale)
            else:
                initializer = tf.random_normal_initializer(
                    stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)

        config.decode_only = args.decode is not None or args.eval or args.align  # exempt from creating gradient ops

        if config.tasks is not None:
            model = MultiTaskModel(**config)
        else:
            model = TranslationModel(**config)

    # count parameters
    utils.log('model parameters ({})'.format(len(tf.global_variables())))
    parameter_count = 0
    for var in tf.global_variables():
        utils.log('  {} {}'.format(var.name, var.get_shape()))

        if not var.name.startswith(
                'gradients'
        ):  # not counting parameters created by training algorithm (e.g. Adam)
            # number of scalars in this variable = product of its dimensions
            v = 1
            for d in var.get_shape():
                v *= d.value
            parameter_count += v
    utils.log('number of parameters: {:.2f}M'.format(parameter_count / 1e6))

    tf_config = tf.ConfigProto(log_device_placement=False,
                               allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    def average_checkpoints(main_sess, sessions):
        """Assign to each global variable in `main_sess` its mean value over `sessions`."""
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')

        if ((config.ensemble or config.avg_checkpoints)
                and (args.eval or args.decode is not None)
                and len(config.checkpoints) > 1):
            # create one session for each model in the ensemble
            sessions = [tf.Session() for _ in config.checkpoints]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess_, [checkpoint])

            if config.ensemble:
                # from here on `sess` is the list of per-checkpoint sessions
                sess = sessions
            else:
                # averaging: the first session receives the mean of all sessions' variables
                sess = sessions[0]
                average_checkpoints(sess, sessions)
        elif (not config.checkpoints
              and (args.eval or args.decode is not None or args.align)
              and os.path.isfile(best_checkpoint + '.index')):
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint
            # NOTE(review): only the '.index' file is probed; if another checkpoint
            # file layout must be recognised too, add that test here.
            model.initialize(sess, [best_checkpoint])
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess, **config)

        if args.decode is not None:
            model.decode(sess, **config)
        elif args.eval:
            model.evaluate(sess, on_dev=False, **config)
        elif args.align:
            model.align(sess, **config)
        elif args.train:
            try:
                model.train(sess=sess, **config)
            except (KeyboardInterrupt, utils.FinishedTrainingException):
                # save before exiting, whether interrupted by hand or finished
                utils.log('exiting...')
                model.save(sess)
                sys.exit()