Example #1
    def modify(self, mods, inputs=None, outputs=None):
        """
        Applies modifications to the model. A modified configuration is created, the corresponding model is built,
        and the weights of the current model are copied over where possible.

        mods:    a list of dictionaries. Each dictionary has a single key: either 'delete' (the value, a config path,
                 is removed from the configuration) or a config path (the value at that path is replaced by the dictionary value).
        inputs:  a list with the names of the inputs
        outputs: a list with the names of the outputs
        """
        model_weights = self.get_weights()
        yaml_loader = YAML()
        m_conf = Config(self.core_model.processed_config)
        original_keys = list(m_conf.keys())
        deep_conf = Config(shallow_to_deep(m_conf))
        for mod in mods:
            mod_key = list(mod.keys())[0]
            mod_value = mod[mod_key]
            if mod_key == 'delete':
                deep_conf.pop(mod_value)
                if mod_value in original_keys:
                    original_keys.remove(mod_value)
            elif '*' in mod_key:
                mod_key = mod_key.lstrip('/')
                found_paths = [
                    k for k in deep_conf.to_shallow().keys()
                    if fnmatch.fnmatch(k, mod_key)
                ]
                for k in found_paths:
                    k = k.replace('.', '/')
                    if isinstance(mod_value, str):
                        deep_conf[k] = yaml_loader.load(mod_value)
                    else:
                        deep_conf[k] = mod_value
            else:
                mod_key = mod_key.replace('.', '/')
                if mod_key.split('/')[0] not in deep_conf.keys(
                ):  #This means we are adding a new layer
                    layer_name = mod_key.split('/')[0]
                    original_keys.append(layer_name)
                    deep_conf['{}/name'.format(layer_name)] = layer_name
                if isinstance(mod_value, str):
                    deep_conf[mod_key] = yaml_loader.load(mod_value)
                else:
                    deep_conf[mod_key] = mod_value
        new_model_architecture = shallow_to_original_keys(
            deep_conf.to_shallow(), original_keys)
        model = self.build(processed_config=new_model_architecture,
                           input_names=inputs,
                           output_names=outputs)
        layer_names = [l.name for l in model.layers]
        for k, v in model_weights.items():
            if k in layer_names:
                layer = model.get_layer(k)
                layer.set_weights(v)
        self.core_model.model = model
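A minimal sketch of how such a mods list might be passed, assuming a built dienen model instance named model (the layer names, config paths and values below are hypothetical):

# Each dictionary has a single key: either 'delete' or a config path.
mods = [
    {'delete': 'dense_2'},                  # remove the layer 'dense_2' from the configuration
    {'dense_1/units': '256'},               # string values are parsed with YAML before insertion
    {'*/activation': 'relu'},               # wildcard paths are expanded with fnmatch
    {'new_output/class': 'Dense'},          # an unknown top-level key creates a new layer
    {'new_output/input': 'dense_1'},
]
model.modify(mods, inputs=['input_1'], outputs=['new_output'])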
Example #2
def insert_yaml_value(config, special_tags, global_config, default_config,
                      missing_paths):
    found_paths = config.find_path(symbols['insert_config'], mode='startswith')
    #,action=lambda x: process_config(Config(x.split(symbols['insert_config'])[-1],special_tags=special_tags),special_tags=special_tags,global_config=global_config)
    for path in found_paths:
        tag_data = config[path]
        insert_yaml_path = tag_data.split(symbols['insert_config'])[-1]
        insert_config = Config(insert_yaml_path, yaml_tags=special_tags)
        global_config.update(insert_config.get('global', {}))
        if 'defaults' in insert_config:
            default_config.update(insert_config.pop('defaults'))
        insert_config = process_config(insert_config, special_tags,
                                       global_config, default_config,
                                       missing_paths)
        config[path] = insert_config
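As a hedged illustration of the behaviour (the '!yaml' stand-in and the file path are invented; the real tag string is whatever symbols['insert_config'] defines):

INSERT = '!yaml'  # stand-in for symbols['insert_config']

config = Config({'features': INSERT + ' configs/features.yaml'})
global_config, default_config, missing_paths = {}, {}, []
insert_yaml_value(config, special_tags=[], global_config=global_config,
                  default_config=default_config, missing_paths=missing_paths)
# config['features'] now holds the processed Config loaded from configs/features.yaml,
# with its 'global' and 'defaults' sections merged into global_config / default_config.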
Example #3
def replace_yamls(main_config, special_tags):
    main_config.find_path(
        symbols['insert_config'],
        mode='startswith',
        action=lambda x: Config(x.split(symbols['insert_config'])[-1],
                                special_tags=special_tags))
    return main_config
Example #4
    def _serialize_model(self, save_optimizer=False, extra_data=None):
        model_output = {}
        model_output['weights'] = self.get_weights()
        original_config = Config(self.original_config)
        for p in original_config.all_paths():
            if type(original_config[p]).__name__ == 'BatchGenerator':
                original_config[p] = original_config[p].data_processor_config
        model_output['original_config'] = original_config
        if self.architecture_config:
            model_output['hierarchy'] = self.architecture_config.hierarchy
        if self.core_model:
            model_output['unfolded_config'] = self.core_model.processed_config
            if save_optimizer:
                model_output['optimizer_state'] = self.get_optimizer()
            if extra_data:
                model_output.update(extra_data)
        model_output['input_shapes'] = self.input_shapes
        model_output['output_shapes'] = self.output_shapes

        return model_output
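_serialize_model is a private helper, but the dictionary it returns is plain Python data plus weights, so it can be persisted with any serializer. A minimal sketch, assuming a built model instance (the file name is arbitrary; joblib is also what Example #9 uses to load such files):

import joblib

model_data = model._serialize_model(save_optimizer=True)
joblib.dump(model_data, 'model.dnn')

restored = joblib.load('model.dnn')
weights = restored['weights']               # layer name -> list of weight arrays
architecture = restored['unfolded_config']  # processed layer configuration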
Example #5
File: core.py Project: mrpep/paips
    def __make_hash_dict(self):
        """
        Creates a dictionary to hash the task. Parameters that are TaskIOs get replaced by their hash
        """
        self._hash_dict = copy.deepcopy(self.parameters)
        #Remove non-cacheable parameters
        if not isinstance(self._hash_dict, Config):
            self._hash_dict = Config(self._hash_dict)
        if not isinstance(self.parameters, Config):
            self.parameters = Config(self.parameters)

        _ = self._hash_dict.find_path(symbols['nocache'],
                                      mode='startswith',
                                      action='remove_value')
        _ = self.parameters.find_path(symbols['nocache'],
                                      mode='startswith',
                                      action='remove_substring')

        for k, v in self._hash_dict.to_shallow().items():
            if isinstance(v, TaskIO):
                self._hash_dict[k] = self._hash_dict[k].get_hash()
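For intuition, a hedged sketch of the replacement step (some_large_array, the parameter names and the 'abc123' hash are placeholders):

# In the hash dict, a TaskIO value is replaced by its hash, so the task hash
# depends on the upstream output's identity rather than on its actual data.
upstream = TaskIO(some_large_array, 'abc123', iotype='data', name='out')
parameters = Config({'epochs': 10, 'dataset': upstream})
# After __make_hash_dict() runs on a task built with these parameters, the shallow
# view of self._hash_dict is roughly {'epochs': 10, 'dataset': 'abc123'}.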
Example #6
def include_config(config, special_tags, global_config, default_config,
                   missing_paths):
    found_paths = config.find_keys('include')

    for p in found_paths:
        includes = config[p]
        switch = None
        if isinstance(includes, dict):
            switch = includes.get('switch', None)
        if switch is not None:
            if not isinstance(switch, list):
                switch = [switch]
            filtered_includes = []
            for include_config in includes['configs']:
                if include_config.get('name', None) in switch:
                    filtered_includes.append(include_config)
            includes = filtered_includes

        for include_config in includes:
            if include_config.get('enable', True) and include_config.get(
                    'config', None):
                path_yaml_to_include = Path(config.yaml_path.parent,
                                            include_config.pop('config'))

                imported_config = Config(path_yaml_to_include,
                                         yaml_tags=special_tags)
                if 'defaults' in imported_config:
                    default_config.update(imported_config.pop('defaults'))
                mods = include_config.get('mods', None)
                for r, v in include_config.items():
                    r = '({})'.format(r)
                    imported_config = replace_in_config(imported_config, r, v)
                if '/' in p:
                    p_parent = '/'.join(p.split('/')[:-1])
                else:
                    p_parent = None
                imported_config = process_config(imported_config, special_tags,
                                                 global_config, default_config,
                                                 missing_paths)
                if mods:
                    apply_mods(mods, imported_config)
                if p_parent:
                    p_config = Config(config[p_parent])
                    p_config.yaml_path = config.yaml_path
                    new_config = merge_configs([p_config, imported_config])
                    config[p_parent] = new_config
                else:
                    original_yaml_path = config.yaml_path
                    config = merge_configs([Config(config), imported_config])
                    config.yaml_path = original_yaml_path
        config.pop(p)
    return config
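A hedged example of the kind of 'include' block this function consumes, written as the equivalent Python structure (file names, task names and the mod string are invented; the keys mirror the ones read above):

include_block = {
    'include': {
        'switch': 'baseline',   # only entries whose 'name' is in switch are kept
        'configs': [
            {'name': 'baseline',
             'config': 'models/baseline.yaml',      # path relative to the including YAML
             'mods': 'Model/epochs=50',             # applied to the imported config
             'dataset': 'librispeech'},             # any other key is substituted via the (key) pattern
            {'name': 'large', 'config': 'models/large.yaml', 'enable': False},
        ],
    }
}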
Example #7
def apply_mods(modstr, config):
    yaml = YAML()
    if modstr is not None:
        if isinstance(modstr, str):
            mods = modstr.split('&')
            for mod in mods:
                if '=' in mod:
                    mod_parts = mod.split('=')
                    mod_k = '='.join(mod_parts[:-1])
                    mod_v = mod_parts[-1]
                    #if mod_parts[1].startswith('['):
                    if '!' in mod_v:
                        config[mod_k] = mod_v
                    #elif mod_parts[1].lower() == 'null':
                    #    config[mod_parts[0]] = None
                    else:
                        config[mod_k] = yaml.load(mod_v)

        elif isinstance(modstr, list):
            for mod in modstr:
                config.update(Config(mod).to_shallow())
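A short usage sketch (paths and values are invented, and the Config import assumes kahnfigh exposes it at package level):

from kahnfigh import Config  # assumption: Config comes from the kahnfigh package

config = Config({'train': {'epochs': 10, 'lr': 0.01}})

# String form: mods are split on '&' and '=', and each value is parsed with YAML
# unless it contains a '!' tag, in which case it is stored verbatim.
apply_mods('train/epochs=50&train/lr=0.001', config)

# List form: each element is turned into a Config and merged in shallow form.
apply_mods([{'train/batch_size': 32}], config)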
Example #8
def get_config(filename):
    config = Config(filename, safe=False)
    return config
Example #9
def external_unfold(name, config, metadata=None, logger=None):
    external_models = metadata['externals']['Models']
    external_model_name = config['model']
    external_layer_name = config.get('layer', None)
    external_last_layer = config.get('up_to', None)
    external_first_layer = config.get('from', None)
    external_exclude_inputs = config.get('exclude_input', True)
    external_reset_weights = config.get('reset_weights', False)
    external_mods = config.get('mods', None)
    external_time_distributed = config.get('time_distributed', False)

    trainable_from = config.get('trainable_from', None)
    trainable_layers = config.get('trainable_layers', None)
    trainable = config.get('trainable', True)
    training_flag = config.get('training', False)

    import dienen

    if isinstance(external_models[external_model_name], str):
        external_model = joblib.load(external_models[external_model_name])
        if isinstance(external_model, dict):
            external_model_architecture = external_model['unfolded_config']
            external_hierarchy = external_model['hierarchy']
        elif isinstance(external_model, dienen.core.model.Model):
            external_model_architecture = external_model.core_model.processed_config
            external_hierarchy = external_model.architecture_config.hierarchy
    elif isinstance(external_models[external_model_name],
                    dienen.core.model.Model):
        external_model = external_models[external_model_name]
        external_model_architecture = external_model.core_model.processed_config
        external_hierarchy = external_model.architecture_config.hierarchy

    if external_mods:
        import fnmatch
        yaml_loader = YAML()
        m_conf = Config(external_model_architecture)
        original_keys = list(m_conf.keys())
        deep_conf = Config(shallow_to_deep(m_conf))
        for mod in external_mods:
            mod_key = list(mod.keys())[0]
            mod_value = mod[mod_key]
            if mod_key == 'delete':
                deep_conf.pop(mod_value)
                if mod_value in original_keys:
                    original_keys.remove(mod_value)
            elif '*' in mod_key:
                mod_key = mod_key.lstrip('/')
                found_paths = [
                    k for k in deep_conf.to_shallow().keys()
                    if fnmatch.fnmatch(k, mod_key)
                ]
                for k in found_paths:
                    k = k.replace('.', '/')
                    if isinstance(mod_value, str):
                        deep_conf[k] = yaml_loader.load(mod_value)
                    else:
                        deep_conf[k] = mod_value
            else:
                mod_key = mod_key.replace('.', '/')
                if mod_key.split('/')[0] not in deep_conf.keys(
                ):  #This means we are adding a new layer
                    layer_name = mod_key.split('/')[0]
                    original_keys.append(layer_name)
                    deep_conf['{}/name'.format(layer_name)] = layer_name
                if isinstance(mod_value, str):
                    deep_conf[mod_key] = yaml_loader.load(mod_value)
                else:
                    deep_conf[mod_key] = mod_value

        external_model_architecture = shallow_to_original_keys(
            deep_conf.to_shallow(), original_keys)
    unfolded_layers = []

    g = nx.DiGraph()
    for layer_name, layer_config in external_model_architecture.items():
        if layer_config['class'] != 'Input':
            if isinstance(layer_config['input'], list):
                for k in layer_config['input']:
                    g.add_edge(k, layer_name)
            else:
                g.add_edge(layer_config['input'], layer_name)

    if external_layer_name and external_layer_name not in g.nodes(
    ) and external_layer_name in external_hierarchy:
        external_layer_name = external_hierarchy[external_layer_name][
            'output'][0]
    if external_last_layer and external_last_layer not in g.nodes(
    ) and external_last_layer in external_hierarchy:
        external_last_layer = external_hierarchy[external_last_layer][
            'output'][0]
    if external_first_layer and external_first_layer not in g.nodes(
    ) and external_first_layer in external_hierarchy:
        external_first_layer = external_hierarchy[external_first_layer][
            'input'][0]

    if external_last_layer and not external_first_layer and not external_layer_name:
        layers_subset = list(nx.ancestors(
            g, external_last_layer)) + [external_last_layer]
    elif external_first_layer and not external_last_layer and not external_layer_name:
        layers_subset = nx.dfs_successors(
            g, external_first_layer)[external_first_layer] + [
                external_first_layer
            ]
    elif external_first_layer and external_last_layer and not external_layer_name:
        after_from = set(nx.dfs_successors(g, external_first_layer).keys())
        before_to = set(nx.ancestors(g, external_last_layer))
        layers_subset = list(
            after_from.intersection(before_to)) + [external_last_layer]
    elif external_layer_name:
        layers_subset = [external_layer_name]
    else:
        layers_subset = list(external_model_architecture.keys())
    if external_exclude_inputs:
        layers_subset = [
            layer for layer in layers_subset
            if external_model_architecture[layer]['class'] != 'Input'
        ]

    unfolded_layers = [external_model_architecture[l] for l in layers_subset]

    #if len(unfolded_layers) == 1:
    #    in_layers = [name]
    #else:
    in_layers = []
    for l in unfolded_layers:
        ins = l['input']
        if not isinstance(ins, list):
            ins = [ins]
        for x in ins:
            if x not in layers_subset:
                in_layers.append(l['name'])

    #in_layers = [l['name'] for l in unfolded_layers if l['input'] not in layers_subset]
    new_nodes = [{
        layer['name']: pop_dictreturn(layer, 'name')
    } for layer in unfolded_layers]
    new_config, hierarchy = new_nodes_to_config(new_nodes, name)

    if 'input' in config:
        for layer in in_layers:
            new_config[layer]['input'] = config['input']

    if trainable and not trainable_from and not trainable_layers:
        trainable_layers = [
            layer_name for layer_name, layer in new_config.items()
            if layer['class'] != 'Input'
        ]
    elif trainable_from:
        if trainable_from not in external_model_architecture and trainable_from in external_hierarchy:
            trainable_from = external_hierarchy[trainable_from]['inputs'][0]
        trainable_layers = nx.dfs_successors(g, trainable_from)
    elif not trainable:
        trainable_layers = []

    #Set trainable false in non-trainable layers
    for layer_name, layer_config in new_config.items():
        if layer_name not in trainable_layers:
            layer_config['trainable'] = False
            layer_config[
                'training'] = training_flag  #This is to avoid problems with BN accumulated statistics
        else:
            layer_config['trainable'] = True

    if external_time_distributed:
        for layer_name, layer_config in new_config.items():
            layer_config['time_distributed'] = True

    #Make each layer search for the weights from external model
    if isinstance(external_reset_weights, bool) and not external_reset_weights:
        external_weight_layers = [
            layer_name for layer_name, layer in new_config.items()
            if layer['class'] != 'Input'
        ]
    elif isinstance(external_reset_weights, list):
        external_weight_layers = external_reset_weights

    if not config.get('reset_weights', False):
        for layer in external_weight_layers:
            new_config[layer]['from_model'] = external_model_name
            new_config[layer]['from_layer'] = layer

    return new_config, hierarchy
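A hedged sketch of a layer entry that external_unfold could consume (all names are invented, and metadata['externals']['Models'] is assumed to already map 'pretrained_encoder' to a saved model path or a dienen Model):

unfold_config = {
    'model': 'pretrained_encoder',           # key into metadata['externals']['Models']
    'up_to': 'encoder_output',               # unfold this layer and all of its ancestors
    'exclude_input': True,                   # drop the external model's Input layers
    'trainable': False,                      # freeze every unfolded layer
    'input': 'spectrogram',                  # rewire the unfolded subgraph to this layer
    'mods': [{'encoder_conv1/filters': '64'}],
}
new_config, hierarchy = external_unfold('encoder', unfold_config, metadata=metadata)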
Example #10
def load_experiment(configs, mods=None, global_config=None, logger=None):
    #Get main config
    #By default, yaml treats tokens starting with ! as custom tags; here we want to use them more generally, even inside dictionaries.
    #To avoid raising exceptions, an ignorable tag is created which returns the string unchanged for later processing

    ignorable_tags = [
        v.strip() for k, v in symbols.items() if v.startswith('!')
    ]
    special_tags = [IgnorableTag(tag) for tag in ignorable_tags]

    configs = [Config(path_i, yaml_tags=special_tags) for path_i in configs]
    #main_config = merge_configs(configs)
    main_config = configs[0]
    apply_mods(mods, main_config)

    if global_config is None:
        global_config = {}
    global_config.update(main_config.get('global', {}))
    default_config = main_config.get('defaults', {})

    if 'global' in main_config:
        main_config['global'].update(global_config)
    else:
        main_config['global'] = global_config

    #Config processing/merging/expanding
    missing_paths = []
    main_config = process_config(main_config, special_tags, global_config,
                                 default_config, missing_paths)
    n_tries = 20

    while n_tries > 0 and len(missing_paths) > 0:
        n_tries -= 1
        global_config.update(main_config['global'])
        default_config.update(main_config.get('default', {}))
        missing_paths = []
        main_config = process_config(main_config, special_tags, global_config,
                                     default_config, missing_paths)

    if len(missing_paths) > 0:
        print('Warning: Cannot resolve tags {}'.format(missing_paths))
        for k in missing_paths:
            global_config[k] = None
        missing_paths = []
        main_config = process_config(main_config, special_tags, global_config,
                                     default_config, missing_paths)

    default_cluster_config = {'manager': None, 'n_cores': 1, 'niceness': 20}

    cluster_config = main_config.get('cluster_config', default_cluster_config)
    main_config['cluster_config'] = cluster_config
    main_config['global_config'] = global_config

    #For every task with a variable we want to loop over,
    #we find the tag and create a 'parallel' parameter which holds the names
    #of the loopable params, and add a '!nocache' so that it is not cached

    parallel_paths = main_config.find_path(symbols['distributed-pool'],
                                           mode='startswith',
                                           action='remove_substring')
    parallel_paths = [p for p in parallel_paths if not p.startswith('global')]
    parallel_paths = [(task_parameters_level_from_path(p),
                       p.split(task_parameters_level_from_path(p) + '/')[-1])
                      for p in parallel_paths]

    parallel_paths_async = [
        k for k in list(main_config.all_paths())
        if k.endswith('async') and main_config[k] == True
    ]
    parallel_paths_async = [
        p for p in parallel_paths_async if not p.startswith('global')
    ]
    parallel_paths_async = [
        (task_parameters_level_from_path(p),
         p.split(task_parameters_level_from_path(p) + '/')[-1])
        for p in parallel_paths_async
    ]

    parallel_paths_ = {}

    for p in parallel_paths_async:
        main_config[p[0] + '/niceness'] = cluster_config.get('niceness', 20)
    for p in parallel_paths:
        path = p[0] + '/parallel'
        if path not in parallel_paths_:
            parallel_paths_[path] = [p[1]]
        else:
            parallel_paths_[path].append(p[1])
        if 'n_cores' not in main_config[p[0]]:
            main_config[p[0] + '/n_cores'] = cluster_config['n_cores']
        if 'niceness' not in main_config[p[0]]:
            main_config[p[0] + '/niceness'] = cluster_config.get(
                'niceness', 20)

    map_paths = main_config.find_path(symbols['serial-map'],
                                      mode='startswith',
                                      action='remove_substring')
    map_paths = [p for p in map_paths if not p.startswith('global')]
    map_paths = [(task_parameters_level_from_path(p),
                  p.split(task_parameters_level_from_path(p) + '/')[-1])
                 for p in map_paths]
    map_paths_ = {}

    for p in map_paths:
        path = p[0] + '/map_vars'
        if path not in map_paths_:
            map_paths_[path] = [p[1]]
        else:
            map_paths_[path].append(p[1])

    yaml = YAML()
    for k, v in parallel_paths_.items():
        v_yaml_stream = StringIO()
        yaml.dump(v, v_yaml_stream)
        parallel_paths_[k] = symbols['nocache'] + ' ' + str(v)
        v_yaml_stream.close()

    for k, v in map_paths_.items():
        v_yaml_stream = StringIO()
        yaml.dump(v, v_yaml_stream)
        map_paths_[k] = symbols['nocache'] + ' ' + str(v)
        v_yaml_stream.close()

    main_config.update(parallel_paths_)
    main_config.update(map_paths_)

    #main_task = TaskGraph(main_config,global_config,name='MainTask',logger=paips_logger)

    return main_config
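A minimal usage sketch (the file name, the mod string and the seed value are invented):

main_config = load_experiment(['experiments/train.yaml'],
                              mods='TrainModel/epochs=100',
                              global_config={'seed': 1234})
# main_config is the fully processed Config, with 'global', 'cluster_config' and the
# generated 'parallel'/'map_vars' entries resolved, ready to be turned into a task graph.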
Example #11
File: core.py Project: mrpep/paips
    def __init__(self,
                 parameters,
                 global_parameters=None,
                 name=None,
                 logger=None,
                 simulate=False):
        """
        parameters: dictionary with all parameters given to a task
        global_parameters: dictionary with parameters common to all tasks
        name: task name
        logger: task logger
        simulate: if True, the task won't get executed
        """
        self.global_parameters = {
            'cache': True,
            'cache_path': 'cache',
            'cache_compression': 0,
            'output_path': 'experiments',
            'overwrite_export': True
        }

        if global_parameters:
            self.global_parameters.update(global_parameters)

        if not GenericFile(self.global_parameters['output_path']).exists():
            GenericFile(
                self.global_parameters['output_path']).mkdir(parents=True)

        self.name = name
        self.valid_args = []
        self.default_no_cache = []

        self.parameters = parameters

        self.simulate = simulate

        self.output_names = self.parameters.pop('output_names', ['out'])
        self.cache = get_delete_param(self.parameters, 'cache',
                                      self.global_parameters['cache'])
        self.in_memory = get_delete_param(self.parameters, 'in_memory',
                                          self.global_parameters['in_memory'])

        self.dependencies = []
        self.logger = logger

        if 'mods' in self.parameters:
            apply_mods(self.parameters['mods'], Config(self.parameters))
            self.parameters.pop('mods')

        self.__make_hash_dict()
        self.initial_parameters = copy.deepcopy(self.parameters)

        self.export_path = Path(self.global_parameters.get('output_path'),
                                self.name)
        self.export = self.parameters.get('export', False)
        self.symlinkdb_path = Path(self.global_parameters.get('output_path'),
                                   'links.txt')

        fname = GenericFile(self.global_parameters['output_path'], 'configs',
                            '{}.yaml'.format(self.name))
        self.parameters.save(Path(fname.local_filename), mode='unsafe')
        if fname.filesystem == 's3':
            fname.upload_from(fname.local_filename)
Example #12
File: core.py Project: mrpep/paips
class Task():
    def __init__(self,
                 parameters,
                 global_parameters=None,
                 name=None,
                 logger=None,
                 simulate=False):
        """
        parameters: dictionary with all parameters given to a task
        global_parameters: dictionary with parameters common to all tasks
        name: task name
        logger: task logger
        simulate: if True, the task won't get executed
        """
        self.global_parameters = {
            'cache': True,
            'cache_path': 'cache',
            'cache_compression': 0,
            'output_path': 'experiments',
            'overwrite_export': True
        }

        if global_parameters:
            self.global_parameters.update(global_parameters)

        if not GenericFile(self.global_parameters['output_path']).exists():
            GenericFile(
                self.global_parameters['output_path']).mkdir(parents=True)

        self.name = name
        self.valid_args = []
        self.default_no_cache = []

        self.parameters = parameters

        self.simulate = simulate

        self.output_names = self.parameters.pop('output_names', ['out'])
        self.cache = get_delete_param(self.parameters, 'cache',
                                      self.global_parameters['cache'])
        self.in_memory = get_delete_param(self.parameters, 'in_memory',
                                          self.global_parameters['in_memory'])

        self.dependencies = []
        self.logger = logger

        if 'mods' in self.parameters:
            apply_mods(self.parameters['mods'], Config(self.parameters))
            self.parameters.pop('mods')

        self.__make_hash_dict()
        self.initial_parameters = copy.deepcopy(self.parameters)

        self.export_path = Path(self.global_parameters.get('output_path'),
                                self.name)
        self.export = self.parameters.get('export', False)
        self.symlinkdb_path = Path(self.global_parameters.get('output_path'),
                                   'links.txt')

        fname = GenericFile(self.global_parameters['output_path'], 'configs',
                            '{}.yaml'.format(self.name))
        self.parameters.save(Path(fname.local_filename), mode='unsafe')
        if fname.filesystem == 's3':
            fname.upload_from(fname.local_filename)

    def __make_hash_dict(self):
        """
        Creates a dictionary to hash the task. Parameters that are TaskIOs get replaced by their hash
        """
        self._hash_dict = copy.deepcopy(self.parameters)
        #Remove non-cacheable parameters
        if not isinstance(self._hash_dict, Config):
            self._hash_dict = Config(self._hash_dict)
        if not isinstance(self.parameters, Config):
            self.parameters = Config(self.parameters)

        _ = self._hash_dict.find_path(symbols['nocache'],
                                      mode='startswith',
                                      action='remove_value')
        _ = self.parameters.find_path(symbols['nocache'],
                                      mode='startswith',
                                      action='remove_substring')

        for k, v in self._hash_dict.to_shallow().items():
            if isinstance(v, TaskIO):
                self._hash_dict[k] = self._hash_dict[k].get_hash()

    def search_dependencies(self):
        """
        Finds all the tasks needed to run this task by searching for the -> symbol in its config.
        """
        stop_propagate_dot = self.parameters.get('stop_propagate_dot', None)
        dependency_paths = self.parameters.find_path(symbols['dot'],
                                                     mode='contains')
        #dependency_paths = [p for p in dependency_paths if 'Tasks' not in ]
        if self.__class__.__name__ == 'TaskGraph':
            dependency_paths = [
                p for p in dependency_paths
                if ('Tasks' not in p) and (not p.startswith('outputs'))
            ]
        #This is because dienen also uses the -> symbol, so any occurrences found there must be ignored.
        if stop_propagate_dot:
            dependency_paths = [
                p for p in dependency_paths
                if not p.startswith(stop_propagate_dot)
            ]

        self._dependencies = [
            self.parameters[path].split(symbols['dot'])[0]
            for path in dependency_paths
        ]
        self._dependencies = [d for d in self._dependencies if d != 'self']

        return self._dependencies

    def reset_task_state(self):
        """
        Returns the task to its initial parameters and hash_dict
        """
        self.parameters = copy.deepcopy(self.initial_parameters)
        self.__make_hash_dict()

    def __check_valid_args(self):
        """
        Each task can have a valid_args list which lists the allowed parameters
        """
        for k in self.parameters.keys():
            if k not in self.valid_args:
                raise Exception(
                    '{} not recognized as a valid parameter'.format(k))

    def send_dependency_data(self, data):
        """
        Replaces TaskIOs in parameters with the corresponding data and adds their associated hashes to the hash dictionary.
        """
        glob_keys = self.parameters.find_path(
            '*',
            mode='contains',
            action=lambda x: fnmatch.filter(list(data.keys()), x)
            if '->' in x else x)
        glob_keys = self._hash_dict.find_path(
            '*',
            mode='contains',
            action=lambda x: fnmatch.filter(list(data.keys()), x)
            if '->' in x else x)

        for k, v in data.items():
            paths = self._hash_dict.find_path(k, action=lambda x: v.get_hash())
            if len(paths) > 0:
                self.parameters.find_path(k, action=lambda x: v.load())
            else:
                if self.simulate and not k.startswith('self'):
                    k_ = k.split('->')[0] + '->'
                    paths = self._hash_dict.find_path(
                        k_, action=lambda x: v.get_hash(), mode='startswith')

    def get_hash(self):
        """
        Returns a unique identifier for the task
        """
        task_hash = self.parameters.get('task_hash', None)
        if task_hash is None:
            return self._hash_dict.hash()
        else:
            return task_hash

    def process(self):
        pass

    def find_cache(self):
        """
        Finds all the associated files in cache
        """
        cache_paths = find_cache(self.task_hash,
                                 self.global_parameters['cache_path'])
        return cache_paths

    def __process_outputs(self, outs):
        """
        Task outputs are turned into TaskIOs and, if in_memory is False, saved. All output TaskIOs are returned in a dictionary.
        """
        if type(outs).__name__ == 'ObjectRef':
            filter_outputs = self.parameters.get('outputs', None)
            if filter_outputs:
                self.output_names = []
                for k, v in filter_outputs.items():
                    self.output_names.append(k)

        if not isinstance(outs, tuple):
            outs = (outs, )

        out_dict = {
            '{}{}{}'.format(self.name, symbols['dot'], out_name):
            TaskIO(out_val,
                   self.get_hash(),
                   iotype='data',
                   name=out_name,
                   position=str(i))
            for i, (out_name,
                    out_val) in enumerate(zip(self.output_names, outs))
        }

        if not self.in_memory:
            self.logger.info('{}: Saving outputs'.format(self.name))
            for k, v in out_dict.items():
                if v.iotype == 'data':
                    out_dict[k] = v.save(
                        cache_path=self.global_parameters['cache_path'],
                        export_path=self.export_path,
                        compression_level=self.
                        global_parameters['cache_compression'],
                        export=self.export,
                        symlink_db=self.symlinkdb_path,
                        overwrite_export=self.
                        global_parameters['overwrite_export'])

        return out_dict

    def __parallel_run_ray(self, run_async=False):
        """
        Initializes a ray pool. Asynchronous pools are still not implemented.
        """
        from ray.util.multiprocessing.pool import Pool

        def set_niceness(niceness):  # pool initializer
            os.nice(niceness)

        def worker_wrapper(x):
            os.nice(self.parameters.get('niceness', 20))
            for k, v in zip(self.parameters['parallel'], x):
                self.parameters[k] = v
            out = self.process()
            return out

        iterable_vars = list(
            zip(*[self.parameters[k] for k in self.parameters['parallel']]))
        n_cores = self.parameters.get('n_cores', 4)
        pool = Pool(processes=n_cores,
                    initializer=set_niceness,
                    initargs=(self.parameters.get('niceness', 20), ),
                    ray_address='auto')  #(run on the same host it was called from)
        outs = pool.map(worker_wrapper, iterable_vars)

        return self.__process_outputs(outs)

    def __serial_run(self, run_async=False):
        """
        Runs the task. It can be run asynchronously using ray, and custom resources can be assigned through the 'resources' parameter.
        """
        if run_async:
            import ray
            import os
            import sys

            def run_process_async(self):
                os.nice(self.parameters.get('niceness', 20))
                self.logger.info('{}: Setting niceness {}'.format(
                    self.name, self.parameters.get('niceness', 20)))
                return self.process()

            resource_settings = self.parameters.get('resources', None)
            if resource_settings and 'gpus' in resource_settings:
                num_gpus = resource_settings['gpus']
                resource_settings.pop('gpus')
            else:
                num_gpus = 0

            if resource_settings:
                outs = ray.remote(run_process_async)._remote(
                    args=[self],
                    resources=resource_settings,
                    num_gpus=num_gpus)
            else:
                outs = ray.remote(run_process_async).remote(self)
        else:
            outs = self.process()
        return self.__process_outputs(outs)

    def __serial_map(self, iteration=None, run_async=False):
        """
        Runs the task over each input element. It can be run asynchronously.
        """
        self.initial_parameters = copy.deepcopy(self.parameters)
        self.original_name = copy.deepcopy(self.name)
        self.original_export_path = copy.deepcopy(self.export_path)

        map_var_names = self.parameters['map_vars']
        map_vars = zip(*[self.parameters[k].load() for k in map_var_names])

        if iteration is not None:
            map_vars = list(map_vars)
            map_vars = [map_vars[iteration]]
            initial_iter = iteration
        else:
            initial_iter = 0

        outs = []

        for i, iteration in enumerate(map_vars):
            self.parameters = copy.deepcopy(self.initial_parameters)
            self.parameters['iter'] = i + initial_iter
            self.cache_dir = Path(self.global_parameters['cache_path'],
                                  self.task_hash)
            self.export_path = Path(self.original_export_path, str(i))

            self.__make_hash_dict()
            self.task_hash = self.get_hash()

            for k, var in zip(map_var_names, iteration):
                self.parameters[k] = TaskIO(var,
                                            self.task_hash,
                                            iotype='data',
                                            name=k)

            if self.cache:
                cache_paths = self.find_cache()
            else:
                cache_paths = False
            if cache_paths:
                self.logger.info('Caching task {}'.format(self.name))
                out_dict = {
                    '{}{}{}'.format(self.name, symbols['dot'],
                                    Path(cache_i).stem):
                    TaskIO(cache_i,
                           self.task_hash,
                           iotype='path',
                           name=Path(cache_i).stem,
                           position=Path(cache_i).parts[-2].split('_')[-1])
                    for cache_i in cache_paths
                }
                for task_name, task in out_dict.items():
                    task.create_link(
                        Path(task.data).parent, Path(self.export_path))
            elif run_async:
                outs.append({
                    '{}->ray_reference'.format(self.name):
                    list(self.__serial_run(run_async=run_async).values())[0],
                    '{}->output_names'.format(self.name):
                    TaskIO(self.output_names,
                           self.get_hash(),
                           iotype='data',
                           name='output_names',
                           position='1')
                })
                self.output_names = ['ray_reference', 'output_names']
            else:
                outs.append(self.__serial_run(run_async=run_async))
            print('serial map')

        #Restore original parameters
        self.parameters = copy.deepcopy(self.initial_parameters)
        self.name = copy.deepcopy(self.original_name)
        self.export_path = Path(self.original_export_path, 'merged')

        self.__make_hash_dict()
        self.task_hash = self.get_hash()

        merge_map = {}
        for iter in outs:
            for k, v in iter.items():
                if k not in merge_map:
                    merge_map[k] = [v]
                else:
                    merge_map[k].extend([v])

        outs = tuple(
            [[r.load() for r in merge_map['{}->{}'.format(self.name, name)]]
             for name in self.output_names])

        return self.__process_outputs(outs)

    def run(self, iteration=None):
        """
        Runs the task. If it is mapping over its inputs, the iteration index is given.
        Handles the different behaviours such as return_as_function/return_as_class, map and parallel.
        """
        self.task_hash = self.get_hash()
        self.cache_dir = Path(self.global_parameters['cache_path'],
                              self.task_hash)
        self.export_dir = Path(self.global_parameters['output_path'],
                               self.name)

        self.return_as_function = self.parameters.get('return_as_function',
                                                      False)
        self.return_as_class = self.parameters.get('return_as_class', False)
        if self.logger is not None:
            self.logger.info('{}: Hash {}'.format(self.name, self.task_hash))

        if self.cache:
            cache_paths = self.find_cache()
        else:
            cache_paths = False
        if cache_paths:
            if self.logger is not None:
                self.logger.info('{}: Caching'.format(self.name))

            out_dict = {
                '{}{}{}'.format(self.name, symbols['dot'],
                                Path(cache_i).stem):
                TaskIO(cache_i,
                       self.task_hash,
                       iotype='path',
                       name=Path(cache_i).stem,
                       position=Path(cache_i).parts[-2].split('_')[-1])
                for cache_i in cache_paths
            }
            for task_name, task in out_dict.items():
                export = self.parameters.get('export', False)
                task.create_link(Path(task.data).parent,
                                 Path(self.export_path),
                                 copy_files=export)
        else:
            run_async = self.parameters.get('async', False)
            if self.return_as_function:
                if self.logger is not None:
                    self.logger.info('{}: Lazy run'.format(self.name))
                self.parameters['return_as_function'] = False
                out_dict = self.__process_outputs(self.process)
            elif self.return_as_class:
                if self.logger is not None:
                    self.logger.info('{}: Lazy run'.format(self.name))
                self.parameters['return_as_class'] = False
                out_dict = self.__process_outputs(self)
            elif (('parallel' not in self.parameters)
                  and ('map_vars' not in self.parameters)):
                if self.logger is not None:
                    self.logger.info('{}: Running'.format(self.name))
                out_dict = self.__serial_run(run_async=run_async)
            elif 'parallel' in self.parameters and not 'map_vars' in self.parameters:
                if self.logger is not None:
                    self.logger.info(
                        '{}: Running with pool of {} workers'.format(
                            self.name, self.parameters['n_cores']))
                out_dict = self.__parallel_run_ray(run_async=run_async)
            elif 'map_vars' in self.parameters and not 'parallel' in self.parameters:
                if iteration is not None:
                    if self.logger is not None:
                        self.logger.info('{}: Running iteration {}'.format(
                            self.name, iteration))
                else:
                    if self.logger is not None:
                        self.logger.info(
                            '{}: Running multiple iterations'.format(
                                self.name))
                out_dict = self.__serial_map(iteration=iteration,
                                             run_async=run_async)
            else:
                raise Exception(
                    'Mixing !parallel-map and !map in a task is not allowed')

        return out_dict
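A hedged sketch of a concrete subclass (the task name, parameter names and the process body are invented; 'cache' and in_memory are set explicitly because the defaults above do not define in_memory):

class ScaleTask(Task):
    # Hypothetical task that scales a list of numbers; only process() is overridden.
    def process(self):
        data = self.parameters['data']
        factor = self.parameters.get('factor', 2)
        return [x * factor for x in data]   # single output, published under the default name 'out'

task = ScaleTask({'data': [1, 2, 3], 'factor': 3, 'cache': False},
                 global_parameters={'in_memory': True},
                 name='Scale')
outs = task.run()   # roughly {'Scale->out': TaskIO([3, 6, 9], ...)}, assuming symbols['dot'] == '->'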
Example #13
    def __init__(self, config, logger=None):
        """
        Main class of the dienen library. It represents the model, which is built from a configuration file.
        config: can be a string or pathlib.Path pointing to a .yaml file, a dictionary or a kahnfigh Config.
        logger: optionally, a logger can be supplied to log all information related to the dienen model.
        """

        config = Config(config, safe=False)

        self.original_config = config
        self.config = copy.deepcopy(config)
        self.core_model = None
        self.architecture_config = None
        self.model_path = None
        self.name = self.config['Model'].get(
            'name',
            datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        if not self.model_path:
            self.model_path = self.config['Model'].get('path',
                                                       '{}'.format(self.name))
        self.weights = None
        self.optimizer_weights = None
        self.extra_data = None
        self.modules = self.config.get('Module', [])
        self.gpu_config = self.config.get('gpu_config', {
            'device': 'auto',
            'allow_growth': True
        })
        self.cache = True
        self.logger = logger
        self.input_shapes = None
        self.output_shapes = None

        training_strategy = self.config['Model'].get('DistributedStrategy',
                                                     None)
        if training_strategy is None:
            self.training_strategy = tf.distribute.get_strategy()
        elif training_strategy == 'Mirrored':
            self.training_strategy = tf.distribute.MirroredStrategy()

        if training_strategy is None:
            self.gpu_device = self.gpu_config.get('device', 'auto')
            if self.gpu_device == 'auto':
                gpu, mem = get_available_gpus()
                self.gpu_device = int(gpu)
                if self.logger:
                    self.logger.info(
                        "Automatically selected device {} with {} available memory"
                        .format(self.gpu_device, mem))
            gpu_growth = self.gpu_config.get('allow_growth', True)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            if gpus:
                try:
                    if len(gpus) <= self.gpu_device:
                        raise Exception(
                            'There are only {} available GPUs and GPU {} was requested'
                            .format(len(gpus), self.gpu_device))
                    tf.config.experimental.set_visible_devices(
                        gpus[self.gpu_device], 'GPU')
                except RuntimeError as e:
                    warnings.warn('Failed setting GPUs. {}'.format(e))
            if gpu_growth:
                for gpu in gpus:
                    try:
                        tf.config.experimental.set_memory_growth(
                            gpu, gpu_growth)
                    except RuntimeError as e:
                        warnings.warn(
                            'Failed setting GPU dynamic memory allocation. {}'.
                            format(e))

            logical_gpus = tf.config.experimental.list_logical_devices('GPU')

        if self.logger:
            self.logger.debug("Physical GPUs: {}. Logical GPUs: {}".format(
                len(gpus), len(logical_gpus)))

        self.externals = self.config['Model'].get('External', None)
        self.validation_data = None
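A short usage sketch; the import path follows the isinstance checks in Example #9 (dienen.core.model.Model), and 'model.yaml' is a placeholder for a configuration file with a 'Model' section:

from dienen.core.model import Model

model = Model('model.yaml')
# Per the docstring above, the config may also be given as a dict or a kahnfigh Config.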