Example #1
    def verify(self):
        super(Concat3DConf, self).verify()

        # check that the ranks of all the inputs are equal
        rank_equal_flag = True
        for i in range(len(self.input_ranks)):
            if self.input_ranks[i] != self.input_ranks[0]:
                rank_equal_flag = False
                break
        if not rank_equal_flag:
            raise ConfigurationError(
                "For layer Concat3D, the ranks of all inputs must be equal!"
            )

        if self.concat3D_axis == 1:
            # check that the last dimensions of all the inputs are equal
            input_dims = list(self.input_dims)
            dim_equal_flag = True
            for i in range(len(input_dims)):
                if input_dims[i][-1] != input_dims[0][-1]:
                    dim_equal_flag = False
                    break
            if not dim_equal_flag:
                raise ConfigurationError(
                    "Concat3D with axis = 1 requires that the input dimensions be the same!"
                )

        # check that the concat3D_axis is legal
        if self.concat3D_axis not in [1, 2]:
            raise ConfigurationError(
                "For layer Concat3D, the concat axis must be 1 or 2!")
Example #2
    def inference(self):
        """ Dimension inference of encoder and decoder is conducted here, but not in the Model.

        Returns:

        """
        self.encoder_conf_cls.use_gpu = self.use_gpu
        self.decoder_conf_cls.use_gpu = self.use_gpu

        # inference inside the encoder and decoder
        self.encoder_conf_cls.input_dims = copy.deepcopy(self.input_dims)
        self.encoder_conf_cls.inference()

        # rank verification between encoder and decoder
        former_output_ranks = [self.encoder_conf_cls.output_rank]
        for input_rank, former_output_rank in zip(
                self.decoder_conf_cls.input_ranks, former_output_ranks):
            if input_rank != -1 and input_rank != former_output_rank:
                raise ConfigurationError(
                    "Input ranks of decoder %s are inconsistent with former encoder %s"
                    % (self.decoder_name, self.encoder_name))
        self.decoder_conf_cls.input_ranks = copy.deepcopy(former_output_ranks)

        # some dimensions of the decoder are inferred from the encoder
        self.decoder_conf_cls.input_dims = [self.encoder_conf_cls.output_dim]
        self.decoder_conf_cls.input_context_dims = [
            self.encoder_conf_cls.output_context_dim
        ]
        self.decoder_conf_cls.inference()

        self.output_dim = self.decoder_conf_cls.output_dim
        self.output_rank = 3
Example #3
    def forward(self, x, x_len, y, y_len):
        """

        Args:
            x: [batch_size, dim]
            x_len: [batch_size]
            y: [batch_size, dim]
            y_len: [batch_size]
        Returns:
            Tensor: [batch_size, 1], None

        """

        batch_size = x.size()[0]
        if "cos" in self.layer_conf.operations:
            result = F.cosine_similarity(x, y)
        elif "euclidean" in self.layer_conf.operations:
            result = torch.sqrt(torch.sum((x - y)**2, dim=1))
        elif "manhattan" in self.layer_conf.operations:
            result = torch.sum(torch.abs((x - y)), dim=1)
        elif "chebyshev" in self.layer_conf.operations:
            result = torch.abs((x - y)).max(dim=1)
        else:
            raise ConfigurationError("This operation is not supported!")

        result = result.view(batch_size, 1)
        return result, None
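The four branches above map directly onto plain torch calls. Below is a minimal, self-contained sketch on toy inputs (the shapes follow the docstring; the tensor names and sizes are illustrative):

import torch
import torch.nn.functional as F

x = torch.randn(4, 8)  # [batch_size, dim]
y = torch.randn(4, 8)

cos = F.cosine_similarity(x, y)                         # [batch_size]
euclidean = torch.sqrt(torch.sum((x - y) ** 2, dim=1))  # [batch_size]
manhattan = torch.sum(torch.abs(x - y), dim=1)          # [batch_size]
# max(dim=1) returns a (values, indices) pair; only the values are wanted
chebyshev = torch.abs(x - y).max(dim=1)[0]              # [batch_size]

for name, d in [("cos", cos), ("euclidean", euclidean),
                ("manhattan", manhattan), ("chebyshev", chebyshev)]:
    print(name, d.view(4, 1).shape)                     # each reshaped to [batch_size, 1]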
Example #4
    def forward(self, *args):
        """ process input

        Args:
            *args: (Tensor): string, string_len, string2, string2_len
                e.g. string (Tensor): [batch_size, seq_len, dim], string_len (Tensor): [batch_size]


        Returns:
            Tensor: [batch_size, seq_len, output_dim], [batch_size]
        """
        # check that the dimensions of the inputs are legal for ElementWisedMultiply3D
        dim_flag = True
        input_dims = list(self.layer_conf.input_dims)
        if (args[0].shape[1] * args[0].shape[2]) != (args[2].shape[1] *
                                                     args[2].shape[2]):
            if args[0].shape[1] == args[2].shape[1] and (
                    input_dims[1][-1] == 1 or input_dims[0][-1] == 1):
                dim_flag = True
            else:
                dim_flag = False
        if not dim_flag:
            raise ConfigurationError(
                "For layer ElementWisedMultiply3D, the dimensions of the inputs should be equal or 1, or the number of elements of the two inputs (except for the first dimension) should be equal"
            )
        # element-wise multiply; allocate the zeros tensor on the same device and dtype as the input
        return torch.addcmul(
            torch.zeros_like(args[0]), args[0], args[2], value=1), args[1]
Example #5
    def get_topological_sequence(self):
        """ get topological sequence of nodes in the model

        Returns:

        """
        total_layer_ids = Queue()
        for layer_id in self.layers.keys():
            if layer_id != EMBED_LAYER_ID:
                total_layer_ids.put(layer_id)

        topological_list = []
        circular_cnt = 0     # used for checking if there is at least one legal topological sorting
        while not total_layer_ids.empty():
            layer_id = total_layer_ids.get()
            if len(self.layer_dependencies[layer_id]) == 0:
                for layer_id2 in self.layer_dependencies:
                    if layer_id in self.layer_dependencies[layer_id2]:
                        self.layer_dependencies[layer_id2].remove(layer_id)
                circular_cnt = 0
                topological_list.append(layer_id)
            else:
                total_layer_ids.put(layer_id)
                circular_cnt += 1
                if circular_cnt >= total_layer_ids.qsize():
                    rest_layers = []
                    while not total_layer_ids.empty():
                        rest_layers.append(total_layer_ids.get())
                    raise ConfigurationError("The model architecture is illegal because there is a circular dependency "
                        "or there are some isolated layers. The layers can not be resolved: [%s]" % (", ".join(rest_layers)))

        logging.debug("Topological sequence of nodes: %s" % (",".join(topological_list)))
        return topological_list
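The loop above is a queue-based variant of Kahn's algorithm: a layer is emitted once all of its dependencies have been emitted, and a stall counter detects when the queue stops shrinking. A standalone sketch of the same idea (the function and variable names here are illustrative, not from the repo):

from queue import Queue

def topological_sequence(layer_dependencies):
    # layer_dependencies maps each layer id to the set of layer ids it depends on
    deps = {layer: set(depends) for layer, depends in layer_dependencies.items()}
    pending = Queue()
    for layer_id in deps:
        pending.put(layer_id)
    order, stalled = [], 0
    while not pending.empty():
        layer_id = pending.get()
        if not deps[layer_id]:
            for other in deps:         # the layer is resolved; unblock its dependents
                deps[other].discard(layer_id)
            order.append(layer_id)
            stalled = 0
        else:
            pending.put(layer_id)      # still blocked; re-queue it
            stalled += 1
            if stalled >= pending.qsize():
                raise ValueError("circular dependency or isolated layers")
    return order

print(topological_sequence({"a": set(), "b": {"a"}, "c": {"a", "b"}}))  # ['a', 'b', 'c']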
Example #6
    def get_item(self, keys, default=None, use_default=False):
        """

        Args:
            keys:
            default: if some key is not found and default is None, we would raise an Exception, except that use_default is True
            use_default: if you really want to set default to None, set use_default=True

        Returns:

        """
        item = self.conf
        valid_keys = []
        try:
            for key in keys:
                item = item[key]
                valid_keys.append(key)
        except:
            error_keys = copy.deepcopy(valid_keys)
            error_keys.append(key)
            if default is None and use_default is False:
                raise ConfigurationError(
                    "The configuration file %s is illegal. There should be an item configuration[%s], "
                    "but the item %s is not found." %
                    (self.conf_path, "][".join(error_keys), key))
            else:
                print(
                    "configuration[%s] is not found in %s, use default value %s"
                    % ("][".join(error_keys), self.conf_path, repr(default)))
                item = default

        return item
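What the try/for loop does is simply walk the key path through nested containers, so an intermediate list index works as well as a dict key. A toy sketch, with conf standing in for self.conf:

conf = {"inputs": {"data_paths": {"train_data_path": "train.tsv"}}}

item = conf
for key in ["inputs", "data_paths", "train_data_path"]:
    item = item[key]   # the first missing key raises here, which get_item turns into a ConfigurationError
print(item)            # train.tsv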
Example #7
    def verify(self):
        super(CombinationConf, self).verify()

        # check that the ranks of all the inputs are equal
        rank_equal_flag = True
        for i in range(len(self.input_ranks)):
            if self.input_ranks[i] != self.input_ranks[0]:
                rank_equal_flag = False
                break
        if not rank_equal_flag:
            raise ConfigurationError(
                "For layer Combination, the ranks of all inputs must be consistent!"
            )

        if "difference" in self.operations:
            assert len(self.input_dims) == 2, \
                "The difference operation requires exactly two inputs"

        if "difference" in self.operations or "dot_multiply" in self.operations:
            input_dims = list(self.input_dims)
            dim_equal_flag = True
            for i in range(len(input_dims)):
                if input_dims[i] != input_dims[0]:
                    dim_equal_flag = False
                    break
            if not dim_equal_flag:
                raise ConfigurationError(
                    "Difference and dot_multiply require that the input dimensions be the same"
                )
Example #8
    def forward(self, *args):
        """ process input

        Args:
            *args: (Tensor): string, string_len, string2, string2_len
                e.g. string (Tensor): [batch_size, seq_len, dim], string_len (Tensor): [batch_size]

        Returns:
            Tensor: [batch_size, seq_len, output_dim], [batch_size]
        """

        # check that the dimensions of the inputs are legal for Minus3D
        dim_flag = True
        input_dims = list(self.layer_conf.input_dims)
        if (args[0].shape[1] * args[0].shape[2]) != (args[2].shape[1] *
                                                     args[2].shape[2]):
            if args[0].shape[1] == args[2].shape[1] and (
                    input_dims[1][-1] == 1 or input_dims[0][-1] == 1):
                dim_flag = True
            else:
                dim_flag = False
        if not dim_flag:
            raise ConfigurationError(
                "For layer Minus3D, the dimensions of the inputs should be equal or 1, or the number of elements of the two inputs (except for the first dimension) should be equal"
            )

        if self.layer_conf.abs_flag:
            return torch.abs(args[0] - args[2]), args[1]
        return (args[0] - args[2]), args[1]
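The broadcast case the check above allows (equal seq_len, one input with a trailing dimension of 1) can be seen on toy tensors; the shapes below are arbitrary examples:

import torch

string = torch.randn(2, 5, 8)    # [batch_size, seq_len, dim]
string2 = torch.randn(2, 5, 1)   # trailing dimension of 1 broadcasts against dim 8
print((string - string2).shape)  # torch.Size([2, 5, 8])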
Example #9
    def check_version_compat(self, nb_version, conf_version):
        """ check if the version of toolkit and configuration file is compatible

        Args:
            nb_version: x.y.z
            conf_version: x.y.z

        Returns:
            If the x field and y field are both the same, return True, else return False

        """
        nb_version_split = nb_version.split('.')
        conf_version_split = conf_version.split('.')
        if len(nb_version_split) != len(conf_version_split):
            raise ConfigurationError('The tool_version field of your configuration is illegal!')
        if not (nb_version_split[0] == conf_version_split[0] and nb_version_split[1] == conf_version_split[1]):
            raise ConfigurationError('The NeuronBlocks version is %s, but the configuration version is %s, please update your configuration to %s.%s.X' % (nb_version, conf_version, nb_version_split[0], nb_version_split[1]))
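Usage sketch: only the x and y fields must match, the z field may differ. Here conf stands for whatever object defines this method:

conf.check_version_compat('1.1.0', '1.1.5')   # passes: 1.1 == 1.1
conf.check_version_compat('1.1.0', '1.2.0')   # raises ConfigurationError: 1.1 != 1.2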
Example #10
 def inference(self):
     self.output_dim = copy.deepcopy(self.input_dims[0])
     if self.input_dims[0][1] == -1 or self.input_dims[1][1] == -1:
         raise ConfigurationError(
             "For Match layer, the sequence length should be fixed")
     self.output_dim[-1] = self.input_dims[1][1]  # y_len
     super(MatchConf,
           self).inference()  # PUT THIS LINE AT THE END OF inference()
Example #11
 def inference(self):
     self.output_dim = copy.deepcopy(self.input_dims[0])
     if self.input_dims[0][1] == -1 or self.input_dims[1][1] == -1:
         raise ConfigurationError(
             "For Expand_plus layer, the sequence length should be fixed")
     self.output_dim.insert(2, self.input_dims[1][1])  # y_len
     super(Expand_plusConf,
           self).inference()  # PUT THIS LINE AT THE END OF inference()
Example #12
 def inference(self):
     self.output_dim = []
     if self.input_dims[0][1] == -1:
         raise ConfigurationError("For Flatten layer, the sequence length should be fixed")
     else:
         self.output_dim.append(self.input_dims[0][0])
         self.output_dim.append(self.input_dims[0][1]*self.input_dims[0][-1])
         
     super(FlattenConf, self).inference()
Example #13
    def verify(self):
        super(Concat2DConf, self).verify()

        # check that the ranks of all the inputs are equal
        rank_equal_flag = True
        for i in range(len(self.input_ranks)):
            if self.input_ranks[i] != self.input_ranks[0]:
                rank_equal_flag = False
                break
        if not rank_equal_flag:
            raise ConfigurationError(
                "For layer Concat2D, the ranks of all inputs must be equal!"
            )

        # check that the concat2D_axis is legal
        if self.concat2D_axis != 1:
            raise ConfigurationError(
                "For layer Concat2D, the concat axis must be 1!")
Example #14
    def add_attr_exist_assertion_for_user(self, attr):
        """ check if there are some attributes being forgot by users

        Args:
            attr (str): the attribution name

        Returns:
            None

        """
        if not hasattr(self, attr):
            raise ConfigurationError("For layer %s, please configure %s attribute for %s in the configuration file!" % (type(self).__name__, attr, type(self).__name__))
Example #15
 def inference(self):
     self.output_dim = []
     flatted_length = 1
     for i in range(1, len(self.input_dims[0])):
         if self.input_dims[0][i] == -1:
             raise ConfigurationError("For Flatten layer, the sequence length should be fixed")
         else:
             flatted_length *= self.input_dims[0][i]
     
     self.output_dim = [self.input_dims[0][0], flatted_length]
         
     super(FlattenConf, self).inference()
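A worked example of the arithmetic above, with arbitrary numbers: every dimension after the batch axis is multiplied into a single flattened length.

input_dims = [[32, 10, 300]]    # [batch_size, seq_len, dim]; -1 would mean 'not fixed'
flatted_length = 1
for d in input_dims[0][1:]:
    flatted_length *= d
print([input_dims[0][0], flatted_length])   # [32, 3000]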
Example #16
    def add_layer(self, layer_id, layer):
        """ register a layer

        Args:
            layer_id:
            layer:

        Returns:

        """
        if layer_id in self.layers:
            raise ConfigurationError("The layer id %s is not unique!" % layer_id)
        else:
            self.layers[layer_id] = layer
Example #17
    def verify(self):
        super(MatrixMultiplyConf, self).verify()
        # # to check if the ranks of all the inputs are equal
        # rank_equal_flag = True
        # for i in range(len(self.input_ranks)):
        #     if self.input_ranks[i] != self.input_ranks[0]:
        #         rank_equal_flag = False
        #         break
        # if rank_equal_flag == False:
        #     raise ConfigurationError("For layer MatrixMultiply, the ranks of each inputs should be equal!")

        # check that the value of operation is legal
        if self.operation not in ['common', 'seq_based', 'dim_based']:
            raise ConfigurationError(
                "the operation must be one of 'common', 'seq_based' or 'dim_based'"
            )
Example #18
    def verify(self):
        super(CalculateDistanceConf, self).verify()

        assert len(self.input_dims) == 2, \
            "This operation requires exactly two inputs"

        # check that the ranks of all the inputs are equal to 2
        rank_equal_flag = True
        for i in range(len(self.input_ranks)):
            if self.input_ranks[i] != self.input_ranks[0] or self.input_ranks[i] != 2:
                rank_equal_flag = False
                break
        if not rank_equal_flag:
            raise ConfigurationError(
                "For layer CalculateDistance, the ranks of all inputs must be equal to 2!"
            )
Example #19
 def get_value_by_key(json, key, key_prefix='', use_default=False, default=None):
     """
     Args:
         json: a json object
         key: a key pointing to the value wanted to acquire
         use_default: if you really want to use default value when key can not be found in json object, set use_default=True
         default: if key is not found and default is None, we would raise an Exception, except that use_default is True
     Returns:
         value: 
     """
     try:
         value = json[key]
     except:
         if not use_default:
             raise ConfigurationError("key[%s] can not be found in configuration file" % (key_prefix + key))
         else:
             value = default
     return value
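Usage sketch with a plain dict (the values are made up); key_prefix only decorates the error message:

conf = {"tool_version": "1.1.0"}
print(get_value_by_key(conf, "tool_version"))                     # 1.1.0
print(get_value_by_key(conf, "language",
                       use_default=True, default="english"))      # english
get_value_by_key(conf, "language", key_prefix="inputs.")          # raises ConfigurationError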
Example #20
 def inference(self):
     shape1 = self.input_dims[0]
     shape2 = self.input_dims[1]
     if shape1[1] == -1 or shape2[1] == -1:
         raise ConfigurationError(
             "For Interaction layer, the sequence length should be fixed")
     self.output_dim = None
     if self.matching_type in ['mul', 'plus', 'minus']:
         self.output_dim = [shape1[0], shape1[1], shape2[1], shape1[2]]
     elif self.matching_type in ['dot', 'general']:
         self.output_dim = [shape1[0], shape1[1], shape2[1], 1]
     elif self.matching_type == 'concat':
         self.output_dim = [
             shape1[0], shape1[1], shape2[1], shape1[2] + shape2[2]
         ]
     else:
         raise ValueError("Invalid `matching_type` %s received. Must be one of "
                          "`mul`, `general`, `plus`, `minus`, `dot` and `concat`."
                          % self.matching_type)
     super(InteractionConf,
           self).inference()  # PUT THIS LINE AT THE END OF inference()
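A worked shape example for the branches above, with arbitrary sizes (batch 32, x_len 20, y_len 30, dim 64):

shape1, shape2 = [32, 20, 64], [32, 30, 64]
print([shape1[0], shape1[1], shape2[1], shape1[2]])               # mul/plus/minus -> [32, 20, 30, 64]
print([shape1[0], shape1[1], shape2[1], 1])                       # dot/general    -> [32, 20, 30, 1]
print([shape1[0], shape1[1], shape2[1], shape1[2] + shape2[2]])   # concat         -> [32, 20, 30, 128]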
Example #21
    def verify(self):
        super(Minus2DConf, self).verify()

        # # to check if the ranks of all the inputs are equal
        # rank_equal_flag = True
        # for i in range(len(self.input_ranks)):
        #     if self.input_ranks[i] != self.input_ranks[0]:
        #         rank_equal_flag = False
        #         break
        # if rank_equal_flag == False:
        #     raise ConfigurationError("For layer Minus2D, the ranks of each inputs should be equal!")

        # check that the dimensions of all the inputs are equal or 1
        dim_flag = True
        input_dims = list(self.input_dims)
        for i in range(len(input_dims)):
            if input_dims[i][1] != input_dims[0][1] and input_dims[i][1] != 1 \
                    and input_dims[0][1] != 1:
                dim_flag = False
                break
        if not dim_flag:
            raise ConfigurationError(
                "For layer Minus2D, the dimensions of all inputs should be equal or 1"
            )
Example #22
 def _check_command_executor_is_set(self):
     if not self._command_executor:
         raise ConfigurationError("command_executor is a required property!")
Example #23
def get_conf(layer_id,
             layer_name,
             input_layer_ids,
             all_layer_configs,
             model_input_ids,
             use_gpu,
             conf_dict=None,
             shared_conf=None,
             succeed_embedding_flag=False,
             output_layer_flag=False,
             target_num=None,
             fixed_lengths=None):
    """ get layer configuration

    Args
        layer_id: layer identifier
        layer_name: name of layer such as BiLSTM
        input_layer_ids (list): the inputs of current layer
        all_layer_configs (dict): records the conf class of each layer.
        model_input_ids (set): the inputs of the model, e.g. ['query', 'passage']
        use_gpu:
        conf_dict:
        shared_conf: if fixed_lengths is not None, the output_dim of shared_conf should be corrected!
        flag:
        output_layer_flag:
        target_num: used for inference the dimension of output space if someone declare a dimension of -1
        fixed_lengths
    Returns:
        configuration class coresponds to the layer

    """
    if shared_conf:
        conf = copy.deepcopy(shared_conf)
    else:
        try:
            conf_dict['use_gpu'] = use_gpu

            # for classification tasks, we usually add a Linear layer to project the output to dimension of number of classes. If we don't know the #classes, we can use '-1' instead and we would calculate the number of classes from the corpus.
            if layer_name == 'Linear':
                if isinstance(conf_dict['hidden_dim'],
                              list) and conf_dict['hidden_dim'][-1] == -1:
                    assert output_layer_flag is True, "Only in the last layer, hidden_dim == -1 is allowed!"
                    assert target_num is not None, "Number of targets should be given!"
                    conf_dict['hidden_dim'][-1] = target_num
                elif isinstance(conf_dict['hidden_dim'],
                                int) and conf_dict['hidden_dim'] == -1:
                    assert output_layer_flag is True, "Only in the last layer, hidden_dim == -1 is allowed!"
                    assert target_num is not None, "Number of targets should be given!"
                    conf_dict['hidden_dim'] = target_num

            conf = eval(layer_name + "Conf")(**conf_dict)
        except NameError:
            raise LayerConfigUndefinedError("\"%sConf\" has not been defined" %
                                            layer_name)

    # verify the rank consistency of joint layers
    if layer_name == EMBED_LAYER_NAME:
        # the embedding layer
        pass
    else:
        # make sure all the inputs to current layer exist
        for input_layer_id in input_layer_ids:
            if not (input_layer_id in all_layer_configs
                    or input_layer_id in model_input_ids):
                raise ConfigurationError(
                    "The input %s of layer %s does not exist. Please define it before "
                    "defining layer %s!" %
                    (input_layer_id, layer_id, layer_id))

        former_output_ranks = [
            all_layer_configs[input_layer_id].output_rank
            if input_layer_id in all_layer_configs else
            all_layer_configs[EMBED_LAYER_ID].output_rank
            for input_layer_id in input_layer_ids
        ]
        # inference input_dim
        conf.input_dims = [
            all_layer_configs[input_layer_id].output_dim
            if input_layer_id in all_layer_configs else
            all_layer_configs[EMBED_LAYER_ID].output_dim
            for input_layer_id in input_layer_ids
        ]

        # If the inputs come from embedding layer and fixed_lengths exist, set the length to input_dims
        if len(input_layer_ids) == 1 and input_layer_ids[
                0] in model_input_ids and fixed_lengths:
            conf.input_dims[0][1] = fixed_lengths[input_layer_ids[0]]

        # check and verify input ranks
        if conf.num_of_inputs > 0:
            if conf.num_of_inputs != len(input_layer_ids):
                raise ConfigurationError("%s only accept %d inputs but you feed %d inputs to it!" % \
                        (layer_name, conf.num_of_inputs, len(input_layer_ids)))
        elif conf.num_of_inputs == -1:
            conf.num_of_inputs = len(input_layer_ids)
            if isinstance(conf.input_ranks, list):
                conf.input_ranks = conf.input_ranks * conf.num_of_inputs
            else:
                logging.warning(
                    "[For developer of %s] The input_ranks attribute should be a list!"
                    % (layer_name))
                conf.input_ranks = [conf.input_ranks] * conf.num_of_inputs

        for input_rank, former_output_rank in zip(conf.input_ranks,
                                                  former_output_ranks):
            if input_rank != -1 and input_rank != former_output_rank:
                raise ConfigurationError(
                    "Input ranks of %s are inconsistent with former layers" %
                    layer_id)
        conf.input_ranks = copy.deepcopy(former_output_ranks)

    # inference and verification inside the layer
    conf.inference()  # update attributes which rely on the input dimensions or other context
    conf.verify()  # verify that the configuration is legal

    logging.debug(
        'Layer id: %s; name: %s; input_dims: %s; input_ranks: %s; output_dim: %s; output_rank: %s'
        % (layer_id, layer_name, conf.input_dims if layer_id != 'embedding'
           else 'None', conf.input_ranks, conf.output_dim, conf.output_rank))

    return conf
Example #24
    def __init__(self, conf, problem, vocab_info, use_gpu):
        """

        Args:
            inputs: ['string1', 'string2']
            layer_archs:  The layers must produce tensors with similar shapes. The layers may be nested.
                [
                    {
                    'layer': Layer name,
                    'conf': {xxxx}
                    },
                    [
                        {
                        'layer': Layer name,
                        'conf': {},
                        },
                        {
                        'layer': Layer name,
                        'conf': {},
                        }
                    ]
                ]
            vocab_info:
                {
                    'word':  {
                        'vocab_size': xxx,
                        'init_weights': np matrix
                        }
                    'postag': {
                        'vocab_size': xxx,
                        'init_weights': None
                        }
                }
        """
        super(Model, self).__init__()

        inputs = conf.object_inputs_names
        layer_archs = conf.architecture
        target_num = problem.output_target_num()

        # correct the real fixed length if begin/end of sentence are added
        if conf.fixed_lengths:
            fixed_lengths_corrected = copy.deepcopy(conf.fixed_lengths)
            for seq in fixed_lengths_corrected:
                if problem.with_bos_eos:
                    fixed_lengths_corrected[seq] += 2
        else:
            fixed_lengths_corrected = None

        self.use_gpu = use_gpu

        all_layer_configs = dict()
        self.layers = nn.ModuleDict()
        self.layer_inputs = dict()
        self.layer_dependencies = dict()
        self.layer_dependencies[EMBED_LAYER_ID] = set()
        # change output_layer_id to list for support multi_output
        self.output_layer_id = []

        for layer_index, layer_arch in enumerate(layer_archs):
            output_layer_flag = layer_arch.get('output_layer_flag') is True
            succeed_embedding_flag = layer_index > 0 and 'inputs' in layer_arch and \
                    all(single_input in inputs for single_input in layer_arch['inputs'])

            if output_layer_flag:
                self.output_layer_id.append(layer_arch['layer_id'])
                # if hasattr(self, 'output_layer_id'):
                #     raise ConfigurationError("There should be only one output!")
                # else:
                #     self.output_layer_id = layer_arch['layer_id']

            if layer_index == 0:
                # embedding layer
                emb_conf = copy.deepcopy(vocab_info)
                for input_cluster in emb_conf:
                    emb_conf[input_cluster]['dim'] = layer_arch['conf'][
                        input_cluster]['dim']
                    emb_conf[input_cluster]['fix_weight'] = layer_arch['conf'][
                        input_cluster].get('fix_weight', False)

                all_layer_configs[EMBED_LAYER_ID] = get_conf(
                    EMBED_LAYER_ID,
                    layer_arch['layer'],
                    None,
                    all_layer_configs,
                    inputs,
                    self.use_gpu,
                    conf_dict={'conf': emb_conf},
                    shared_conf=None,
                    succeed_embedding_flag=False,
                    output_layer_flag=output_layer_flag,
                    target_num=target_num,
                    fixed_lengths=fixed_lengths_corrected)
                self.add_layer(
                    EMBED_LAYER_ID,
                    get_layer(layer_arch['layer'],
                              all_layer_configs[EMBED_LAYER_ID]))
            else:
                if layer_arch[
                        'layer'] in self.layers and not 'conf' in layer_arch:
                    # reuse formerly defined layers (share the same parameters)
                    logging.debug(
                        "Layer id: %s; Sharing configuration with layer %s" %
                        (layer_arch['layer_id'], layer_arch['layer']))
                    conf_dict = None
                    shared_conf = all_layer_configs[layer_arch['layer']]
                else:
                    conf_dict = layer_arch['conf']
                    shared_conf = None

                # if the layer is EncoderDecoder, infer the vocab size
                if layer_arch['layer'] == 'EncoderDecoder':
                    layer_arch['conf']['decoder_conf'][
                        'decoder_vocab_size'] = target_num
                all_layer_configs[layer_arch['layer_id']] = get_conf(
                    layer_arch['layer_id'],
                    layer_arch['layer'],
                    layer_arch['inputs'],
                    all_layer_configs,
                    inputs,
                    self.use_gpu,
                    conf_dict=conf_dict,
                    shared_conf=shared_conf,
                    succeed_embedding_flag=succeed_embedding_flag,
                    output_layer_flag=output_layer_flag,
                    target_num=target_num,
                    fixed_lengths=fixed_lengths_corrected)

                if layer_arch[
                        'layer'] in self.layers and not 'conf' in layer_arch:
                    self.add_layer(layer_arch['layer_id'],
                                   self.layers[layer_arch['layer']])
                else:
                    self.add_layer(
                        layer_arch['layer_id'],
                        get_layer(layer_arch['layer'],
                                  all_layer_configs[layer_arch['layer_id']]))

                self.layer_inputs[
                    layer_arch['layer_id']] = layer_arch['inputs']

                # register dependencies, except embeddings
                cur_layer_depend = set()
                for layer_depend_id in layer_arch['inputs']:
                    if not layer_depend_id in inputs:
                        cur_layer_depend.add(layer_depend_id)
                self.add_dependency(layer_arch['layer_id'], cur_layer_depend)

        logging.debug("Layer dependencies: %s" % repr(self.layer_dependencies))

        if len(self.output_layer_id) == 0:
            raise ConfigurationError("Please define an output layer")

        self.layer_topological_sequence = self.get_topological_sequence()
Example #25
    def load_from_file(self, conf_path):
        with codecs.open(conf_path, 'r', encoding='utf-8') as fin:
            try:
                self.conf = json.load(fin)
            except Exception as e:
                raise ConfigurationError(
                    "%s is not a legal JSON file, please check your JSON format!"
                    % conf_path)

        self.tool_version = self.get_item(['tool_version'])
        self.language = self.get_item(['language'], default='english').lower()
        self.problem_type = self.get_item(['inputs', 'dataset_type']).lower()
        #if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
        self.tagging_scheme = self.get_item(['inputs', 'tagging_scheme'],
                                            default=None,
                                            use_default=True)

        if self.mode == 'normal':
            self.use_cache = self.get_item(['inputs', 'use_cache'], True)
        elif self.mode == 'philly':
            self.use_cache = True

        # OUTPUTS
        if hasattr(self.params,
                   'model_save_dir') and self.params.model_save_dir:
            self.save_base_dir = self.params.model_save_dir
        else:
            self.save_base_dir = self.get_item(['outputs', 'save_base_dir'])

        if self.phase == 'train':
            # in train.py, it is called pretrained_model_path
            if hasattr(self.params, 'pretrained_model_path'
                       ) and self.params.pretrained_model_path:
                self.pretrained_model_path = self.previous_model_path = self.params.pretrained_model_path
            else:
                self.pretrained_model_path = self.previous_model_path = self.get_item(
                    ['inputs', 'data_paths', 'pretrained_model_path'],
                    default=None,
                    use_default=True)
        elif self.phase == 'test' or self.phase == 'predict':
            # in test.py and predict.py, it is called pretrained_model_path
            if hasattr(
                    self.params,
                    'previous_model_path') and self.params.previous_model_path:
                self.previous_model_path = self.pretrained_model_path = self.params.previous_model_path
            else:
                self.previous_model_path = self.pretrained_model_path = os.path.join(
                    self.save_base_dir,
                    self.get_item(['outputs', 'model_name'
                                   ]))  # namely, the model_save_path

        if hasattr(
                self, 'pretrained_model_path'
        ) and self.pretrained_model_path:  # namely self.previous_model_path
            tmp_saved_problem_path = os.path.join(
                os.path.dirname(self.pretrained_model_path),
                '.necessary_cache', 'problem.pkl')
            self.saved_problem_path = tmp_saved_problem_path if os.path.isfile(tmp_saved_problem_path) \
                else os.path.join(os.path.dirname(self.pretrained_model_path), 'necessary_cache', 'problem.pkl')
            if not (os.path.isfile(self.pretrained_model_path)
                    and os.path.isfile(self.saved_problem_path)):
                raise Exception(
                    'Previous trained model %s or its dictionaries %s do not exist!'
                    % (self.pretrained_model_path, self.saved_problem_path))

        if self.phase != 'cache':
            prepare_dir(
                self.save_base_dir,
                True,
                allow_overwrite=self.params.force or self.mode == 'philly',
                extra_info='will overwrite model file and train.log' if
                self.phase == 'train' else 'will add %s.log and predict file' %
                self.phase)

        if hasattr(self.params, 'log_dir') and self.params.log_dir:
            self.log_dir = self.params.log_dir
            if self.phase != 'cache':
                prepare_dir(self.log_dir, True, allow_overwrite=True)
        else:
            self.log_dir = self.save_base_dir

        if self.phase == 'train':
            self.train_log_path = os.path.join(
                self.log_dir, self.get_item(['outputs', 'train_log_name']))
            if self.mode == 'philly' or self.params.debug:
                log_set(self.train_log_path,
                        console_level='DEBUG',
                        console_detailed=True,
                        disable_log_file=self.params.disable_log_file)
            else:
                log_set(self.train_log_path,
                        disable_log_file=self.params.disable_log_file)
        elif self.phase == 'test':
            self.test_log_path = os.path.join(
                self.log_dir, self.get_item(['outputs', 'test_log_name']))
            if self.mode == 'philly' or self.params.debug:
                log_set(self.test_log_path,
                        console_level='DEBUG',
                        console_detailed=True,
                        disable_log_file=self.params.disable_log_file)
            else:
                log_set(self.test_log_path,
                        disable_log_file=self.params.disable_log_file)
        elif self.phase == 'predict':
            self.predict_log_path = os.path.join(
                self.log_dir, self.get_item(['outputs', 'predict_log_name']))
            if self.mode == 'philly' or self.params.debug:
                log_set(self.predict_log_path,
                        console_level='DEBUG',
                        console_detailed=True,
                        disable_log_file=self.params.disable_log_file)
            else:
                log_set(self.predict_log_path,
                        disable_log_file=self.params.disable_log_file)
        if self.phase != 'cache':
            self.predict_output_path = self.params.predict_output_path if self.params.predict_output_path else os.path.join(
                self.save_base_dir,
                self.get_item(['outputs', 'predict_output_name'],
                              default='predict.tsv'))
            logging.debug('Prepare dir for: %s' % self.predict_output_path)
            prepare_dir(self.predict_output_path,
                        False,
                        allow_overwrite=self.params.force
                        or self.mode == 'philly')
        self.predict_fields = self.get_item(
            ['outputs', 'predict_fields'],
            default=DefaultPredictionFields[ProblemTypes[self.problem_type]])

        self.model_save_path = os.path.join(
            self.save_base_dir, self.get_item(['outputs', 'model_name']))

        # INPUTS
        if hasattr(self.params,
                   'train_data_path') and self.params.train_data_path:
            self.train_data_path = self.params.train_data_path
        else:
            if self.mode == 'normal':
                self.train_data_path = self.get_item(
                    ['inputs', 'data_paths', 'train_data_path'],
                    default=None,
                    use_default=True)
            else:
                self.train_data_path = None
        if hasattr(self.params,
                   'valid_data_path') and self.params.valid_data_path:
            self.valid_data_path = self.params.valid_data_path
        else:
            if self.mode == 'normal':
                self.valid_data_path = self.get_item(
                    ['inputs', 'data_paths', 'valid_data_path'],
                    default=None,
                    use_default=True)
            else:
                self.valid_data_path = None
        if hasattr(self.params,
                   'test_data_path') and self.params.test_data_path:
            self.test_data_path = self.params.test_data_path
        else:
            if self.mode == 'normal':
                self.test_data_path = self.get_item(
                    ['inputs', 'data_paths', 'test_data_path'],
                    default=None,
                    use_default=True)
            else:
                self.test_data_path = None

        if self.phase == 'predict':
            if self.params.predict_data_path:
                self.predict_data_path = self.params.predict_data_path
            else:
                if self.mode == 'normal':
                    self.predict_data_path = self.get_item(
                        ['inputs', 'data_paths', 'predict_data_path'],
                        default=None,
                        use_default=True)
                else:
                    self.predict_data_path = None

        if self.phase == 'train' or self.phase == 'cache':
            if self.valid_data_path is None and self.test_data_path is not None:
                # We support test_data_path being None. If someone sets valid_data_path to None
                # while test_data_path is not None, swap valid_data_path and test_data_path.
                self.valid_data_path = self.test_data_path
                self.test_data_path = None
        elif self.phase == 'predict':
            if self.predict_data_path is None and self.test_data_path is not None:
                self.predict_data_path = self.test_data_path
                self.test_data_path = None

        if self.phase == 'train' or self.phase == 'test' or self.phase == 'cache':
            self.file_columns = self.get_item(['inputs', 'file_header'])
        else:
            self.file_columns = self.get_item(['inputs', 'file_header'],
                                              default=None,
                                              use_default=True)

        if self.phase == 'predict':
            if self.file_columns is None:
                self.predict_file_columns = self.get_item(
                    ['inputs', 'predict_file_header'])
            else:
                self.predict_file_columns = self.get_item(
                    ['inputs', 'predict_file_header'],
                    default=None,
                    use_default=True)
                if self.predict_file_columns is None:
                    self.predict_file_columns = self.file_columns

        if self.phase != 'predict':
            if self.phase == 'cache':
                self.answer_column_name = self.get_item(['inputs', 'target'],
                                                        default=None,
                                                        use_default=True)
            else:
                self.answer_column_name = self.get_item(['inputs', 'target'])
        self.input_types = self.get_item(['architecture', 0, 'conf'])
        # add extra feature
        feature_all = set([_.lower() for _ in self.input_types.keys()])
        formal_feature = set(['word', 'char'])
        self.extra_feature = len(feature_all - formal_feature) != 0

        # add char embedding config
        # char_emb_type = None
        # char_emb_type_cols = None
        # for single_type in self.input_types:
        #     if single_type.lower() == 'char':
        #         char_emb_type = single_type
        #         char_emb_type_cols = [single_col.lower() for single_col in self.input_types[single_type]['cols']]
        #         break
        self.object_inputs = self.get_item(['inputs', 'model_inputs'])
        # if char_emb_type and char_emb_type_cols:
        #     for single_input in self.object_inputs:
        #         for single_col in char_emb_type_cols:
        #             if single_input.lower() in single_col:
        #                 self.object_inputs[single_input].append(single_col)

        self.object_inputs_names = [name for name in self.object_inputs]

        # vocabulary setting
        self.max_vocabulary = self.get_item(
            ['training_params', 'vocabulary', 'max_vocabulary'],
            default=800000,
            use_default=True)
        self.min_word_frequency = self.get_item(
            ['training_params', 'vocabulary', 'min_word_frequency'],
            default=3,
            use_default=True)

        # file column header setting
        self.file_with_col_header = self.get_item(
            ['inputs', 'file_with_col_header'],
            default=False,
            use_default=True)

        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            self.add_start_end_for_seq = self.get_item(
                ['inputs', 'add_start_end_for_seq'], default=True)
        else:
            self.add_start_end_for_seq = self.get_item(
                ['inputs', 'add_start_end_for_seq'], default=False)

        if hasattr(self.params,
                   'pretrained_emb_path') and self.params.pretrained_emb_path:
            self.pretrained_emb_path = self.params.pretrained_emb_path
        else:
            if self.mode == 'normal':
                self.pretrained_emb_path = self.get_item(
                    ['inputs', 'data_paths', 'pre_trained_emb'],
                    default=None,
                    use_default=True)
            else:
                self.pretrained_emb_path = None

        if 'word' in self.get_item(['architecture', 0, 'conf'
                                    ]) and self.pretrained_emb_path:
            if hasattr(self.params, 'involve_all_words_in_pretrained_emb'
                       ) and self.params.involve_all_words_in_pretrained_emb:
                self.involve_all_words_in_pretrained_emb = self.params.involve_all_words_in_pretrained_emb
            else:
                self.involve_all_words_in_pretrained_emb = self.get_item(
                    ['inputs', 'involve_all_words_in_pretrained_emb'],
                    default=False)
            if hasattr(
                    self.params,
                    'pretrained_emb_type') and self.params.pretrained_emb_type:
                self.pretrained_emb_type = self.params.pretrained_emb_type
            else:
                self.pretrained_emb_type = self.get_item(
                    ['inputs', 'pretrained_emb_type'], default='glove')
            if hasattr(self.params, 'pretrained_emb_binary_or_text'
                       ) and self.params.pretrained_emb_binary_or_text:
                self.pretrained_emb_binary_or_text = self.params.pretrained_emb_binary_or_text
            else:
                self.pretrained_emb_binary_or_text = self.get_item(
                    ['inputs', 'pretrained_emb_binary_or_text'],
                    default='text')
            self.pretrained_emb_dim = self.get_item(
                ['architecture', 0, 'conf', 'word', 'dim'])
        else:
            self.pretrained_emb_path = None
            self.involve_all_words_in_pretrained_emb = None
            self.pretrained_emb_binary_or_text = None
            self.pretrained_emb_dim = None
            self.pretrained_emb_type = None

        if self.phase == 'train':
            if hasattr(self.params, 'cache_dir') and self.params.cache_dir:
                # for aether
                self.cache_dir = self.params.cache_dir
            else:
                if self.mode == 'normal':
                    if self.use_cache:
                        self.cache_dir = self.get_item(
                            ['outputs', 'cache_dir'])
                    else:
                        self.cache_dir = os.path.join(
                            tempfile.gettempdir(), 'neuron_blocks', ''.join(
                                random.sample(
                                    string.ascii_letters + string.digits, 16)))
                else:
                    # for philly mode, we can only save files in model_path or scratch_path
                    self.cache_dir = os.path.join(self.save_base_dir, 'cache')

            self.problem_path = os.path.join(self.cache_dir, 'problem.pkl')
            if self.pretrained_emb_path is not None:
                self.emb_pkl_path = os.path.join(self.cache_dir, 'emb.pkl')
            else:
                self.emb_pkl_path = None
        else:
            tmp_problem_path = os.path.join(self.save_base_dir,
                                            '.necessary_cache', 'problem.pkl')
            self.problem_path = tmp_problem_path if os.path.isfile(
                tmp_problem_path) else os.path.join(
                    self.save_base_dir, 'necessary_cache', 'problem.pkl')

        # training params
        self.training_params = self.get_item(['training_params'])

        if self.phase == 'train':
            self.optimizer_name = self.get_item(
                ['training_params', 'optimizer', 'name'])
            self.optimizer_params = self.get_item(
                ['training_params', 'optimizer', 'params'])
            self.clip_grad_norm_max_norm = self.get_item(
                ['training_params', 'clip_grad_norm_max_norm'], default=5)

            if hasattr(self.params,
                       'learning_rate') and self.params.learning_rate:
                self.optimizer_params['lr'] = self.params.learning_rate

        if hasattr(self.params, 'batch_size') and self.params.batch_size:
            self.batch_size_each_gpu = self.params.batch_size
        else:
            self.batch_size_each_gpu = self.get_item([
                'training_params', 'batch_size'
            ])  #the batch_size in conf file is the batch_size on each GPU
        self.lr_decay = self.get_item(['training_params', 'lr_decay'],
                                      default=1)  # by default, no decay
        self.minimum_lr = self.get_item(['training_params', 'minimum_lr'],
                                        default=0)
        self.epoch_start_lr_decay = self.get_item(
            ['training_params', 'epoch_start_lr_decay'], default=1)
        if hasattr(self.params, 'max_epoch') and self.params.max_epoch:
            self.max_epoch = self.params.max_epoch
        else:
            self.max_epoch = self.get_item(['training_params', 'max_epoch'],
                                           default=float('inf'))
        self.valid_times_per_epoch = self.get_item(
            ['training_params', 'valid_times_per_epoch'], default=1)
        self.batch_num_to_show_results = self.get_item(
            ['training_params', 'batch_num_to_show_results'], default=10)
        self.max_lengths = self.get_item(['training_params', 'max_lengths'],
                                         default=None,
                                         use_default=True)
        self.fixed_lengths = self.get_item(
            ['training_params', 'fixed_lengths'],
            default=None,
            use_default=True)
        if self.fixed_lengths:
            self.max_lengths = None

        if torch.cuda.device_count() > 1:
            self.batch_size_total = torch.cuda.device_count(
            ) * self.training_params['batch_size']
            self.batch_num_to_show_results = self.batch_num_to_show_results // torch.cuda.device_count(
            )
        else:
            self.batch_size_total = self.batch_size_each_gpu

        self.cpu_num_workers = self.get_item(
            ['training_params', 'cpu_num_workers'],
            default=-1)  # by default, use all the CPU cores available

        # text preprocessing
        self.__text_preprocessing = self.get_item(
            ['training_params', 'text_preprocessing'], default=list())
        self.DBC2SBC = True if 'DBC2SBC' in self.__text_preprocessing else False
        self.unicode_fix = True if 'unicode_fix' in self.__text_preprocessing else False
        self.remove_stopwords = True if 'remove_stopwords' in self.__text_preprocessing else False

        # tokenizer
        if self.language == 'chinese':
            self.tokenizer = self.get_item(['training_params', 'tokenizer'],
                                           default='jieba')
        else:
            self.tokenizer = self.get_item(['training_params', 'tokenizer'],
                                           default='nltk')

        if self.extra_feature:
            if self.DBC2SBC:
                logging.warning(
                    "Detected the extra feature %s; setting DBC2SBC to False." %
                    ''.join(list(feature_all - formal_feature)))
            if self.unicode_fix:
                logging.warning(
                    "Detected the extra feature %s; setting unicode_fix to False."
                    % ''.join(list(feature_all - formal_feature)))
            if self.remove_stopwords:
                logging.warning(
                    "Detected the extra feature %s; setting remove_stopwords to False."
                    % ''.join(list(feature_all - formal_feature)))

        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            if self.unicode_fix:
                logging.warning(
                    'For sequence tagging task, unicode_fix may change the number of words.'
                )
            if self.remove_stopwords:
                self.remove_stopwords = False
                logging.warning(
                    'For sequence tagging tasks, removing stopwords is forbidden! It is disabled now.'
                )

        if self.phase != 'cache':
            if torch.cuda.is_available(
            ) and torch.cuda.device_count() > 0 and self.training_params.get(
                    'use_gpu', True):
                self.use_gpu = True
                logging.info(
                    "Activating GPU mode, there are %d GPUs available" %
                    torch.cuda.device_count())
            else:
                self.use_gpu = False
                logging.info("Activating CPU mode")

        self.architecture = self.get_item(['architecture'])
        self.output_layer_id = []
        for single_layer in self.architecture:
            if 'output_layer_flag' in single_layer and single_layer[
                    'output_layer_flag']:
                self.output_layer_id.append(single_layer['layer_id'])

        # check CNN layer & change min sentence length
        cnn_rele_layers = ['Conv', 'ConvPooling']
        self.min_sentence_len = 0
        for layer_index, single_layer in enumerate(self.architecture):
            if layer_index == 0:
                continue
            if single_layer['layer'] in cnn_rele_layers:
                # get window_size conf: type maybe int or list
                for single_conf, single_conf_value in single_layer[
                        'conf'].items():
                    if 'window' in single_conf.lower():
                        self.min_sentence_len = max(
                            self.min_sentence_len,
                            np.max(np.array([single_conf_value])))
                        break

        if self.phase == 'train' or self.phase == 'test':
            self.loss = BaseLossConf.get_conf(**self.get_item(['loss']))
            self.metrics = self.get_item(['metrics'])
            if 'auc' in self.metrics and ProblemTypes[
                    self.problem_type] == ProblemTypes.classification:
                self.pos_label = self.get_item(['inputs', 'positive_label'],
                                               default=None,
                                               use_default=True)
Example #26
 def raise_configuration_error(self, key):
     raise ConfigurationError(
         "The configuration file %s is illegal. the item [%s] is not found."
         % (self.conf_path, key))