Example 1
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a composition model and sets its learned parameters.

        Args:
            train_data: list of string tuples. Each tuple contains 3
            string elements: (arg1, arg2, phrase).

            arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, the arg1 elements of the train data
            are interpreted in the first space and the arg2 elements in the
            second.

            phrase_space: phrase space, of type Space.

        Calls the specific training routine of the current composition
        model. Training tuples that contain strings not found in their
        respective spaces are ignored.

        The id2column attribute of the resulting composed space is set to
        be equal to that of the phrase space given as input.
        """

        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(train_data,
                                                                     (arg1_space.row2id,
                                                                      arg2_space.row2id,
                                                                      phrase_space.row2id)
                                                                     )


        self._train(arg1_space, arg2_space, phrase_space,
                    arg1_list, arg2_list, phrase_list)

        self.composed_id2column = phrase_space.id2column

        log.print_composition_model_info(logger, self, 1, "\nTrained composition model:")
        log.print_info(logger, 2, "With total data points: %s" % len(arg1_list))
        log.print_matrix_info(logger, arg1_space.cooccurrence_matrix, 3,
                              "Semantic space of argument 1:")
        log.print_matrix_info(logger, arg2_space.cooccurrence_matrix, 3,
                              "Semantic space of argument 2:")
        log.print_matrix_info(logger, phrase_space.cooccurrence_matrix, 3,
                              "Semantic space of phrases:")
        log.print_time_info(logger, time.time(), start, 2)
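
A minimal usage sketch of this train method, assuming the DISSECT (composes) toolkit: WeightedAdditive is one concrete composition model, and the space files, paths, and training tuples below are illustrative placeholders, not part of the code above.

from composes.composition.weighted_additive import WeightedAdditive
from composes.utils import io_utils

# Hypothetical argument and phrase spaces, saved elsewhere with io_utils.save.
arg_space = io_utils.load("word_space.pkl")
phrase_space = io_utils.load("phrase_space.pkl")

# Each tuple is (arg1, arg2, phrase); strings missing from the rows of the
# respective spaces are silently skipped by train().
train_data = [("red", "car", "red_car"),
              ("fast", "train", "fast_train")]

model = WeightedAdditive()
model.train(train_data, arg_space, phrase_space)

# The learned parameters can then be used to compose unseen phrases.
composed_space = model.compose([("blue", "car", "blue_car")], arg_space)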
Example 2
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a lexical function composition model to learn a function
        space and sets the function_space parameter. 
                
        Args:
            train_data: list of string tuples. Each tuple contains 3 
            string elements: (function_word, arg, phrase).
            
            arg_space: argument space, of type Space. arg elements of 
            train data are interpreted in this space.
        
            phrase_space: phrase space, of type Space. phrase elements of
            the train data are interpreted in this space.
            
        Training tuples that contain strings not found in their
        respective spaces are ignored. Function words with fewer than
        _MIN_SAMPLES training instances are ignored. For example, if
        _MIN_SAMPLES=2 and the function word "red" occurs in only one phrase,
        "red" is ignored.
        
        The id2column attribute of the resulting composed space is set to
        be equal to that of the phrase space given as input.
        """
        
        start = time.time()

        self._has_intercept = self._regression_learner.has_intercept()

        if not isinstance(arg_space, Space):
            raise ValueError("expected a single input space!")
               
        result_mats = []
               
        train_data = sorted(train_data, key=lambda tup: tup[0])
        function_word_list, arg_list, phrase_list = self.valid_data_to_lists(train_data,
                                                                             (None,
                                                                              arg_space.row2id,
                                                                              phrase_space.row2id))
        #partitions the sorted input data
        keys, key_ranges = get_partitions(function_word_list, self._MIN_SAMPLES)
        
        if not keys:
            raise ValueError("No valid training data found!")
                
        assert(len(arg_space.element_shape) == 1)
        
        if self._has_intercept:
            new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0] + 1,)
        else:
            new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0],)
            
        for i in range(len(key_ranges)):

            idx_beg, idx_end = key_ranges[i]

            print("Training lexical function...%s with %d samples"
                  % (keys[i], idx_end - idx_beg))
                            
            arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end]) 
            phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])
 
            #convert them to the same type
            matrix_type = get_type_of_largest([arg_mat, phrase_mat])
            [arg_mat, phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat],
                                                          matrix_type)

            result_mat = self._regression_learner.train(arg_mat, phrase_mat).transpose()
            
            result_mat.reshape((1, np.prod(new_element_shape)))
            
            result_mats.append(result_mat)

        new_space_mat = arg_mat.nary_vstack(result_mats)
        
        self.composed_id2column = phrase_space.id2column
            
        self._function_space = Space(new_space_mat, keys, [], 
                                     element_shape=new_element_shape)
        
        log.print_composition_model_info(logger, self, 1, "\nTrained composition model:")
        log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
        log.print_info(logger, 3, "With total data points: %s" % len(function_word_list))
        log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3, 
                              "Semantic space of arguments:")
        log.print_info(logger, 3, "Shape of lexical functions learned: %s"
                       % (new_element_shape,))
        log.print_matrix_info(logger, new_space_mat, 3, 
                              "Semantic space of lexical functions:")
        log.print_time_info(logger, time.time(), start, 2)
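
A similar hedged sketch for the lexical function model above: one matrix is learned per function word (e.g. per adjective) via the regression learner. It assumes the DISSECT (composes) toolkit; the spaces, file names, and the learner keyword argument are assumptions made for illustration.

from composes.composition.lexical_function import LexicalFunction
from composes.utils import io_utils
from composes.utils.regression_learner import RidgeRegressionLearner

noun_space = io_utils.load("noun_space.pkl")      # argument space
an_space = io_utils.load("adj_noun_space.pkl")    # phrase space

# (function_word, arg, phrase); function words with fewer than _MIN_SAMPLES
# phrases are dropped by the partitioning step in train().
train_data = [("red", "car", "red_car"),
              ("red", "book", "red_book"),
              ("old", "man", "old_man"),
              ("old", "house", "old_house")]

# Ridge regression is one learner choice; the default least-squares learner
# also works (the 'learner' constructor argument is assumed here).
model = LexicalFunction(learner=RidgeRegressionLearner())
model.train(train_data, noun_space, an_space)

# Each row of the learned function space is one flattened function-word matrix;
# composition multiplies it with the argument vector.
composed_space = model.compose([("red", "house", "red_house")], noun_space)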