Esempio n. 1
0
    def apply(self, transformation):
        """
        Apply a transformation to this space and return the result.

        Every transformation acts on the co-occurrence matrix. Operations
        that shrink the number of columns also rebuild the column indexing
        structures. The operation performed is recorded in the operation
        list carried by the resulting space.

        Args:
            transformation: of type Scaling, DimensionalityReduction or
              FeatureSelection

        Returns:
            A new space on which the transformation has been applied.

        """
        start = time.time()
        # TODO , FeatureSelection, DimReduction ..
        assert_is_instance(
            transformation,
            (Scaling, DimensionalityReduction, FeatureSelection))

        operation = transformation.create_operation()
        transformed_matrix = operation.apply(self.cooccurrence_matrix)

        updated_operations = list(self.operations)
        updated_operations.append(operation)

        # Row indexing is never touched by a transformation; copy it as-is.
        new_id2row = list(self.id2row)
        new_row2id = self.row2id.copy()

        if isinstance(operation, DimensionalityReductionOperation):
            self.assert_1dim_element()
            # Reduced dimensions no longer correspond to original columns.
            new_id2column, new_column2id = [], {}
        elif isinstance(operation, FeatureSelectionOperation):
            self.assert_1dim_element()
            operation.original_columns = self.id2column

            if operation.original_columns:
                # Keep only the labels of the selected columns.
                new_id2column = list(
                    array(operation.original_columns)[operation.selected_columns])
                new_column2id = list2dict(new_id2column)
            else:
                new_id2column, new_column2id = [], {}
        else:
            # Scaling preserves the column structure.
            new_id2column = list(self.id2column)
            new_column2id = self.column2id.copy()

        log.print_transformation_info(logger, transformation, 1,
                                      "\nApplied transformation:")
        log.print_matrix_info(logger, self.cooccurrence_matrix, 2,
                              "Original semantic space:")
        log.print_matrix_info(logger, transformed_matrix, 2,
                              "Resulted semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(transformed_matrix,
                     new_id2row,
                     new_id2column,
                     new_row2id,
                     new_column2id,
                     operations=updated_operations)
Esempio n. 2
0
    def svd(matrix_, reduced_dimension):
        """
        Perform an SVD decomposition of the input matrix.

        When the matrix rank is lower than the requested reduced dimension,
        the reduction stops at the rank. Dense SVD relies on Linalg._SVD_TOL
        to estimate the rank of the matrix.

        Args:
           matrix_: input of type Matrix
           reduced_dimension: int, the desired reduced dimension

        Returns:
            U,S,V of the decomposition X = USV^T. U, V: Matrix type,
            S: ndarray of singular values.

        """
        log.print_info(logger, 4, "In SVD..reducing to dim %d" % reduced_dimension)
        log.print_matrix_info(logger, matrix_, 5, "Input matrix:")

        # TODO: IMPORTANT!! do the sign normalization COLUMN-wise!!!not
        # for the full matrix at once!!
        if reduced_dimension == 0:
            raise ValueError("Cannot reduce to dimensionality 0.")

        # Dispatch on the concrete matrix representation.
        if isinstance(matrix_, SparseMatrix):
            decomposition = Linalg._sparse_svd(matrix_, reduced_dimension)
        elif isinstance(matrix_, DenseMatrix):
            decomposition = Linalg._dense_svd(matrix_, reduced_dimension)
        else:
            raise TypeError("expected Matrix type, received %s" % type(matrix_))

        log.print_matrix_info(logger, decomposition[0], 5, "Resulting matrix U:")
        return decomposition
Esempio n. 3
0
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
                strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
                elements to be composed and composed_phrase is the string
                associated to their composition.

            arg_space: argument space(s). Space object or a tuple of two
                Space objects (e.g. my_space, or (my_space1, my_space2)).
                If two spaces are provided, arg1 elements of data are
                interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        Raises:
            ValueError: if no valid data tuples remain after filtering.

        """
        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data,
            (arg1_space.row2id, arg2_space.row2id, None))

        # We try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory
        # overhead; the /3.0 is needed because composing needs
        # 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector).
        chunk_size = int(max(arg1_space.cooccurrence_matrix.shape[0],
                             arg2_space.cooccurrence_matrix.shape[0],
                             len(phrase_list))
                         * self.MAX_MEM_OVERHEAD / 3.0) + 1

        composed_mats = []
        for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
            beg, end = i * chunk_size, min((i + 1) * chunk_size, len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])

            # Composition is defined on dense matrices.
            [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                         DenseMatrix)
            composed_mats.append(self._compose(arg1_mat, arg2_mat))

        # BUG FIX: the original code referenced the loop variable after the
        # loop, raising an opaque NameError when no valid data remained.
        if not composed_mats:
            raise ValueError("No valid data to compose.")

        composed_phrase_mat = composed_mats[0].nary_vstack(composed_mats)

        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(arg1_space,
                                                            arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        # BUG FIX: previously this logged the size of the *last* chunk
        # (arg1_mat.shape[0]); report the true total, as the sibling
        # implementation does.
        log.print_info(logger, 3,
                       "Composed total data points:%s" % len(arg1_list))
        log.print_matrix_info(logger, composed_phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
    def compose(self, data, arg_space):
        """
        Uses a lexical function composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
                strings: (function_word, arg, composed_phrase). function_word
                and arg are the elements to be composed and composed_phrase
                is the string associated to their composition. function_word
                elements are interpreted in self.function_space.

            arg_space: argument space, of type Space. arg elements of data
                are interpreted in this space.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        assert_is_instance(arg_space, Space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data, (self._function_space.row2id, arg_space.row2id, None))

        composed_vec_list = []
        for function_word, argument in zip(arg1_list, arg2_list):
            function_vec = self._function_space.get_row(function_word)
            argument_vec = arg_space.get_row(argument)

            # Promote both vectors to the type of the larger one.
            target_type = get_type_of_largest([function_vec, argument_vec])
            [function_vec, argument_vec] = resolve_type_conflict(
                [function_vec, argument_vec], target_type)

            composed_ph_vec = self._compose(function_vec, argument_vec,
                                            self._function_space.element_shape)
            composed_vec_list.append(composed_ph_vec)

        # Composition consumes the last axis of the function element shape.
        result_element_shape = self._function_space.element_shape[0:-1]
        composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % len(arg1_list))
        log.print_info(
            logger, 3,
            "Functional shape of the resulted (composed) elements:%s" %
            (result_element_shape, ))
        log.print_matrix_info(logger, composed_ph_mat, 4,
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_ph_mat,
                     phrase_list,
                     self.composed_id2column,
                     element_shape=result_element_shape)
Esempio n. 5
0
    def apply(self, transformation):
        """
        Return a new space obtained by applying a transformation.

        All transformations act on the data matrix. If the transformation
        reduces the dimensionality of the space, the column indexing
        structures are rebuilt as well. The operation applied is appended
        to the list of operations that the space holds.

        Args:
            transformation: of type Scaling, DimensionalityReduction or
              FeatureSelection

        Returns:
            A new space on which the transformation has been applied.

        """
        start = time.time()
        # TODO , FeatureSelection, DimReduction ..
        assert_is_instance(transformation, (Scaling, DimensionalityReduction,
                                            FeatureSelection))
        op = transformation.create_operation()
        result_matrix = op.apply(self.cooccurrence_matrix)

        ops_so_far = list(self.operations)
        ops_so_far.append(op)

        # Rows are unaffected; only column structures may change below.
        row_index, row_lookup = list(self.id2row), self.row2id.copy()

        if isinstance(op, DimensionalityReductionOperation):
            self.assert_1dim_element()
            col_index, col_lookup = [], {}
        elif isinstance(op, FeatureSelectionOperation):
            self.assert_1dim_element()
            op.original_columns = self.id2column

            if op.original_columns:
                # Select the surviving column labels via fancy indexing.
                col_index = list(array(op.original_columns)[op.selected_columns])
                col_lookup = list2dict(col_index)
            else:
                col_index, col_lookup = [], {}
        else:
            col_index, col_lookup = list(self.id2column), self.column2id.copy()

        log.print_transformation_info(logger, transformation, 1,
                                      "\nApplied transformation:")
        log.print_matrix_info(logger, self.cooccurrence_matrix, 2,
                              "Original semantic space:")
        log.print_matrix_info(logger, result_matrix, 2, "Resulted semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(result_matrix, row_index, col_index,
                     row_lookup, col_lookup, operations=ops_so_far)
Esempio n. 6
0
    def compose(self, data, arg_space):
        """
        Uses a lexical function composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
                strings: (function_word, arg, composed_phrase). function_word
                and arg are the elements to be composed and composed_phrase is
                the string associated to their composition. function_word
                elements are interpreted in self.function_space.

            arg_space: argument space, of type Space. arg elements of data are
                interpreted in this space.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        assert_is_instance(arg_space, Space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data,
            (self._function_space.row2id, arg_space.row2id, None))

        composed_vec_list = []
        # BUG FIX: replaced Python-2-only xrange with range, consistent with
        # the rest of the codebase.
        for i in range(len(arg1_list)):
            arg1_vec = self._function_space.get_row(arg1_list[i])
            arg2_vec = arg_space.get_row(arg2_list[i])

            # Promote both vectors to the type of the larger one.
            matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
            [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                         matrix_type)

            composed_ph_vec = self._compose(arg1_vec, arg2_vec,
                                            self._function_space.element_shape)
            composed_vec_list.append(composed_ph_vec)

        # Composition consumes the last axis of the function element shape.
        result_element_shape = self._function_space.element_shape[0:-1]
        # NOTE(review): as in the original, composed_ph_vec is unbound when no
        # valid data remains — confirm valid_data_to_lists never returns empty.
        composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3, "Composed total data points:%s" % len(arg1_list))
        log.print_info(logger, 3, "Functional shape of the resulted (composed) elements:%s"
                       % (result_element_shape,))
        log.print_matrix_info(logger, composed_ph_mat, 4,
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_ph_mat, phrase_list, self.composed_id2column,
                     element_shape=result_element_shape)
Esempio n. 7
0
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
                strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
                elements to be composed and composed_phrase is the string
                associated to their composition.

            arg_space: argument space(s). Space object or a tuple of two
                Space objects (e.g. my_space, or (my_space1, my_space2)).
                If two spaces are provided, arg1 elements of data are
                interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        space_one, space_two = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data, (space_one.row2id, space_two.row2id, None))

        mat_one = space_one.get_rows(arg1_list)
        mat_two = space_two.get_rows(arg2_list)

        # Composition operates on dense matrices.
        [mat_one, mat_two] = resolve_type_conflict([mat_one, mat_two],
                                                   DenseMatrix)

        phrase_mat = self._compose(mat_one, mat_two)

        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(space_one,
                                                            space_two)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % mat_one.shape[0])
        log.print_matrix_info(logger, phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(phrase_mat, phrase_list, self.composed_id2column)
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
                strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
                elements to be composed and composed_phrase is the string
                associated to their composition.

            arg_space: argument space(s). Space object or a tuple of two
                Space objects (e.g. my_space, or (my_space1, my_space2)).
                If two spaces are provided, arg1 elements of data are
                interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        row_maps = (arg1_space.row2id, arg2_space.row2id, None)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data,
                                                                     row_maps)

        left_mat = arg1_space.get_rows(arg1_list)
        right_mat = arg2_space.get_rows(arg2_list)

        # Composition operates on dense matrices.
        [left_mat, right_mat] = resolve_type_conflict([left_mat, right_mat],
                                                      DenseMatrix)

        composed_phrase_mat = self._compose(left_mat, right_mat)
        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(arg1_space,
                                                            arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % left_mat.shape[0])
        log.print_matrix_info(logger, composed_phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
Esempio n. 9
0
 def export(self, file_prefix, **kwargs):
     """
     Exports the current space to disk.
     If the space has no column information, it cannot be exported in
     sparse format (sm).

     Args:
         file_prefix: string, prefix of the files to be exported
         format: string, one of dm/sm. Defaults to "dm".

     Prints:
         - matrix in file_prefix.<format>
         - row elements in file_prefix.<row>
         - col elements in file_prefix.<col>

     Raises:
         ValueError: if the format is unrecognized, or if the space has no
             column info and "sm" exporting is attempted
         NotImplementedError: the space matrix is dense and "sm" exporting
             is attempted

     """

     start = time.time()
     create_parent_directories(file_prefix)

     format_ = kwargs.get("format", "dm")
     if not format_ in ["dm", "sm"]:
         raise ValueError("Unrecognized format: %s" % format_)

     # BUG FIX: previously the matrix was only written when an explicit
     # "format" keyword was passed; the documented default of "dm"
     # silently exported nothing but the row/column files.
     if format_ == "dm":
         print_cooc_mat_dense_format(self.cooccurrence_matrix,
                                     self.id2row, file_prefix)
     else:
         print_cooc_mat_sparse_format(self.cooccurrence_matrix,
                                      self.id2row,
                                      self.id2column, file_prefix)
     self._export_row_column(file_prefix)

     log.print_matrix_info(logger, self.cooccurrence_matrix, 1,
                           "Printed semantic space:")
     log.print_time_info(logger, time.time(), start, 2)
Esempio n. 10
0
    def export(self, file_prefix, **kwargs):
        """
        Exports the current space to disk.
        If the space has no column information, it cannot be exported in
        sparse format (sm).

        Args:
            file_prefix: string, prefix of the files to be exported
            format: string, one of dm/sm. Defaults to "dm".

        Prints:
            - matrix in file_prefix.<format>
            - row elements in file_prefix.<row>
            - col elements in file_prefix.<col>

        Raises:
            ValueError: if the format is unrecognized, or if the space has
                no column info and "sm" exporting is attempted
            NotImplementedError: the space matrix is dense and "sm" exporting
                is attempted

        """

        start = time.time()
        create_parent_directories(file_prefix)

        format_ = kwargs.get("format", "dm")
        if not format_ in ["dm", "sm"]:
            raise ValueError("Unrecognized format: %s" % format_)

        # BUG FIX: previously the matrix was only written when an explicit
        # "format" keyword was passed; the documented default of "dm"
        # silently exported nothing but the row/column files.
        if format_ == "dm":
            print_cooc_mat_dense_format(self.cooccurrence_matrix,
                                        self.id2row, file_prefix)
        else:
            print_cooc_mat_sparse_format(self.cooccurrence_matrix,
                                         self.id2row, self.id2column,
                                         file_prefix)
        self._export_row_column(file_prefix)

        log.print_matrix_info(logger, self.cooccurrence_matrix, 1,
                              "Printed semantic space:")
        log.print_time_info(logger, time.time(), start, 2)
Esempio n. 11
0
    def svd(matrix_, reduced_dimension):
        """
        Performs SVD decomposition.

        If the rank is smaller than the requested reduced dimension,
        reduction to rank is performed. Dense SVD uses Linalg._SVD_TOL to
        decide the rank of the matrix.

        Args:
           matrix_: input of type Matrix
           reduced_dimension: int, the desired reduced dimension

        Returns:
            U,S,V of the decomposition X = USV^T. U, V: Matrix type,
            S: ndarray of singular values.

        """
        log.print_info(logger, 4,
                       "In SVD..reducing to dim %d" % reduced_dimension)
        log.print_matrix_info(logger, matrix_, 5, "Input matrix:")

        # TODO: IMPORTANT!! do the sign normalization COLUMN-wise!!!not
        # for the full matrix at once!!
        if reduced_dimension == 0:
            raise ValueError("Cannot reduce to dimensionality 0.")

        # Pick the SVD routine matching the matrix representation.
        if isinstance(matrix_, SparseMatrix):
            u_s_v = Linalg._sparse_svd(matrix_, reduced_dimension)
        elif isinstance(matrix_, DenseMatrix):
            u_s_v = Linalg._dense_svd(matrix_, reduced_dimension)
        else:
            raise TypeError("expected Matrix type, received %s" %
                            type(matrix_))

        log.print_matrix_info(logger, u_s_v[0], 5, "Resulting matrix U:")
        return u_s_v
Esempio n. 12
0
    def vstack(cls, space1, space2):
        """
        Classmethod. Stacks two semantic spaces.

        The rows of the second space are appended below those of the first.

        Args:
            space1, space2: spaces to be stacked, of type Space

        Returns:
            Stacked space, type Space.

        Raises:
            ValueError: if the spaces have different number of columns
                        or their columns are not identical

        """
        cols1 = space1.cooccurrence_matrix.shape[1]
        cols2 = space2.cooccurrence_matrix.shape[1]
        if cols1 != cols2:
            raise ValueError("Inconsistent shapes: %s, %s" % (cols1, cols2))

        if space1.id2column != space2.id2column:
            raise ValueError("Identical columns required")

        stacked_row2id = add_items_to_dict(space1.row2id.copy(), space2.id2row)
        stacked_id2row = space1.id2row + space2.id2row

        # Promote both matrices to the type of the larger one before stacking.
        target_type = get_type_of_largest(
            [space1.cooccurrence_matrix, space2.cooccurrence_matrix])
        [top_mat, bottom_mat] = resolve_type_conflict(
            [space1.cooccurrence_matrix, space2.cooccurrence_matrix],
            target_type)

        stacked_mat = top_mat.vstack(bottom_mat)

        log.print_info(logger, 1, "\nVertical stack of two spaces")
        log.print_matrix_info(logger, space1.cooccurrence_matrix, 2,
                              "Semantic space 1:")
        log.print_matrix_info(logger, space2.cooccurrence_matrix, 2,
                              "Semantic space 2:")
        log.print_matrix_info(logger, stacked_mat, 2, "Resulted semantic space:")

        return Space(stacked_mat,
                     stacked_id2row,
                     list(space1.id2column),
                     stacked_row2id,
                     space1.column2id.copy(),
                     operations=[])
Esempio n. 13
0
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a composition model and sets its learned parameters.

        Args:
            train_data: list of string tuples. Each tuple contains 3
                string elements: (arg1, arg2, phrase).

            arg_space: argument space(s). Space object or a tuple of two
                Space objects (e.g. my_space, or (my_space1, my_space2)).
                If two spaces are provided, arg1 elements of train data are
                interpreted in space1, and arg2 in space2.

            phrase space: phrase space, of type Space.

        Calls the specific training routine of the current composition
        model. Training tuples which contain strings not found in their
        respective spaces are ignored.

        The id2column attribute of the resulted composed space is set to
        be equal to that of the phrase space given as an input.
        """

        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        # Silently drop tuples whose elements are missing from their space.
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            train_data,
            (arg1_space.row2id, arg2_space.row2id, phrase_space.row2id))

        # Delegate to the model-specific training routine.
        self._train(arg1_space, arg2_space, phrase_space,
                    arg1_list, arg2_list, phrase_list)

        self.composed_id2column = phrase_space.id2column

        log.print_composition_model_info(logger, self, 1,
                                         "\nTrained composition model:")
        log.print_info(logger, 2, "With total data points:%s" % len(arg1_list))
        log.print_matrix_info(logger, arg1_space.cooccurrence_matrix, 3,
                              "Semantic space of argument 1:")
        log.print_matrix_info(logger, arg2_space.cooccurrence_matrix, 3,
                              "Semantic space of argument 2:")
        log.print_matrix_info(logger, phrase_space.cooccurrence_matrix, 3,
                              "Semantic space of phrases:")
        log.print_time_info(logger, time.time(), start, 2)
Esempio n. 14
0
 def vstack(cls, space1, space2):
     """
     Classmethod. Stacks two semantic spaces.

     The rows in the two spaces are concatenated, space2 below space1.

     Args:
         space1, space2: spaces to be stacked, of type Space

     Returns:
         Stacked space, type Space.

     Raises:
         ValueError: if the spaces have different number of columns
                     or their columns are not identical

     """
     width1 = space1.cooccurrence_matrix.shape[1]
     width2 = space2.cooccurrence_matrix.shape[1]
     if width1 != width2:
         raise ValueError("Inconsistent shapes: %s, %s" % (width1, width2))

     if space1.id2column != space2.id2column:
         raise ValueError("Identical columns required")

     combined_row2id = add_items_to_dict(space1.row2id.copy(), space2.id2row)
     combined_id2row = space1.id2row + space2.id2row

     # Promote both matrices to the type of the larger one.
     preferred_type = get_type_of_largest([space1.cooccurrence_matrix,
                                           space2.cooccurrence_matrix])
     [upper, lower] = resolve_type_conflict([space1.cooccurrence_matrix,
                                             space2.cooccurrence_matrix],
                                            preferred_type)

     combined_mat = upper.vstack(lower)

     log.print_info(logger, 1, "\nVertical stack of two spaces")
     log.print_matrix_info(logger, space1.cooccurrence_matrix, 2,
                           "Semantic space 1:")
     log.print_matrix_info(logger, space2.cooccurrence_matrix, 2,
                           "Semantic space 2:")
     log.print_matrix_info(logger, combined_mat, 2, "Resulted semantic space:")

     return Space(combined_mat, combined_id2row, list(space1.id2column),
                  combined_row2id, space1.column2id.copy(), operations=[])
Esempio n. 15
0
    def build(cls, **kwargs):
        """
        Reads in data files and extracts the data to construct a semantic space.

        If the data is read in dense format and no columns are provided,
        the column indexing structures are set to empty.

        Args:
            data: file containing the counts
            format: format on the input data file: one of sm/dm
            rows: file containing the row elements. Optional, if not provided,
                extracted from the data file.
            cols: file containing the column elements

        Returns:
            A semantic space built from the input data files.

        Raises:
            ValueError: if one of data/format arguments is missing.
                        if cols is missing and format is "sm"
                        if the input columns provided are not consistent with
                        the shape of the matrix (for "dm" format)

        """
        start = time.time()
        # id2row/id2column double as "was this structure provided?" flags;
        # their row2id/column2id counterparts are only bound together with them.
        id2row = None
        id2column = None
        
        if "data" in kwargs:
            data_file = kwargs["data"]
        else:
            raise ValueError("Space data file needs to be specified")
            
        if "format" in kwargs:
            format_ = kwargs["format"]
            if not format_ in ["dm","sm"]:
                raise ValueError("Unrecognized format: %s" % format_)
        else:
            raise ValueError("Format of input files needs to be specified")
        
        if "rows" in kwargs and not kwargs["rows"] is None:
            [id2row], [row2id] = extract_indexing_structs(kwargs["rows"], [0])
            
        if "cols" in kwargs and not kwargs["cols"] is None:
            [id2column], [column2id] = extract_indexing_structs(kwargs["cols"], [0])
        elif format_ == "sm":
            # Sparse input cannot be read without an explicit column mapping.
            raise ValueError("Need to specify column file when input format is sm!")
            
        if format_ == "sm":
            # column2id is guaranteed bound here: the "sm" check above
            # requires a cols file.
            if id2row is None:
                [id2row], [row2id] = extract_indexing_structs(data_file, [0])
            mat = read_sparse_space_data(data_file, row2id, column2id)

        else:
            if id2row is None:
                [id2row],[row2id] = extract_indexing_structs(data_file, [0])
            mat = read_dense_space_data(data_file, row2id)
                
        if id2column and len(id2column) != mat.shape[1]:
            raise ValueError("Columns provided inconsistent with shape of input matrix!")

        # For "dm" without a cols file, column2id is first bound here;
        # the Space constructor below relies on this fallback.
        if id2column is None:
            id2column, column2id = [], {}
                            
        log.print_matrix_info(logger, mat, 1, "Built semantic space:")
        log.print_time_info(logger, time.time(), start, 2)    
        return Space(mat, id2row, id2column, row2id, column2id)
Esempio n. 16
0
    def nmf(v, w_init, h_init):
        """
        Performs Non-negative Matrix Factorization.

        It solves the problem:
        :math:`W,H = argmin(||X - WH||_2)` such that W and H are non-negative matrices.

        The factors are refined by alternating between a subproblem in W
        (with H fixed) and a subproblem in H (with W fixed), each solved by
        Linalg._nmf_nlssubprob, until the iteration or time budget runs out.

        Args:
            v: the matrix to factorize, type Matrix
            w_init: initial value for matrix W, type Matrix
            h_init: initial value for matrix H, type Matrix

        Returns:
            W, H <Matrix>: where W, H solve the NMF problem stated above.

        Raises:
            TypeError: if v is not a Matrix.
        """

        log.print_info(logger, 4, "In NMF..reducing to dim %d" % w_init.shape[1])
        log.print_matrix_info(logger, w_init, 5, "W init matrix:")
        log.print_matrix_info(logger, h_init, 5, "H init matrix:")

        if not isinstance(v, Matrix):
            raise TypeError("expected Matrix type, received %s" % type(v))
        w = w_init
        h = h_init
        init_time = time()

        # Gradients of 0.5*||V - WH||_F^2 with respect to W and H at the
        # initial point; their combined norm scales the stopping tolerances.
        wt = w.transpose()
        ht = h.transpose()
        vt = v.transpose()
        gradW = (w * (h * ht)) - (v * ht)
        gradH = ((wt * w) * h) - (wt * v)

        gradW_norm = gradW.norm()
        gradH_norm = gradH.norm()
        initgrad = sqrt(pow(gradW_norm, 2) + pow(gradH_norm, 2))

        #print 'Init gradient norm %f' % initgrad
        tolW = max(Linalg._NMF_MIN_TOL, Linalg._NMF_TOL) * initgrad
        tolH = tolW

        #loop_time = init_time
        # Starts at 1, so at most _NMF_MAX_ITER - 1 outer iterations run.
        for iteration in xrange(1, Linalg._NMF_MAX_ITER):
            log.print_info(logger, 5, "Iteration: %d(%d)" % (iteration, Linalg._NMF_MAX_ITER))

            # Wall-clock budget: stop refining once the time limit is hit.
            if time() - init_time > Linalg._NMF_TIME_LIMIT:
                break

            # The W subproblem is solved on the transposed system
            # (V^T ~ H^T W^T), so the result comes back transposed.
            w, gradW, iterW = Linalg._nmf_nlssubprob(vt, h.transpose(), h,
                                              w.transpose(), tolW,
                                              Linalg._NMF_MAX_ITER_SUBPROB)
            # old_w keeps the still-transposed solution; it is handed to the
            # H subproblem below as the previous iterate.
            old_w = w
            w = w.transpose()
            gradW = gradW.transpose()

            # Subproblem converged in a single inner step: the tolerance was
            # too loose, so tighten it for the next outer iteration.
            if iterW == 1:
                tolW = Linalg._NMF_TOL_DECREASE_FACTOR * tolW

            h, gradH, iterH = Linalg._nmf_nlssubprob(v, w, old_w, h, tolH,
                                              Linalg._NMF_MAX_ITER_SUBPROB)

            if iterH == 1:
                tolH = Linalg._NMF_TOL_DECREASE_FACTOR * tolH

        log.print_matrix_info(logger, w, 5, "Return W matrix:")
        log.print_matrix_info(logger, h, 5, "Return H matrix:")
        return w, h
Esempio n. 17
0
    def nmf(v, w_init, h_init):
        """
        Non-negative Matrix Factorization.

        Solves :math:`W,H = argmin(||X - WH||_2)` with W and H constrained
        to be non-negative, by alternating between a subproblem in W and a
        subproblem in H until the iteration or time budget is exhausted.

        Args:
            v: the matrix to factorize, type Matrix
            w_init: initial value for matrix W, type Matrix
            h_init: initial value for matrix H, type Matrix

        Returns:
            W, H <Matrix>: where W, H solve the NMF problem stated above.
        """

        log.print_info(logger, 4,
                       "In NMF..reducing to dim %d" % w_init.shape[1])
        log.print_matrix_info(logger, w_init, 5, "W init matrix:")
        log.print_matrix_info(logger, h_init, 5, "H init matrix:")

        if not isinstance(v, Matrix):
            raise TypeError("expected Matrix type, received %s" % type(v))

        w, h = w_init, h_init
        started_at = time()

        # Gradient norm at the starting point; it scales the stopping
        # tolerances of both subproblems.
        w_t = w.transpose()
        h_t = h.transpose()
        v_t = v.transpose()
        grad_w = (w * (h * h_t)) - (v * h_t)
        grad_h = ((w_t * w) * h) - (w_t * v)
        initgrad = sqrt(pow(grad_w.norm(), 2) + pow(grad_h.norm(), 2))

        tol_w = max(Linalg._NMF_MIN_TOL, Linalg._NMF_TOL) * initgrad
        tol_h = tol_w

        for it in xrange(1, Linalg._NMF_MAX_ITER):
            log.print_info(
                logger, 5,
                "Iteration: %d(%d)" % (it, Linalg._NMF_MAX_ITER))

            # Respect the wall-clock budget.
            if time() - started_at > Linalg._NMF_TIME_LIMIT:
                break

            # W subproblem, posed on the transposed system (V^T ~ H^T W^T),
            # hence the transposes on the way in and out.
            w_solved, grad_w, inner_w = Linalg._nmf_nlssubprob(
                v_t, h.transpose(), h, w.transpose(), tol_w,
                Linalg._NMF_MAX_ITER_SUBPROB)
            prev_w = w_solved
            w = w_solved.transpose()
            grad_w = grad_w.transpose()

            # A one-step inner solve means the tolerance was too loose.
            if inner_w == 1:
                tol_w = Linalg._NMF_TOL_DECREASE_FACTOR * tol_w

            h, grad_h, inner_h = Linalg._nmf_nlssubprob(
                v, w, prev_w, h, tol_h, Linalg._NMF_MAX_ITER_SUBPROB)

            if inner_h == 1:
                tol_h = Linalg._NMF_TOL_DECREASE_FACTOR * tol_h

        log.print_matrix_info(logger, w, 5, "Return W matrix:")
        log.print_matrix_info(logger, h, 5, "Return H matrix:")
        return w, h
Esempio n. 18
0
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a lexical function composition model to learn a function
        space and sets the function_space parameter. 
                
        Args:
            train_data: list of string tuples. Each tuple contains 3 
            string elements: (function_word, arg, phrase).
            
            arg_space: argument space, of type Space. arg elements of 
            train data are interpreted in this space.
        
            phrase space: phrase space, of type Space. phrase elements of 
            the train data are interpreted in this space.
            
        Training tuples which contain strings not found in their 
        respective spaces are ignored. Function words containing less than
        _MIN_SAMPLES training instances are ignored. For example, if
        _MIN_SAMPLES=2 and function word "red" occurs in only one phrase, "red"
        is ignored.
        
        The id2column attribute of the resulted composed space is set to
        be equal to that of the phrase space given as an input.

        Raises:
            ValueError: if arg_space is not a Space, or if no valid
                training data remains after filtering.
        """
        
        start = time.time()

        # Whether each learned function carries an extra intercept column
        # depends on the configured regression learner.
        self._has_intercept = self._regression_learner.has_intercept()

        if not isinstance(arg_space, Space):
            raise ValueError("expected one input spaces!")  
               
        result_mats = []
               
        # Sort by function word so all samples of one word are contiguous,
        # which get_partitions below relies on.
        train_data = sorted(train_data, key=lambda tup: tup[0])
        # Drops tuples whose arg/phrase strings are absent from their spaces
        # (None: function words are not looked up in any space).
        function_word_list, arg_list, phrase_list = self.valid_data_to_lists(train_data,
                                                                             (None,
                                                                              arg_space.row2id,
                                                                              phrase_space.row2id))
        #partitions the sorted input data
        keys, key_ranges = get_partitions(function_word_list, self._MIN_SAMPLES)
        
        if not keys:
            raise ValueError("No valid training data found!")
                
        # Lexical-function learning expects 1-dimensional argument vectors.
        assert(len(arg_space.element_shape) == 1)
        
        # Each learned function is a (phrase_dim x arg_dim) matrix, with one
        # extra column when the learner fits an intercept.
        if self._has_intercept:
            new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0] + 1,)
        else:
            new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0],)
            
        # One regression problem per surviving function word.
        for i in xrange(len(key_ranges)):
            
            idx_beg, idx_end = key_ranges[i]
            
            print ("Training lexical function...%s with %d samples" 
                     % (keys[i], idx_end - idx_beg))
                            
            arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end]) 
            phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])
 
            #convert them to the same type
            matrix_type = get_type_of_largest([arg_mat, phrase_mat])
            [arg_mat, phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat],
                                                          matrix_type)

            result_mat = self._regression_learner.train(arg_mat, phrase_mat).transpose()
            
            # Flatten the learned matrix into a single row of the function
            # space. NOTE(review): the return value is discarded, so this
            # assumes Matrix.reshape works in place — verify.
            result_mat.reshape((1, np.prod(new_element_shape)))
            
            result_mats.append(result_mat)

        # Static-style call through the last arg_mat: stacks all per-word
        # rows into the function space matrix.
        new_space_mat = arg_mat.nary_vstack(result_mats)
        
        self.composed_id2column = phrase_space.id2column
            
        self._function_space = Space(new_space_mat, keys, [], 
                                     element_shape=new_element_shape)
        
        log.print_composition_model_info(logger, self, 1, "\nTrained composition model:")
        log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
        log.print_info(logger, 3, "With total data points:%s" % len(function_word_list))
        log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3, 
                              "Semantic space of arguments:")
        log.print_info(logger, 3, "Shape of lexical functions learned:%s" 
                       % (new_element_shape,))
        log.print_matrix_info(logger, new_space_mat, 3, 
                              "Semantic space of lexical functions:")
        log.print_time_info(logger, time.time(), start, 2)
Esempio n. 19
0
    def build(cls, **kwargs):
        """
        Reads in data files and extracts the data to construct a semantic space.

        If the data is read in dense format and no columns are provided,
        the column indexing structures are set to empty.

        Args:
            data: file containing the counts
            format: format on the input data file: one of sm/dm
            rows: file containing the row elements. Optional, if not provided,
                extracted from the data file.
            cols: file containing the column elements

        Returns:
            A semantic space build from the input data files.

        Raises:
            ValueError: if one of data/format arguments is missing.
                        if cols is missing and format is "sm"
                        if the input columns provided are not consistent with
                        the shape of the matrix (for "dm" format)

        """
        start = time.time()
        id2row = None
        id2column = None

        # Mandatory arguments: the counts file and its format.
        if "data" not in kwargs:
            raise ValueError("Space data file needs to be specified")
        data_file = kwargs["data"]

        if "format" not in kwargs:
            raise ValueError("Format of input files needs to be specified")
        format_ = kwargs["format"]
        if format_ not in ["dm", "sm"]:
            raise ValueError("Unrecognized format: %s" % format_)

        # Optional row file; rows can otherwise be pulled from the data file.
        if kwargs.get("rows") is not None:
            [id2row], [row2id] = extract_indexing_structs(kwargs["rows"], [0])

        # Columns are optional for dense input but required for sparse input.
        if kwargs.get("cols") is not None:
            [id2column], [column2id] = extract_indexing_structs(
                kwargs["cols"], [0])
        elif format_ == "sm":
            raise ValueError(
                "Need to specify column file when input format is sm!")

        # No explicit row file: extract row labels from the data file itself.
        if id2row is None:
            [id2row], [row2id] = extract_indexing_structs(data_file, [0])

        if format_ == "sm":
            mat = read_sparse_space_data(data_file, row2id, column2id)
        else:
            mat = read_dense_space_data(data_file, row2id)

        # Columns given for dense data must match the matrix width.
        if id2column and len(id2column) != mat.shape[1]:
            raise ValueError(
                "Columns provided inconsistent with shape of input matrix!")

        if id2column is None:
            id2column, column2id = [], {}

        log.print_matrix_info(logger, mat, 1, "Built semantic space:")
        log.print_time_info(logger, time.time(), start, 2)
        return Space(mat, id2row, id2column, row2id, column2id)
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a lexical function composition model to learn a function
        space and sets the function_space parameter.

        Args:
            train_data: list of string tuples. Each tuple contains 3
            string elements: (function_word, arg, phrase).

            arg_space: argument space, of type Space. arg elements of
            train data are interpreted in this space.

            phrase space: phrase space, of type Space. phrase elements of
            the train data are interpreted in this space.

        Training tuples which contain strings not found in their
        respective spaces are ignored. Function words containing less than
        _MIN_SAMPLES training instances are ignored. For example, if
        _MIN_SAMPLES=2 and function word "red" occurs in only one phrase, "red"
        is ignored.

        The id2column attribute of the resulted composed space is set to
        be equal to that of the phrase space given as an input.

        Raises:
            ValueError: if arg_space is not a Space, or if no valid
                training data remains after filtering.
        """

        start = time.time()

        # Whether each learned function carries an extra intercept column
        # depends on the configured regression learner.
        self._has_intercept = self._regression_learner.has_intercept()

        if not isinstance(arg_space, Space):
            raise ValueError("expected one input spaces!")

        result_mats = []

        # Sort by function word so all samples of one word are contiguous,
        # which get_partitions below relies on.
        train_data = sorted(train_data, key=lambda tup: tup[0])
        # Drops tuples whose arg/phrase strings are absent from their spaces
        # (None: function words are not looked up in any space).
        function_word_list, arg_list, phrase_list = self.valid_data_to_lists(
            train_data, (None, arg_space.row2id, phrase_space.row2id))
        #partitions the sorted input data
        keys, key_ranges = get_partitions(function_word_list,
                                          self._MIN_SAMPLES)

        if not keys:
            raise ValueError("No valid training data found!")

        # Lexical-function learning expects 1-dimensional argument vectors.
        assert (len(arg_space.element_shape) == 1)

        # Each learned function is a (phrase_dim x arg_dim) matrix, with one
        # extra column when the learner fits an intercept.
        if self._has_intercept:
            new_element_shape = phrase_space.element_shape + (
                arg_space.element_shape[0] + 1, )
        else:
            new_element_shape = phrase_space.element_shape + (
                arg_space.element_shape[0], )

        # One regression problem per surviving function word.
        for i in range(len(key_ranges)):
            idx_beg, idx_end = key_ranges[i]

            print(("Training lexical function...%s with %d samples" %
                   (keys[i], idx_end - idx_beg)))

            arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end])
            phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])

            #convert them to the same type
            matrix_type = get_type_of_largest([arg_mat, phrase_mat])
            [arg_mat,
             phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat],
                                                 matrix_type)

            result_mat = self._regression_learner.train(
                arg_mat, phrase_mat).transpose()

            # Flatten the learned matrix into a single row of the function
            # space. NOTE(review): the return value is discarded, so this
            # assumes Matrix.reshape works in place — verify.
            result_mat.reshape((1, np.prod(new_element_shape)))

            result_mats.append(result_mat)

        # Static-style call through the last arg_mat: stacks all per-word
        # rows into the function space matrix.
        new_space_mat = arg_mat.nary_vstack(result_mats)

        self.composed_id2column = phrase_space.id2column

        self._function_space = Space(new_space_mat,
                                     keys, [],
                                     element_shape=new_element_shape)

        log.print_composition_model_info(logger, self, 1,
                                         "\nTrained composition model:")
        log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
        log.print_info(logger, 3,
                       "With total data points:%s" % len(function_word_list))
        log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3,
                              "Semantic space of arguments:")
        log.print_info(
            logger, 3,
            "Shape of lexical functions learned:%s" % (new_element_shape, ))
        log.print_matrix_info(logger, new_space_mat, 3,
                              "Semantic space of lexical functions:")
        log.print_time_info(logger, time.time(), start, 2)