Example #1
0
    def transform(
        self,
        corpus: Corpus,
        selector: Callable[[CorpusComponent],
                           bool] = lambda x: True) -> Corpus:
        """
        Vectorize the text of the selected Corpus component objects and store the resulting matrix in the Corpus
        under the name `vector_name`.

        The text of each selected object is obtained with `text_func`, the texts are passed to the vectorizer's
        `transform`, and the matrix is registered through `corpus.set_vector_matrix` together with the object ids
        (rows) and the feature names (columns). Each selected object is then marked as having that vector.

        Column names are resolved in this order:

        1. `vectorizer.get_feature_names_out()` (the accessor used by current scikit-learn-style vectorizers),
        2. `vectorizer.get_feature_names()` (the legacy accessor, removed in scikit-learn 1.2),
        3. positional indices `0..n_columns-1` when the vectorizer exposes neither.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus component object and returns True or False
            (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        objs = list(corpus.iter_objs(self.obj_type, selector))
        ids = [obj.id for obj in objs]
        docs = [self.text_func(obj) for obj in objs]

        matrix = self.vectorizer.transform(docs)
        try:
            # Preferred accessor; without it, newer vectorizers would silently
            # lose their real feature names and fall through to bare indices.
            column_names = self.vectorizer.get_feature_names_out()
        except AttributeError:
            try:
                # Legacy accessor kept for older vectorizer implementations.
                column_names = self.vectorizer.get_feature_names()
            except AttributeError:
                # Vectorizer has no notion of feature names: label columns by position.
                column_names = np.arange(matrix.shape[1])
        corpus.set_vector_matrix(self.vector_name,
                                 matrix=matrix,
                                 ids=ids,
                                 columns=column_names)

        for obj in objs:
            obj.add_vector(self.vector_name)

        return corpus
    def transform(self, corpus: Corpus, selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
        """
        Annotate the corpus objects with the vectorized representation of the object's text, with an optional
        selector that filters for objects to be transformed. Objects that are not selected are left out of the
        stored matrix and are not marked as having the vector.

        The matrix is stored on the Corpus through `set_vector_matrix` under the name `vector_name`, with the
        selected objects' ids as rows. Column names come from the vectorizer's `get_feature_names_out()` when
        available, otherwise from the legacy `get_feature_names()` (removed in scikit-learn 1.2), and otherwise
        default to the positional indices of the matrix columns.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        objs = list(corpus.iter_objs(self.obj_type, selector))
        ids = [obj.id for obj in objs]
        docs = [self.text_func(obj) for obj in objs]

        matrix = self.vectorizer.transform(docs)

        # Try the current accessor first, then the legacy one. Asking only for the
        # legacy name would silently degrade to bare indices on newer vectorizers.
        for accessor in ("get_feature_names_out", "get_feature_names"):
            try:
                column_names = getattr(self.vectorizer, accessor)()
            except AttributeError:
                continue
            break
        else:
            # Neither accessor is usable: label columns by position.
            column_names = np.arange(matrix.shape[1])
        corpus.set_vector_matrix(self.vector_name, matrix=matrix, ids=ids, columns=column_names)

        for obj in objs:
            obj.add_vector(self.vector_name)

        return corpus