Example #1
from scipy.sparse import lil_matrix
from sklearn.base import BaseEstimator, TransformerMixin
# VectorizerMixin was public in older scikit-learn releases; recent versions
# expose it as sklearn.feature_extraction.text._VectorizerMixin instead.
from sklearn.feature_extraction.text import VectorizerMixin


class BrownClusterVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin):
    def __init__(self, cluster_fn, prefix_size=16, preprocessor=None):
        self.cluster_fn = cluster_fn

        self.cluster_index = None
        self.vocabulary_ = None

        # Settings consumed by VectorizerMixin.build_analyzer(); the analyzer
        # tokenizes documents the same way CountVectorizer would.
        self.analyzer = 'word'
        self.preprocessor = preprocessor
        self.strip_accents = 'unicode'
        self.lowercase = True
        self.stop_words = None
        self.tokenizer = None
        self.token_pattern = r"(?u)\b\w\w+\b"
        self.input = 'content'  # documents are passed as strings, not filenames
        self.ngram_range = (1, 1)
        self.encoding = 'utf-8'
        self.decode_error = 'strict'
        self.prefix_size = prefix_size

        self.analyzer_func = None

    # No training data is needed; fit() just loads the cluster model from disk.
    def fit(self, raw_documents, y=None):
        self.analyzer_func = self.build_analyzer()

        # ClusterIndex and _build_vocabulary come from the surrounding module
        # (a sketch of both is given after the listing).
        with open(self.cluster_fn, encoding=self.encoding) as f:
            self.cluster_index = ClusterIndex(f, prefix_size=self.prefix_size)

        # Map each cluster id to a column index in the output matrix.
        self.vocabulary_ = _build_vocabulary(self.cluster_index)

        return self

    def transform(self, raw_documents):
        # Build a sparse document-by-cluster count matrix; lil_matrix supports
        # cheap incremental writes, and we convert to CSR at the end.
        x = lil_matrix((len(raw_documents), self.cluster_index.n_cluster))

        for row, doc in enumerate(raw_documents):
            for token in self.analyzer_func(doc):
                c = self.cluster_index.cluster(token)

                # Skip out-of-vocabulary tokens; the explicit None check avoids
                # silently dropping a cluster whose id happens to be falsy
                # (e.g. the integer 0).
                if c is not None:
                    x[row, self.vocabulary_[c]] += 1

        return x.tocsr()
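
The listing depends on two helpers, ClusterIndex and _build_vocabulary, that come from the surrounding module and are not shown on this page. The sketch below is an assumption, not the original code: it keeps only the interface the class actually uses (the constructor signature, cluster(), n_cluster, and a cluster-to-column mapping) and guesses a wcluster-style input format of tab-separated bit-string/token/count lines; the clusters attribute is likewise invented here to connect the two helpers.

class ClusterIndex:
    """Maps tokens to Brown cluster ids truncated to a fixed bit prefix.

    Hypothetical reimplementation; only the public interface is taken
    from the listing above.
    """

    def __init__(self, f, prefix_size=16):
        self._token2cluster = {}
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) < 2:
                continue  # skip malformed lines
            bits, token = parts[0], parts[1]
            # Truncating the bit string merges fine-grained clusters into
            # coarser ones, controlled by prefix_size.
            self._token2cluster[token] = bits[:prefix_size]
        # 'clusters' is an assumed attribute used by _build_vocabulary below.
        self.clusters = sorted(set(self._token2cluster.values()))
        self.n_cluster = len(self.clusters)

    def cluster(self, token):
        # Return the truncated cluster id, or None for unknown tokens.
        return self._token2cluster.get(token)


def _build_vocabulary(cluster_index):
    # Assign every distinct cluster id a stable column index.
    return {c: i for i, c in enumerate(cluster_index.clusters)}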
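
Usage might then look like the following; the file name and the toy corpus are placeholders, and fit_transform comes for free from TransformerMixin (it simply calls fit() and then transform()).

# 'brown_paths.tsv' is a placeholder path to a wcluster-style output file.
vec = BrownClusterVectorizer('brown_paths.tsv', prefix_size=8)

docs = ['the cat sat on the mat', 'dogs chase cats']
X = vec.fit_transform(docs)

print(X.shape)  # (2, vec.cluster_index.n_cluster)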