Ejemplo n.º 1
0
    def fit_transform(self, raw_documents, y=None, vector_source=None, stats_hdf_file=None,
                      cv_fold=-1, train_time_extractor=None, decode_time_extractor=None):
        """Learn the vocabulary of ``raw_documents`` and return their count matrix.

        This is a variant of the parent class's ``fit_transform`` that keeps
        working when the extracted vocabulary turns out to be empty: feature
        sorting and document-frequency pruning are simply skipped in that case.

        :param raw_documents: the training documents, handed to ``_count_vocab``
        :param y: unused by this method
        :param vector_source: stored as ``self.thesaurus`` and passed to the
            train-time token handler
        :param stats_hdf_file: forwarded to the stats recorder; also remembered
            as ``self.stats_hdf_file_`` so that ``transform`` can reuse it
        :param cv_fold: forwarded to the stats recorder and remembered as
            ``self.cv_fold``
        :param train_time_extractor: installed as ``self.feature_extractor``
        :param decode_time_extractor: remembered so that ``transform`` can
            install it as ``self.feature_extractor``
        :return: a ``(X, vocabulary)`` tuple, where ``X`` is the matrix produced
            by ``_count_vocab`` (converted with ``tocsc()``) and ``vocabulary``
            is ``self.vocabulary_``
        :raises ValueError: if the document count derived from ``max_df`` is
            smaller than the one derived from ``min_df``
        """

        def _as_doc_count(threshold, total_docs):
            # integral thresholds are absolute counts, anything else is a proportion
            if isinstance(threshold, numbers.Integral):
                return threshold
            return int(round(threshold * total_docs))

        # per-run configuration; transform() reads some of these back later
        self.cv_fold = cv_fold
        self.feature_extractor = train_time_extractor
        self.decode_time_extractor = decode_time_extractor
        self.thesaurus = vector_source
        self.handler = get_token_handler(self.train_token_handler, self.k,
                                         self.sim_compressor, self.thesaurus)
        # keep the HDF file name around so that decode-time stats go to the same place
        self.stats_hdf_file_ = stats_hdf_file
        # training-time recorder ('tr'); transform() installs a separate one for test data
        self.stats = get_stats_recorder(self.debug_level, stats_hdf_file, 'tr', cv_fold, self.k)

        # ########## BEGIN modified super.fit_transform (tolerates an empty vocabulary) ##########
        self._validate_vocabulary()
        max_df, min_df, max_features = self.max_df, self.min_df, self.max_features

        vocabulary, counts = self._count_vocab(raw_documents, self.fixed_vocabulary_)
        counts = counts.tocsc()

        if self.binary:
            # only presence/absence is wanted: overwrite every stored count with 1
            counts.data.fill(1)

        learn_vocabulary = not self.fixed_vocabulary_
        if learn_vocabulary and vocabulary:
            counts = self._sort_features(counts, vocabulary)

            total_docs = counts.shape[0]
            upper_doc_count = _as_doc_count(max_df, total_docs)
            lower_doc_count = _as_doc_count(min_df, total_docs)
            if upper_doc_count < lower_doc_count:
                raise ValueError(
                    "max_df corresponds to < documents than min_df")
            counts, self.stop_words_ = self._limit_features(counts, vocabulary,
                                                            upper_doc_count,
                                                            lower_doc_count,
                                                            max_features)

        if learn_vocabulary:
            # stored even when empty, unlike the pruning above
            self.vocabulary_ = vocabulary
        # ########## END modified super.fit_transform ##########

        # report cache statistics when the nearest-neighbour lookup exposes them
        thesaurus = self.thesaurus
        if thesaurus and hasattr(thesaurus, 'get_nearest_neighbours'):
            nn_lookup = thesaurus.get_nearest_neighbours
            if hasattr(nn_lookup, 'cache_info'):
                logging.info('NN cache info: %s', nn_lookup.cache_info())
        logging.info('Matrix shape is %r after vectorization', counts.shape)
        return counts, self.vocabulary_
Ejemplo n.º 2
0
    def transform(self, raw_documents):
        """Vectorize ``raw_documents`` against the already fitted vocabulary.

        Switches the object to its decode-time configuration first: the
        decode-time feature extractor, a fresh stats recorder tagged ``'ev'``
        and a token handler built from ``self.decode_token_handler``.

        :param raw_documents: the documents to vectorize, handed to
            ``_count_vocab`` with a fixed vocabulary
        :return: a ``(X, vocabulary)`` tuple, where ``X`` is the matrix produced
            by ``_count_vocab`` and ``vocabulary`` is ``self.vocabulary_``
        :raises ValueError: if ``self.vocabulary_`` is missing or empty
        """
        self.feature_extractor = self.decode_time_extractor

        if not hasattr(self, 'vocabulary_'):
            self._check_vocabulary()
        # re-test the attribute: the call above may or may not have provided it
        if not (hasattr(self, 'vocabulary_') and len(self.vocabulary_) != 0):
            raise ValueError("Vocabulary wasn't fitted or is empty!")

        # test-set statistics are recorded separately from the training ones
        self.stats = get_stats_recorder(self.debug_level, self.stats_hdf_file_, 'ev',
                                        self.cv_fold, self.k)

        if self.random_neighbour_thesaurus:
            # HACK: wasteful, because a thesaurus will already have been loaded by now
            vocab_size = len(self.vocabulary_)
            logging.info('Building random neighbour vector source with vocabulary of size %d', vocab_size)
            self.thesaurus.k = self.k
            known_features = list(self.vocabulary_.keys())
            self.thesaurus.vocab = known_features

        self.handler = get_token_handler(self.decode_token_handler, self.k,
                                         self.sim_compressor, self.thesaurus)

        # TODO: the vector source cannot be populated at this stage of the pipeline
        # (i.e. self.thesaurus.populate_vector_space(self.vocabulary_.keys())), because
        # the vocabulary might still change if feature selection is enabled. Doing so
        # would result in attempts to compose features that we do not know how to
        # compose, because these have not been removed by feature selection yet.

        # BEGIN modified super.transform (works when the vocabulary is empty)
        _, counts = self._count_vocab(raw_documents, fixed_vocab=True)
        if self.binary:
            # only presence/absence is wanted: overwrite every stored count with 1
            counts.data.fill(1)
        # END modified super.transform

        # report cache statistics when the nearest-neighbour lookup exposes them
        thesaurus = self.thesaurus
        if thesaurus and hasattr(thesaurus, 'get_nearest_neighbours'):
            nn_lookup = thesaurus.get_nearest_neighbours
            if hasattr(nn_lookup, 'cache_info'):
                logging.info('NN cache info: %s', nn_lookup.cache_info())
        return counts, self.vocabulary_