Example #1
    def _count_vocab(self, analyzed_docs):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        vocabulary = self.vocabulary_
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in analyzed_docs:
            for feature in doc:
                try:
                    j_indices.append(vocabulary[feature])
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
            indptr.append(len(j_indices))

        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()

        if self.binary:
            X.data.fill(1)

        return X
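
All of the snippets on this page lean on a couple of private helpers from older scikit-learn releases (`_make_int_array`, `frombuffer_empty`) plus the usual `np`/`sp` aliases, none of which the examples define. Here is a minimal sketch of what they look like, following the conventions of `sklearn/feature_extraction/text.py` and `sklearn/utils/fixes.py`; `_make_float_array` is an assumption by analogy, used only by the SO-CAL example further down:

    from array import array

    import numpy as np
    import scipy.sparse as sp


    def _make_int_array():
        """Construct an array.array of a type suitable for scipy.sparse indices."""
        return array("i")


    def _make_float_array():
        """Assumed analogue of _make_int_array for float-valued entries."""
        return array("f")


    def frombuffer_empty(buf, dtype):
        """np.frombuffer that tolerates empty buffers (old NumPy raised on them)."""
        if len(buf) == 0:
            return np.empty(0, dtype=dtype)
        return np.frombuffer(buf, dtype=dtype)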
Example #2
    def _extract_features(self, raw_documents):
        """Build the sparse feature matrix and a vocabulary of feature counts."""
        j_indices = []
        indptr = _make_int_array()
        values = _make_int_array()
        indptr.append(0)
        vocabulary = defaultdict(int)
        for doc in raw_documents:
            feature_counter = {}
            # for feature, feature_idx in contains_keywords(doc).items():
            for feature, feature_idx in self.body_words_in_headline(doc).items():
                vocabulary[feature] += 1  # document frequency of this feature
                # Count occurrences of each column index in this document; the
                # dict lookups here cannot raise, so no fixed-vocab guard is needed
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = 1
                else:
                    feature_counter[feature_idx] += 1

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        j_indices = np.asarray(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = frombuffer_empty(values, dtype=np.intc)

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sort_indices()
        return vocabulary, X
Example #3
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        j_indices = _make_int_array()
        indptr = _make_int_array()
        values = _make_float_array() if self.apply_socal_mask else None
        indptr.append(0)

        for doc in raw_documents:
            # Example doc: "meu cajado eh muito grande" with a
            # corresponding SO-CAL mask such as [1, 1, 1, 0, 2]
            if self.apply_socal_mask:
                doc_mask = self.socal.mask(doc)

            for index, feature in enumerate(doc):
                try:
                    if feature in self.stopwords:
                        continue

                    # j_indices for one doc, e.g. [2, 10, 9, 102, 65]
                    if not fixed_vocab or feature in vocabulary:
                        j_indices.append(vocabulary[feature])

                        if self.apply_socal_mask:
                            values.append(doc_mask[index])

                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        # Use the collected mask values when present; otherwise default to all-ones counts
        values = values if values else np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X
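
The `vocabulary.default_factory = vocabulary.__len__` trick above is worth seeing in isolation: because the factory is the dict's own `__len__`, the first lookup of an unseen token stores the current size as its value, yielding consecutive column indices in first-seen order. A self-contained sketch:

    from collections import defaultdict

    vocabulary = defaultdict()
    vocabulary.default_factory = vocabulary.__len__

    for token in ["to", "be", "or", "not", "to", "be"]:
        vocabulary[token]  # assigns len(vocabulary) on first sight, then a no-op

    print(dict(vocabulary))  # {'to': 0, 'be': 1, 'or': 2, 'not': 3}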
Example #4
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = []
        indptr = _make_int_array()
        values = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = 1
                    else:
                        feature_counter[feature_idx] += 1
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        j_indices = np.asarray(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = frombuffer_empty(values, dtype=np.intc)

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sort_indices()
        return vocabulary, X
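
To see what the `j_indices`/`values`/`indptr` triple encodes, here is a hedged, self-contained rendition of the same counting loop on two toy documents, with whitespace tokenization standing in for `build_analyzer()`:

    from collections import defaultdict

    import numpy as np
    import scipy.sparse as sp

    docs = ["apple banana apple", "banana cherry"]

    vocabulary = defaultdict()
    vocabulary.default_factory = vocabulary.__len__  # new token -> next column

    j_indices, values, indptr = [], [], [0]
    for doc in docs:
        counts = {}
        for tok in doc.split():
            idx = vocabulary[tok]
            counts[idx] = counts.get(idx, 0) + 1
        j_indices.extend(counts.keys())
        values.extend(counts.values())
        indptr.append(len(j_indices))  # row boundary in CSR terms

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)), dtype=np.int64)
    X.sort_indices()
    print(dict(vocabulary))  # {'apple': 0, 'banana': 1, 'cherry': 2}
    print(X.toarray())       # [[2 1 0]
                             #  [0 1 1]]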
Example #5
    def partial_transform(self, X, fitting=None):
        self.add_default()
        transforming = True

        # Process everything as sparse regardless of setting
        X = [X] if isinstance(X, Mapping) else X

        indices = array("i")
        indptr = array("i", [0])
        # XXX we could change values to an array.array as well, but it
        # would require (heuristic) conversion of dtype to typecode...
        values = []

        for x in X:
            for f, v in x.items():
                self.add_element(f, v, fitting, transforming, indices, values)
            indptr.append(len(indices))

        if len(indptr) == 1:
            raise ValueError("Sample sequence X is empty.")

        indices = frombuffer_empty(indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        shape = (len(indptr) - 1, len(self.vocabulary_))

        result_matrix = sp.csr_matrix((values, indices, indptr),
                                      shape=shape,
                                      dtype=self.dtype)

        # Sort everything if asked
        if fitting and self.sort:
            self.feature_names_.sort()
            map_index = np.empty(len(self.feature_names_), dtype=np.int32)
            for new_val, f in enumerate(self.feature_names_):
                map_index[new_val] = self.vocabulary_[f]
                self.vocabulary_[f] = new_val
            result_matrix = result_matrix[:, map_index]

        if self.sparse:
            result_matrix.sort_indices()
        else:
            result_matrix = result_matrix.toarray()

        return result_matrix
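
The sort-and-remap step at the end of `partial_transform` is compact but easy to misread: after `feature_names_` is sorted, `map_index[new]` records which old column should land at position `new`, and one fancy column index reorders the matrix. In isolation, with assumed toy data:

    import numpy as np
    import scipy.sparse as sp

    feature_names = ["b", "c", "a"]
    vocab = {"b": 0, "c": 1, "a": 2}
    X = sp.csr_matrix(np.array([[10, 20, 30]]))  # columns in old vocab order

    feature_names.sort()  # ['a', 'b', 'c']
    map_index = np.empty(len(feature_names), dtype=np.int32)
    for new_val, f in enumerate(feature_names):
        map_index[new_val] = vocab[f]  # old column that moves to position new_val
        vocab[f] = new_val             # vocabulary now reflects the sorted order

    X = X[:, map_index]
    print(X.toarray())  # [[30 10 20]] -- the column for 'a' comes first now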
Example #6
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            if hasattr(self, 'vocabulary_') and self.vocabulary_:
                # Resume from the existing vocabulary; keeping __len__ as the
                # factory continues index assignment after the current size
                vocabulary = defaultdict(None, self.vocabulary_)
            else:
                # Add a new value when a new vocabulary item is seen
                vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                try:
                    j_indices.append(vocabulary[feature])
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X
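
The wrinkle in this variant is seeding the defaultdict with a previously fitted vocabulary: with `__len__` still the factory, unseen tokens simply continue the index sequence. A minimal sketch:

    from collections import defaultdict

    vocabulary = defaultdict(None, {"apple": 0, "banana": 1})  # resume from fit
    vocabulary.default_factory = vocabulary.__len__

    for tok in ["banana", "cherry", "durian"]:
        vocabulary[tok]

    print(dict(vocabulary))  # {'apple': 0, 'banana': 1, 'cherry': 2, 'durian': 3}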
Example #7
    def _transform(self, X):
        # Sanity check: Python's array has no way of explicitly requesting the
        # signed 32-bit integers that scipy.sparse needs, so we use the next
        # best thing: typecode "i" (int). However, if that gives larger or
        # smaller integers than 32-bit ones, np.frombuffer screws up.
        assert array("i").itemsize == 4, (
            "sizeof(int) != 4 on your platform; please report this at"
            " https://github.com/scikit-learn/scikit-learn/issues and"
            " include the output from platform.platform() in your bug report")

        dtype = self.dtype
        feature_names = self.feature_names_
        vocab = self.vocabulary_

        # Process everything as sparse regardless of setting

        indices = array("i")
        indptr = array("i", [0])
        # XXX we could change values to an array.array as well, but it
        # would require (heuristic) conversion of dtype to typecode...
        values = []

        if isinstance(X, dict):
            for f, val in X.items():
                if isinstance(val, six.string_types):
                    f = f + self.separator + val
                    val = 1

                if f in vocab and str(val) not in bad_vals_as_strings:
                    # Get the index position from vocab, then append that index position to indices
                    indices.append(vocab[f])
                    # Convert the val to the correct dtype, then append to our values list
                    values.append(dtype(val))

            indptr.append(len(indices))

            if len(indptr) == 1:
                raise ValueError(
                    'The dictionary passed into DataFrameVectorizer is empty')

        else:
            # collect all the possible feature names and build sparse matrix at
            # same time
            for row_idx, row in X.iterrows():
                for col_idx, val in enumerate(row):
                    f = X.columns[col_idx]

                    if isinstance(val, six.string_types):
                        f = f + self.separator + val
                        val = 1

                    # Only include this in our output if it was part of our training data. Silently ignore it otherwise.
                    if f in vocab and str(val) not in bad_vals_as_strings:
                        # Get the index position from vocab, then append that index position to indices
                        indices.append(vocab[f])
                        # Convert the val to the correct dtype, then append to our values list
                        values.append(dtype(val))

                indptr.append(len(indices))

            if len(indptr) == 1:
                raise ValueError(
                    'The DataFrame passed into DataFrameVectorizer is empty')

        indices = frombuffer_empty(indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        shape = (len(indptr) - 1, len(vocab))

        result_matrix = sp.csr_matrix((values, indices, indptr),
                                      shape=shape,
                                      dtype=dtype)

        if self.sparse:
            result_matrix.sort_indices()
        else:
            result_matrix = result_matrix.toarray()

        return result_matrix
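
The `f = f + self.separator + val` step is the classic DictVectorizer-style one-hot fold: string values move into the feature name, numeric values keep their own column. A self-contained sketch with a hypothetical fitted `vocab` and `'='` as an assumed separator:

    from array import array

    import numpy as np
    import scipy.sparse as sp

    vocab = {"age": 0, "color=blue": 1, "color=red": 2}
    rows = [{"color": "red", "age": 3}, {"color": "blue", "age": 5}]

    indices, values, indptr = array("i"), [], array("i", [0])
    for x in rows:
        for f, val in x.items():
            if isinstance(val, str):
                f = f + "=" + val  # fold the category into the feature name
                val = 1
            if f in vocab:         # silently skip features unseen during fit
                indices.append(vocab[f])
                values.append(float(val))
        indptr.append(len(indices))

    X = sp.csr_matrix((values, np.frombuffer(indices, dtype=np.intc),
                       np.frombuffer(indptr, dtype=np.intc)),
                      shape=(len(indptr) - 1, len(vocab)))
    print(X.toarray())  # [[3. 0. 1.]
                        #  [5. 1. 0.]]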
Example #8
    def _transform(self, X):
        # Sanity check: Python's array has no way of explicitly requesting the
        # signed 32-bit integers that scipy.sparse needs, so we use the next
        # best thing: typecode "i" (int). However, if that gives larger or
        # smaller integers than 32-bit ones, np.frombuffer screws up.
        assert array("i").itemsize == 4, (
            "sizeof(int) != 4 on your platform; please report this at"
            " https://github.com/scikit-learn/scikit-learn/issues and"
            " include the output from platform.platform() in your bug report")

        dtype = self.dtype
        feature_names = self.feature_names_
        vocab = self.vocabulary_

        # Process everything as sparse regardless of setting

        indices = array("i")
        indptr = array("i", [0])
        # XXX we could change values to an array.array as well, but it
        # would require (heuristic) conversion of dtype to typecode...
        values = []


        if isinstance(X, dict):
            for f, val in X.items():
                if self.column_descriptions.get(f, False) == 'categorical':
                    if not self.get('keep_cat_features', False):
                        # One-hot encode: fold the category value into the feature name
                        f = f + self.separator + str(val)
                        val = 1
                    else:
                        if str(val) in bad_vals_as_strings:
                            val = '_None'
                        val = self.get('label_encoders')[f].transform([val])

                if f in vocab and str(val) not in bad_vals_as_strings:

                    indices.append(vocab[f])
                    # Convert the val to the correct dtype, then append to our values list
                    values.append(dtype(val))

            indptr.append(len(indices))

            if len(indptr) == 1:
                raise ValueError('The dictionary passed into DataFrameVectorizer is empty')


        else:
            # collect all the possible feature names and build sparse matrix at
            # same time
            X_columns = X.columns
            string_types = six.string_types
            separator = self.separator
            indices_append = indices.append
            values_append = values.append
            # True when categorical columns should be one-hot encoded
            # (i.e. keep_cat_features is falsy), mirroring the dict branch above
            onehot_cat_features = not self.get('keep_cat_features', False)
            is_categorical = [self.column_descriptions.get(f, False) == 'categorical' for f in X_columns]

            for row in X.itertuples():
                for col_idx, val in enumerate(row[1:]):
                    f = X_columns[col_idx]

                    if is_categorical[col_idx]:
                        if onehot_cat_features:
                            f = f + separator + str(val)
                            val = 1
                        else:
                            if str(val) in bad_vals_as_strings:
                                val = '_None'

                            val = self.get('label_encoders')[f].transform([val])

                    # Only include this in our output if it was part of our training data. Silently ignore it otherwise.
                    if f in vocab and str(val) not in bad_vals_as_strings:
                        # Get the index position from vocab, then append that index position to indices
                        indices_append(vocab[f])
                        # Convert the val to the correct dtype, then append to our values list
                        values_append(dtype(val))

                indptr.append(len(indices))

            if len(indptr) == 1:
                raise ValueError('The DataFrame passed into DataFrameVectorizer is empty')

        indices = frombuffer_empty(indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        shape = (len(indptr) - 1, len(vocab))

        result_matrix = sp.csr_matrix((values, indices, indptr),
                                      shape=shape, dtype=dtype)

        if self.sparse:
            result_matrix.sort_indices()
        else:
            result_matrix = result_matrix.toarray()

        return result_matrix
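
Relative to the previous example, the main speed-ups here are `itertuples` instead of `iterrows` (no per-row Series construction) and hoisting attribute lookups out of the loop. The row-slicing convention is the part worth remembering, shown below with toy data:

    import pandas as pd

    X = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    for row in X.itertuples():
        # row is a namedtuple whose first element is the index,
        # so the column values start at row[1:]
        for col_idx, val in enumerate(row[1:]):
            print(X.columns[col_idx], val)  # a 1 / b x / a 2 / b y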