Python CountVectorizer.copy Beispiele

Programmiersprache: Python

Namespace / Paketname: sklearn.feature_extraction.text

Klasse / Typ: CountVectorizer

Methode / Funktion: copy

Beispiele auf hotexamples.com: 2

Python CountVectorizer.copy – 2 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Methode sklearn.feature_extraction.text.CountVectorizer.copy, die aus Open-Source-Projekten extrahiert wurden. Sie können die Beispiele bewerten, um ihre Qualität zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

CountVectorizer(30)

_validate_vocabulary(30)

fit_transform(30)

fit(30)

build_tokenizer(30)

build_analyzer(30)

get_stop_words(30)

get_params(21)

get_feature_names_out(15)

build_preprocessor(13)

__init__(10)

get_feature_names(9)

dictionary_freeze(6)

count(4)

analyzer(4)

fixed_vocabulary(3)

astype(3)

_count_vocab(2)

copy(2)

fit_trainsform(2)

get_features_names(2)

append(2)

_word_ngrams(2)

get_feature_name(1)

getSenVec(1)

_sort_features(1)

get_features(1)

get_sentence_vector(1)

get_shape(1)

getOutputCol(1)

fit_Transform(1)

fit_trasform(1)

fit_transfrom(1)

fit_transforn(1)

__repr__(1)

fir_transform(1)

__dict__(1)

extract_ngrams(1)

delete_temporary_training_data(1)

count_features(1)

_limit_features(1)

fir(1)

Beispiel #1

Datei anzeigen

    def test_model_tfidf_transform(self):
        """Convert fitted ``TfidfTransformer`` models to ONNX and dump them.

        A five-document corpus is turned into a dense ``float32`` count
        matrix with ``CountVectorizer``. For every combination of ``norm``,
        ``smooth_idf`` and ``use_idf`` a ``TfidfTransformer`` is fitted on
        that matrix, converted with ``convert_sklearn`` and passed to
        ``dump_data_and_model`` under a basename that encodes the options.
        """
        corpus = numpy.array([
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?',
            "Troisième document en français",
        ]).reshape((5, 1))
        data = CountVectorizer(ngram_range=(1, 1)).fit_transform(
            corpus.ravel()).todense()
        data = data.astype(numpy.float32)

        for sublinear_tf in (False, True):
            if sublinear_tf:
                # scikit-learn applies a log on a matrix
                # but only on strictly positive coefficients.
                # NOTE: because of this break only sublinear_tf=False is
                # actually exercised by the loops below.
                break
            for norm in (None, 'l1', 'l2'):
                for smooth_idf in (False, True):
                    for use_idf in (False, True):
                        model = TfidfTransformer(norm=norm,
                                                 use_idf=use_idf,
                                                 smooth_idf=smooth_idf,
                                                 sublinear_tf=sublinear_tf)
                        model.fit(data)
                        model_onnx = convert_sklearn(
                            model, 'TfidfTransformer',
                            [('input', FloatTensorType([1, data.shape[1]]))])
                        self.assertIsNotNone(model_onnx)
                        # Encode the option combination in the dump name.
                        suffix = norm.upper() if norm else ''
                        suffix += 'Sub' if sublinear_tf else ''
                        suffix += 'Idf' if use_idf else ''
                        suffix += 'Smooth' if smooth_idf else ''
                        dump_data_and_model(
                            data,
                            model,
                            model_onnx,
                            basename="SklearnTfidfTransform" + suffix,
                            # Operator mul is not implemented in onnxruntime
                            allow_failure=
                            "StrictVersion(onnx.__version__) < StrictVersion('1.2')"
                        )

Beispiel #2

Datei anzeigen

class EasyNLP(EasyClassi):
    """Natural language processing helpers on top of ``EasyClassi``.

    The methods read the dataset from ``self.myDS`` and store their results
    on the instance:

    - ``corpus``: list of cleaned text strings (set by ``cleanTXT``).
    - ``bowVectorizer``: the fitted ``CountVectorizer``
      (set by ``createBagOfWords``).
    - ``bowMatrix``: dense bag-of-words array (set by ``createBagOfWords``).
    - ``X`` / ``y``: features and target (set by ``split_X_y``).
    """

    def __init__(self):
        """Initialise the instance by delegating to the parent constructor."""
        super(EasyNLP, self).__init__()

    def downloadSTOPWords(self):
        """Download the NLTK ``stopwords`` resource used by ``cleanTXT``."""
        nltk.download("stopwords")

    def cleanTXT(self, fieldToReview="Review"):
        """Clean every text of ``self.myDS[fieldToReview]`` into ``self.corpus``.

        Each text is processed as follows:

        - every character that is not an ASCII letter becomes a space,
        - the text is lower-cased and split on whitespace,
        - English stop words are dropped,
        - the remaining words are stemmed with ``PorterStemmer``,
        - the stems are joined back with single spaces.

        fieldToReview: name of the dataset field holding the raw text.
        """
        self.corpus = []
        # Build the stop-word set and the stemmer once; doing it per word
        # (or per text) repeats identical work for no benefit.
        stop_words = set(stopwords.words("english"))
        stemmer = PorterStemmer()
        for text in self.myDS[fieldToReview]:
            words = re.sub("[^a-zA-Z]", " ", text).lower().split()
            stems = [
                stemmer.stem(word) for word in words
                if word not in stop_words
            ]
            self.corpus.append(" ".join(stems))

    def createBagOfWords(self, maximumFtrs=None):
        """Build the bag-of-words matrix from ``self.corpus``.

        The fitted vectorizer is kept in ``self.bowVectorizer`` so that its
        vocabulary stays available; the dense count matrix is stored in
        ``self.bowMatrix``.

        maximumFtrs: maximum number of words collected; ``None`` keeps all.
        """
        # max_features=None is CountVectorizer's default, so a single call
        # covers both the limited and the unlimited case.
        vectorizer = CountVectorizer(max_features=maximumFtrs)
        self.bowMatrix = vectorizer.fit_transform(self.corpus).toarray()
        self.bowVectorizer = vectorizer

    def split_X_y(self, yColumn=-1):
        """Set ``self.X`` and ``self.y`` from the prepared data.

        X is a copy of the bag-of-words matrix ``self.bowMatrix``.
        y is taken from column ``yColumn`` of ``self.myDS`` (via ``iloc``).

        yColumn: positional index of the target column; the default ``-1``
        selects the last column.
        """
        self.X = self.bowMatrix.copy()
        self.y = self.myDS.iloc[:, yColumn].values