Example #1
0
    def test_model_tfidf_transform(self):
        """Convert TfidfTransformer to ONNX for several option combinations.

        A dense float32 bag-of-words matrix is built from a five-sentence
        corpus. For every combination of ``norm``, ``smooth_idf`` and
        ``use_idf`` a transformer is fitted on that matrix, converted with
        ``convert_sklearn`` and handed to ``dump_data_and_model`` under a
        basename that encodes the options. ``sublinear_tf=True`` is
        deliberately not exercised (see the comment in the loop).
        """
        corpus = numpy.array([
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?',
            "Troisième document en français",
        ]).reshape((5, 1))
        # toarray() yields a plain ndarray; todense() would return the
        # legacy numpy.matrix type, which NumPy no longer recommends.
        data = CountVectorizer(ngram_range=(1, 1)).fit_transform(
            corpus.ravel()).toarray()
        data = data.astype(numpy.float32)

        for sublinear_tf in (False, True):
            if sublinear_tf:
                # scikit-learn applies a log on a matrix
                # but only on strictly positive coefficients
                break
            for norm in (None, 'l1', 'l2'):
                for smooth_idf in (False, True):
                    for use_idf in (False, True):
                        model = TfidfTransformer(norm=norm,
                                                 use_idf=use_idf,
                                                 smooth_idf=smooth_idf,
                                                 sublinear_tf=sublinear_tf)
                        model.fit(data)
                        # The declared input is one row with as many
                        # columns as the vocabulary has terms.
                        model_onnx = convert_sklearn(
                            model, 'TfidfTransformer',
                            [('input', FloatTensorType([1, data.shape[1]]))])
                        self.assertIsNotNone(model_onnx)
                        # Build a distinct basename per combination, e.g.
                        # "SklearnTfidfTransformL2IdfSmooth".
                        suffix = norm.upper() if norm else ''
                        suffix += 'Sub' if sublinear_tf else ''
                        suffix += 'Idf' if use_idf else ''
                        suffix += 'Smooth' if smooth_idf else ''
                        dump_data_and_model(
                            data,
                            model,
                            model_onnx,
                            basename="SklearnTfidfTransform" + suffix,
                            # Operator mul is not implemented in onnxruntime
                            allow_failure=
                            "StrictVersion(onnx.__version__) < StrictVersion('1.2')"
                        )
Example #2
0
class EasyNLP(EasyClassi):
    """ class EasyNLP

    Natural language processing helpers built on a bag-of-words model.

    This class derives from the classification class ``EasyClassi``; per
    the original author's note, that class in turn derives from the
    project's main generic class.

    Attributes written by the methods below:
        corpus: list of cleaned text strings (set by ``cleanTXT``).
        bowMatrix: dense bag-of-words matrix (set by ``createBagOfWords``).
        X, y: features and target (set by ``split_X_y``).
    """
    def __init__(self):
        """ __init__(self)

        Constructor. Only delegates to the parent class constructor.
        """

        super(EasyNLP, self).__init__()

    def downloadSTOPWords(self):
        """ downloadSTOPWords
        Download the NLTK "stopwords" corpus, i.e. the words that
        ``cleanTXT`` removes from the text.
        """
        nltk.download("stopwords")

    def cleanTXT(self, fieldToReview="Review"):
        """ cleanTXT
        Clean every text found in ``self.myDS[fieldToReview]`` and store
        the results, in order, in ``self.corpus`` (any previous corpus is
        discarded).

        Each text goes through these steps:

            - every character that is not an ASCII letter becomes a space
            - the text is lower-cased and split on whitespace
            - English stop words are dropped
            - the remaining words are stemmed (tenses, plurals...)
            - the stems are joined back with single spaces

        fieldToReview: key used to index ``self.myDS``; presumably a
            column name of a pandas DataFrame -- confirm against the
            parent class.
        """
        self.corpus = []
        # Both objects are loop-invariant: build them once instead of once
        # per text (stemmer) or once per word (stop-word set).
        stemmer = PorterStemmer()
        english_stopwords = set(stopwords.words("english"))
        for iText in self.myDS[fieldToReview]:
            words = re.sub("[^a-zA-Z]", " ", iText).lower().split()
            # Stop words are matched on the lower-cased, unstemmed word.
            stems = [
                stemmer.stem(word) for word in words
                if word not in english_stopwords
            ]
            self.corpus.append(" ".join(stems))

    def createBagOfWords(self, maximumFtrs=None):
        """ createBagOfWords
        Fit a ``CountVectorizer`` on ``self.corpus`` and store the
        resulting bag of words as a dense array in ``self.bowMatrix``.

        maximumFtrs: Maximum number of words collected. ``None`` (the
            default) applies no limit.
        """
        # max_features=None is CountVectorizer's own default, so the value
        # can be forwarded without branching on None.
        vectorizer = CountVectorizer(max_features=maximumFtrs)
        self.bowMatrix = vectorizer.fit_transform(self.corpus).toarray()

    def split_X_y(self, yColumn=-1):
        """split_X_y (self,yColumn=-1)

        Process and get X and y:
            X is a copy of the bag of words matrix (``self.bowMatrix``).
            y is taken from the original source of data (``self.myDS``).

        yColumn: positional index of the target column in ``self.myDS``.
            If it is not given, or set to -1, y is taken from the last
            column.
        """
        # Getting features X and y
        self.X = self.bowMatrix.copy()
        self.y = self.myDS.iloc[:, yColumn].values