def test_model_tfidf_transform(self):
    """Convert every supported TfidfTransformer configuration to ONNX.

    A small bag-of-words matrix is built from a five-sentence corpus
    with ``CountVectorizer`` and cast to ``float32``.  For each
    combination of ``norm``, ``smooth_idf`` and ``use_idf`` a
    ``TfidfTransformer`` is fitted, converted with ``convert_sklearn``
    and handed to ``dump_data_and_model`` under a basename that encodes
    the configuration.

    ``sublinear_tf=True`` is intentionally not exercised: the loop
    breaks as soon as it reaches that value (see the comment below).
    """
    corpus = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
        "Troisième document en français",
    ]).reshape((5, 1))
    data = CountVectorizer(ngram_range=(1, 1)).fit_transform(
        corpus.ravel()).todense()
    data = data.astype(numpy.float32)

    for sublinear_tf in (False, True):
        if sublinear_tf:
            # scikit-learn applies a log on a matrix
            # but only on strictly positive coefficients
            break
        for norm in (None, 'l1', 'l2'):
            for smooth_idf in (False, True):
                for use_idf in (False, True):
                    model = TfidfTransformer(
                        norm=norm, use_idf=use_idf,
                        smooth_idf=smooth_idf,
                        sublinear_tf=sublinear_tf)
                    model.fit(data)

                    model_onnx = convert_sklearn(
                        model, 'TfidfTransformer',
                        [('input',
                          FloatTensorType([1, data.shape[1]]))])
                    self.assertIsNotNone(model_onnx)

                    # The suffix makes each dumped artefact name unique
                    # per configuration, e.g. "L2IdfSmooth".
                    suffix = norm.upper() if norm else ''
                    suffix += 'Sub' if sublinear_tf else ''
                    suffix += 'Idf' if use_idf else ''
                    suffix += 'Smooth' if smooth_idf else ''

                    dump_data_and_model(
                        data, model, model_onnx,
                        basename="SklearnTfidfTransform" + suffix,
                        # Operator mul is not implemented in onnxruntime
                        allow_failure=(
                            "StrictVersion(onnx.__version__) "
                            "< StrictVersion('1.2')"
                        ),
                    )
class EasyNLP(EasyClassi):
    """
    class EasyNLP
    Natural language processing helper built on the classification class.

    Its parent is the classification class (EasyClassi) and, per the
    original author's note, its grandparent is the main generic MayML
    class.  The methods below read ``self.myDS``, which is not set in
    this class -- presumably it is provided by a parent class (verify).
    """

    def __init__(self):
        """
        __init__(self)
        Constructor; only delegates to the parent constructor.
        """
        super(EasyNLP, self).__init__()

    def downloadSTOPWords(self):
        """
        downloadSTOPWords
        Download the NLTK "stopwords" resource, i.e. the words that
        should be removed from the text.
        """
        nltk.download("stopwords")

    def cleanTXT(self, fieldToReview="Review"):
        """
        cleanTXT
        Clean every text entry of the dataset column ``fieldToReview``
        and store the results in ``self.corpus`` (a list of strings).

        For each entry:
          - every character that is not an ASCII letter becomes a space
          - the text is lower-cased and split on whitespace
          - English stop words are dropped
          - the remaining words are stemmed (verb tenses, plurals...)
          - the stems are joined back with single spaces
        """
        self.corpus = []

        # Loop-invariant helpers: build them once instead of once per
        # word (stop-word set) or once per document (stemmer, regex).
        stemmer = PorterStemmer()
        stop_words = set(stopwords.words("english"))
        non_letters = re.compile(r"[^a-zA-Z]")

        for text in self.myDS[fieldToReview]:
            tokens = non_letters.sub(" ", text).lower().split()
            stems = [
                stemmer.stem(token)
                for token in tokens
                if token not in stop_words
            ]
            self.corpus.append(" ".join(stems))

    def createBagOfWords(self, maximumFtrs=None):
        """
        createBagOfWords
        Build the bag-of-words matrix from ``self.corpus``.

        maximumFtrs: Maximum number of words collected; ``None`` (the
                     default) keeps the CountVectorizer default.

        The dense array returned by ``fit_transform(...).toarray()`` is
        stored in ``self.bowMatrix``; the vectorizer itself is not kept.
        """
        # max_features=None is CountVectorizer's own default, so a single
        # call covers both the limited and the unlimited case.
        vectorizer = CountVectorizer(max_features=maximumFtrs)
        self.bowMatrix = vectorizer.fit_transform(self.corpus).toarray()

    def split_X_y(self, yColumn=-1):
        """split_X_y (self,yColumn=-1)
        Process and get X and y:
        X is a copy of the bag-of-words matrix.
        y is taken from the original source of data (``self.myDS``).

        If the y column index is not given (or is -1), y is read from
        the last column.
        """
        # Getting features X and y
        self.X = self.bowMatrix.copy()
        self.y = self.myDS.iloc[:, yColumn].values