class TF_Transformer(base.BaseEstimator, base.TransformerMixin):

    def __init__(self):
        self.cv_bi = CountVectorizer(min_df=2, max_df=0.7, ngram_range=(1, 2))
        self.tfidf_trans = TfidfTransformer()
        self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is None
    def fit(self, X, y=None):
        texts = [review.text for review in X]
        counts = self.cv_bi.fit_transform(texts)
        counts_tfidf = self.tfidf_trans.fit_transform(counts)
        self.SVD_trans.fit(counts_tfidf)
        return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
    def transform(self, X):
        texts = [review.text for review in X]
        counts = self.cv_bi.transform(texts)
        counts_tfidf = self.tfidf_trans.transform(counts)
        counts_trunc = self.SVD_trans.transform(counts_tfidf)
        return counts_trunc
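# A minimal sketch of how TF_Transformer might slot into a scikit-learn
# Pipeline. Fit_Review and the LinearSVC step are illustrative assumptions,
# not part of the original code. Note the fixed n_components=300 requires a
# corpus whose vocabulary exceeds 300 terms, so toy inputs will not fit.
from collections import namedtuple

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

Fit_Review = namedtuple('Fit_Review', ['text'])  # assumed: only .text is used

pipeline = Pipeline([
    ('tf', TF_Transformer()),   # counts -> tf-idf -> 300-dim SVD features
    ('clf', LinearSVC()),       # any downstream estimator works here
])
# pipeline.fit(train_reviews, train_labels) would then accept the named
# tuples directly, because TF_Transformer extracts .text itself.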
def text_sentiment(docs_new):
    docs_new = [docs_new]
    # The complete training data lives in this directory, one folder per
    # class (like comp.graphics etc. in the 20newsgroups layout).
    twenty_train = load_files('./Sentiment')
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # The classifier was trained and pickled earlier, e.g.:
    #   clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    #   with open('my_sentiment.pkl', 'wb') as fid:
    #       pickle.dump(clf, fid)
    # Load it again:
    with open('my_sentiment.pkl', 'rb') as fid:
        clf = pickle.load(fid)

    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)
    # predict returns an array; docs_new holds a single document
    return twenty_train.target_names[predicted[0]]
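# A hedged alternative to the pattern above: pickling the whole fitted
# Pipeline keeps the vectorizer's vocabulary and the classifier together, so
# text_sentiment() would not need to re-fit CountVectorizer on every call.
# The './Sentiment' layout and MultinomialNB choice follow the snippet above;
# the pipeline itself is a sketch, not the original author's code.
import pickle

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def train_and_save(model_path='my_sentiment_pipeline.pkl'):
    train = load_files('./Sentiment')
    pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
    pipe.fit(train.data, train.target)
    with open(model_path, 'wb') as fid:
        pickle.dump((pipe, train.target_names), fid)

def predict_sentiment(doc, model_path='my_sentiment_pipeline.pkl'):
    with open(model_path, 'rb') as fid:
        pipe, target_names = pickle.load(fid)
    return target_names[pipe.predict([doc])[0]]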
def runSVCPipeline(entries, langs):
    t0 = time()
    # n_features is assumed to be defined at module level.
    svc_pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', LinearSVC(dual=False, loss='squared_hinge', max_iter=100,
                          random_state=42)),
    ])

    # The same steps run manually below; the pipeline itself is returned unfitted.
    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)

    clf = LinearSVC(dual=False, loss='squared_hinge', max_iter=100,
                    random_state=42)
    clf.fit(X_train_tfidf, langs)

    # Predicting on the training entries themselves (no held-out split here).
    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return svc_pipeline
def getTfidfData(dataTrain, dataTest, dataHold):
    print(dataTrain.target_names)

    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english',
                                 max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print(X_tfidf.shape)

    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print(Y_tfidf.shape)

    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)

    print('feature selection using chi square test', len(dataTrain.target))
    feature_names = count_vect.get_feature_names()

    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print('important features')
        print(feature_names[:10])
    return X_tfidf, Y_tfidf, H_tfidf
def tf(train, test):
    """Transform feature vectors: TF (term frequency only, no IDF)."""
    trf = TfidfTransformer(use_idf=False)
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train, test
class BagOfWordView(View):
    """
    View that processes words (stemming, lowercasing) and counts each
    word's frequency.
    """

    def __init__(self, *args, **kwargs):
        self.count_vec1 = None
        self.count_vec2 = None
        self.tfidf_vec1 = None
        self.tfidf_vec2 = None
        super(BagOfWordView, self).__init__(*args, **kwargs)

    def fit(self, v1, v2, use_idf=False):
        """
        v1, v2: both should be str/unicode, as required by CountVectorizer.fit
        """
        # TODO: add `use_tf` option
        self.count_vec1 = CountVectorizer().fit(v1)
        self.count_vec2 = CountVectorizer().fit(v2)
        self.tfidf_vec1 = TfidfTransformer(use_idf=use_idf).fit(
            self.count_vec1.transform(v1))
        self.tfidf_vec2 = TfidfTransformer(use_idf=use_idf).fit(
            self.count_vec2.transform(v2))
        return self

    def transform(self, v1, v2):
        return self.tfidf_vec1.transform(self.count_vec1.transform(v1)), \
               self.tfidf_vec2.transform(self.count_vec2.transform(v2))
def tfidf(train, test):
    """Transform feature vectors: TF-IDF."""
    trf = TfidfTransformer()
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train, test
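# A short usage sketch for the tf()/tfidf() helpers above; the toy corpus and
# the CountVectorizer step are illustrative assumptions. The point of both
# helpers is that the transformer is fitted on the training matrix only, so
# the test matrix is scaled with statistics learned from training data.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs_train = ['the cat sat', 'the dog ran', 'the cat ran']
docs_test = ['a cat and a dog']

cv = CountVectorizer()
train_counts = cv.fit_transform(docs_train)
test_counts = cv.transform(docs_test)

train_tfidf, test_tfidf = tfidf(train_counts, test_counts)
train_tf, test_tf = tf(train_counts, test_counts)  # TF-only variant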
def train_randomforest(train, test, n_estimators=10, cpus=4):
    import pickle
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    vocabulary_size = 2000  # unused below; max_features=None keeps all terms
    # keep commas and colons
    corpus = [t.text for t in train]
    test_corpus = [t.text for t in test]

    # prepare text training data
    count_vect = CountVectorizer(max_features=None)
    X_train_counts = count_vect.fit_transform(corpus)
    X_test_counts = count_vect.transform(test_corpus)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    X_test_tf = tf_transformer.transform(X_test_counts)

    X_names = []
    X_train = []
    y_train = []
    for card, token_text in zip(train, X_train_tf):
        X_names.append(card.name)
        features = np.concatenate((token_text.toarray().flatten(),
                                   card.types,
                                   [card.power, card.toughness, card.loyalty],
                                   card.colors))
        X_train.append(features)
        y_train.append(card.cost)

    X_test = []
    y_test = []
    X_test_names = []
    for card, token_text in zip(test, X_test_tf):
        X_test_names.append(card.name)
        features = np.concatenate((token_text.toarray().flatten(),
                                   card.types,
                                   [card.power, card.toughness, card.loyalty],
                                   card.colors))
        X_test.append(features)
        y_test.append(card.cost)

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)
    y_test = np.asarray(y_test)

    rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=cpus)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)
    print(y_pred.shape, y_train.shape)
    print("naive train loss", np.mean(custom_loss(y_train, y_pred)))
    y_pred = rf.predict(X_test)
    print("naive test loss", np.mean(custom_loss(y_test, y_pred)))

    result = print_predictions(y_pred, y_test, X_test_names)
    print("saving to output.naive.txt and output.naive.p")
    pickle.dump(result, open('output.naive.p', 'wb'))
def test_transformer_idf_setter():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    copy = TfidfTransformer()
    copy.idf_ = orig.idf_
    assert_array_equal(
        copy.transform(X).toarray(),
        orig.transform(X).toarray())
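# Why a settable idf_ matters: it lets you rebuild a transformer from a stored
# IDF vector without re-fitting on the corpus. A small illustrative sketch
# (the JSON round-trip and toy corpus are assumptions, not part of the test above):
import json

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ['pizza pizza beer', 'salad salad water', 'pizza salad']
X = CountVectorizer().fit_transform(docs)

fitted = TfidfTransformer().fit(X)
stored = json.dumps(fitted.idf_.tolist())     # persist only the IDF weights

restored = TfidfTransformer()
restored.idf_ = np.array(json.loads(stored))  # restore without re-fitting
assert np.allclose(restored.transform(X).toarray(),
                   fitted.transform(X).toarray())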
class NaiveBayesClassifier(object):
    '''
    Multinomial naive Bayes over chi2-selected tf-idf features.
    '''

    def __init__(self):
        self.classifier = MultinomialNB()

    def trainClassifier(self, trainingDocs, labels):
        self.trainingDocs = trainingDocs
        self.labels = labels
        self.count_vect = CountVectorizer(stop_words='english')
        X_train_counts = self.count_vect.fit_transform(self.trainingDocs)
        self.tf_transformer = TfidfTransformer(
            use_idf=True, sublinear_tf=True).fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        self.ch2 = SelectKBest(chi2)
        X_train = self.ch2.fit_transform(X_train_tf, self.labels)
        self.classifier.fit(X_train, self.labels)

    def classify(self, docs_new):
        X_new_counts = self.count_vect.transform(docs_new)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted

    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted

    def score(self, docs_test, labels):
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        accuracy = np.mean(self.predicted == labels)
        return accuracy
def tfidf_preprocessor(*args):
    x_train, x_test = args[0], args[1]
    x_train = [x.doc_2_vec for x in x_train]
    x_test = [x.doc_2_vec for x in x_test]
    tfidf_model = TfidfTransformer().fit(x_train)
    x_train_tfidf = tfidf_model.transform(x_train)
    x_test_tfidf = tfidf_model.transform(x_test)
    return x_train_tfidf, x_test_tfidf
def TextTransform(X, Xtest=None):
    Write("Process Data with TFIDF...\n")
    tfidf = TfidfTransformer()
    if Xtest is None:
        return tfidf.fit_transform(X).toarray()
    tfidf.fit(X)
    return tfidf.transform(X).toarray(), tfidf.transform(Xtest).toarray()
class feature1:

    def __init__(self):
        self.count_vect = CountVectorizer(input='content', ngram_range=(2, 3),
                                          min_df=0.2, max_df=1.0)

    def preprocess_X(self, X):
        X = [ans_to_tag[ans] for ans in X]
        X_train_counts = self.count_vect.fit_transform(X)
        self.tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        return X_train_tf

    def preprocess_Y(self, Y):
        X_new_counts = self.count_vect.transform(Y)
        return self.tf_transformer.transform(X_new_counts)
class OneClassClassifier(object):
    '''
    One-class SVM over term-frequency features.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        # gamma=0.0 meant "1 / n_features" in older scikit-learn releases;
        # newer releases use gamma='scale' or 'auto' instead.
        self.classifier = svm.OneClassSVM(kernel="rbf", gamma=0.0)

    def trainClassifier(self, trainingDocs, labels):
        self.count_vect = CountVectorizer(stop_words='english')
        X_train_counts = self.count_vect.fit_transform(trainingDocs)
        self.tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        self.ch2 = SelectKBest(chi2, k=100)
        X_train = self.ch2.fit_transform(X_train_tf, labels)
        self.classifier.fit(X_train)

    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        # Note: unlike score(), this skips the chi2 selection step, so the
        # feature count will not match the one the classifier was fitted on.
        X_test = X_new_tfidf
        self.predicted = self.classifier.predict(X_test)
        return self.predicted

    def score(self, docs_test, labels):
        '''
        Here labels are 1 and -1
        '''
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        print(self.predicted)
        accuracy = np.mean(self.predicted == labels)
        return accuracy
def svm_bench():
    data_file = "./data/dataset.pkl"
    train_set, valid_set, test_set, word2id, pop2id, type2id = dataset.load_data(data_file)

    train_set_x, train_set_y = train_set
    train_set_pop_y, train_set_type_y, train_set_loc_y = train_set_y
    valid_set_x, valid_set_y = valid_set
    valid_set_pop_y, valid_set_type_y, valid_set_loc_y = valid_set_y
    test_set_x, test_set_y = test_set
    test_set_pop_y, test_set_type_y, test_set_loc_y = test_set_y

    id2word = {v: k for k, v in word2id.items()}
    word_train_set_x = [sen_dig2word(doc, id2word) for doc in train_set_x]
    word_valid_set_x = [sen_dig2word(doc, id2word) for doc in valid_set_x]
    word_test_set_x = [sen_dig2word(doc, id2word) for doc in test_set_x]

    # construct the word count matrix
    count_vect = CountVectorizer()
    x_train_count = count_vect.fit_transform(word_train_set_x)
    x_valid_count = count_vect.transform(word_valid_set_x)
    x_test_count = count_vect.transform(word_test_set_x)

    tfidf_transformer = TfidfTransformer()
    x_train_tfidf = tfidf_transformer.fit_transform(x_train_count)
    x_valid_tfidf = tfidf_transformer.transform(x_valid_count)
    x_test_tfidf = tfidf_transformer.transform(x_test_count)

    # train the population model
    pop_clf = svm.LinearSVC().fit(x_train_tfidf, train_set_pop_y)
    pop_pred = pop_clf.predict(x_valid_tfidf)
    pop_pred_test = pop_clf.predict(x_test_tfidf)

    # compute the performance
    pop_errors = np.mean(np.not_equal(pop_pred, valid_set_pop_y))
    pop_errors_test = np.mean(np.not_equal(pop_pred_test, test_set_pop_y))

    # train the event type model
    type_clf = svm.LinearSVC().fit(x_train_tfidf, train_set_type_y)
    type_pred = type_clf.predict(x_valid_tfidf)
    type_pred_test = type_clf.predict(x_test_tfidf)

    # compute the performance
    type_errors = np.mean(np.not_equal(type_pred, valid_set_type_y))
    type_errors_test = np.mean(np.not_equal(type_pred_test, test_set_type_y))

    print("SVM Valid--> Type error: %0.2f, Population error: %0.2f" % (type_errors, pop_errors))
    print("SVM Test--> Type error: %0.2f, Population error: %0.2f" % (type_errors_test, pop_errors_test))
def cross_val_score(clf, data, target, k):
    size = len(data)
    shuffle_arr = list(range(size))
    scores = []
    # Note: each iteration draws a fresh shuffled split, so this is repeated
    # random subsampling rather than a strict k-fold partition.
    for i in range(0, k):
        # generate shuffled train and test datasets
        data_train_raw = []
        data_test_raw = []
        target_train = []
        target_test = []

        # separate shuffled train and test datasets
        random.shuffle(shuffle_arr)
        shuffle_train = shuffle_arr[:size - size // k]
        shuffle_test = shuffle_arr[size - size // k:]
        for j in shuffle_train:
            data_train_raw.append(data[j])
            target_train.append(target[j])
        for r in shuffle_test:
            data_test_raw.append(data[r])
            target_test.append(target[r])

        data_train = data_process(data_train_raw)
        data_test = data_process(data_test_raw)

        # transform the array of strings to counts
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(data_train)
        # transform counts to frequencies
        tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        X_train_tf = tf_transformer.transform(X_train_counts)
        # feature selection
        select = SelectPercentile(chi2, percentile=10)
        X_train_fs = select.fit_transform(X_train_tf, target_train)

        # train the model
        clf_train = clf.fit(X_train_fs, target_train)

        # test the model
        X_new_counts = count_vect.transform(data_test)
        X_new_tfidf = tf_transformer.transform(X_new_counts)
        X_new_fs = select.transform(X_new_tfidf)
        test_result = clf_train.predict(X_new_fs)
        scores.append(GetPrecisionRecallF1(test_result, target_test))
    return scores
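# The same evaluation can be sketched with scikit-learn's own utilities:
# ShuffleSplit reproduces the repeated random splits above, and a Pipeline
# keeps the count/tf/chi2 steps fitted on each training fold only. This is an
# alternative formulation, not the original author's code; data_process() and
# GetPrecisionRecallF1() are assumed from the surrounding module.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline

def cross_val_score_sklearn(clf, data, target, k):
    pipe = Pipeline([
        ('vect', CountVectorizer()),
        ('tf', TfidfTransformer(use_idf=False)),
        ('chi2', SelectPercentile(chi2, percentile=10)),
        ('clf', clf),
    ])
    scores = []
    for train_idx, test_idx in ShuffleSplit(n_splits=k, test_size=1.0 / k).split(data):
        pipe.fit(data_process([data[j] for j in train_idx]),
                 [target[j] for j in train_idx])
        pred = pipe.predict(data_process([data[j] for j in test_idx]))
        scores.append(GetPrecisionRecallF1(pred, [target[j] for j in test_idx]))
    return scores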
def tfidf_step_by_step():
    """
    Example of calculating TF-IDF for OSM nodes.
    A document is a list of keys.
    """
    learn_data_set = documents_gen()
    test_data_set = documents_gen()

    # calculate term frequency
    vectorizer = CountVectorizer(stop_words=stop_words,
                                 token_pattern=r'[a-z0-9_\-:]+')
    vectorizer.fit_transform(learn_data_set)
    # pprint.pprint(vectorizer.vocabulary_)

    # freq_term_matrix is a sparse matrix (elements stored in COOrdinate
    # format, http://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_.28COO.29)
    freq_term_matrix = vectorizer.transform(test_data_set)
    # freq_term_matrix.todense()

    # l2 - Euclidean normalization
    # http://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    tf_idf = tfidf.transform(freq_term_matrix)

    pprint.pprint(tf_idf.todense())
def estimation(file='song_text.txt', separator=u'--text--'):
    arr = text_split_line(file, separator)
    dvect = data_vector(arr)
    target = dvect[0]
    text = dvect[1]
    dic = dvect[2]  # for converting target integer to artist name

    count_vect = CountVectorizer()
    word_vect = count_vect.fit_transform(text)
    tfidf_transformer = TfidfTransformer()
    vect_tfidf = tfidf_transformer.fit_transform(word_vect)

    # SVC is one of the best for text; see the "working with text" tutorial
    machine = svm.SVC(probability=True)
    machine.fit(vect_tfidf, target)
    print(machine.score(vect_tfidf, target))
    prediction = machine.predict(vect_tfidf)
    # accuracy test (tutorial); note this is training accuracy, not held-out
    print(u'model predictive accuracy: {:.1%}'.format(np.mean(prediction == target)))

    new_texts = [text[500], text[2345], text[-2], text[0], text[5893]]
    new_data = count_vect.transform(new_texts)
    new_tfidf = tfidf_transformer.transform(new_data)
    prediction = machine.predict(new_tfidf)
    for i in range(len(new_texts)):
        print(u'{}\t=> {}'.format(new_texts[i].splitlines()[:2],
                                  dic[prediction[i]]))
    return
def race_tfidf(data, can_be_noun_arg, stop_words):
    print()
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    # sublinear_tf expects a boolean, not the string 'True'
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    tfidf.fit(counts)
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.items(),
                        key=operator.itemgetter(1))
    terms, _ = zip(*sorted_voc)
    for i, k in enumerate(data.keys()):
        # make a (term, weight) list for this race, sorted by weight
        row = np.array(tf_idf_matrix.todense()[i, :])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print(freqs[k][:5])
    return freqs
class VectorModel(object):

    def __init__(self, list_of_comments=None):
        self.__list_of_comments = list_of_comments
        self.__vectorizer = []
        self.__corpus_simple_vector = []
        self.__transformer = []
        self.__corpus_tf_idf = []

    def prepare_models(self):
        self.__vectorizer = CountVectorizer()
        vector = self.__vectorizer.fit_transform(self.__list_of_comments)
        self.__corpus_simple_vector = vector.toarray()
        self.__transformer = TfidfTransformer()
        tfidf = self.__transformer.fit_transform(self.__corpus_simple_vector)
        self.__corpus_tf_idf = tfidf.toarray()
        return [self.__vectorizer, self.__corpus_simple_vector,
                self.__transformer, self.__corpus_tf_idf]

    def set_models(self, vectorizer, transformer):
        self.__vectorizer = vectorizer
        self.__transformer = transformer

    def get_comment_frequency_vector(self, comments):
        vec_comments = [i for i in comments]
        return self.__vectorizer.transform(vec_comments).toarray()

    def get_comment_tf_idf_vector(self, comments):
        vector = self.get_comment_frequency_vector(comments)
        return self.__transformer.transform(vector).toarray()
class UnitClassifier(Trainer):

    def __init__(self, x, y, train_ratio):
        super(UnitClassifier, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))
        clf = RandomForestClassifier(verbose=0, n_jobs=-1, n_estimators=20)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'misclass': np.mean(y_pred != self._y_test),
            'report': classification_report(self._y_test, y_pred,
                                            target_names=self._model.classes_),
        }
class CaloriesRegressor(Trainer):

    def __init__(self, x, y, train_ratio):
        super(CaloriesRegressor, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))
        clf = RandomForestRegressor(verbose=0, n_jobs=-1, n_estimators=100)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'median_absolute_error':
                median_absolute_error(self._y_test, y_pred),
            'mean_squared_error':
                mean_squared_error(self._y_test, y_pred),
            'explained_variance_score':
                explained_variance_score(self._y_test, y_pred),
        }
def check_webshell(clf, dir):
    total = 0
    total_php = 0
    webshell = 0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore",
                         max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit(x)

    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            full_path = os.path.join(path, filename)
            t = load_file(full_path)
            t_list = [t]
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            total += 1
            if filename.endswith('.php'):
                total_php += 1
            if y_pred[0] == 1:
                print("%s is webshell" % full_path)
                webshell += 1

    print("Scanned %d files (%d php files), %d files are webshells" %
          (total, total_php, webshell))
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features:
    Term Frequency times Inverse Document Frequency (tf-idf):
    X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data in a list. Only 30,000 randomly sampled reviews are
        used, for efficiency.
    test_data : List[str]
        Test data in a list.

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix, scipy.sparse.csr.csr_matrix, list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    # set up a count vectorizer that removes English stopwords when building
    # the term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
    # build the term frequency per document matrix from a random sublist of
    # 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)
    tfidf_transformer = TfidfTransformer()
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)

    vocab = count_vect.get_feature_names()
    return (train_tfidf, test_tfidf, vocab)
def load_dataset(prefix, sufix, dic_fn,
                 vocab_fn='./data/english_review.trn-100000.vocab'):
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"
    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}
    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]
    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    vocab = [l.strip().lower().split("\t")[0] for l in open(vocab_fn)]
    count_vect = CountVectorizer(vocabulary=vocab)
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count).toarray()
    test_set_x = tfidf_transformer.transform(test_set_count).toarray()

    train_shared_x, train_shared_y = shared_dataset([train_set_x, train_y])
    test_shared_x, test_shared_y = shared_dataset([test_set_x, test_y])
    return [(train_shared_x, train_shared_y), (test_shared_x, test_shared_y)]
def test_classifiers():
    print("running bayes classifier..")
    # train_bayesian_classifier_from_scratch()

    dataset = get_thing_from_file("training_dataset.txt")
    print(dataset.target_names)

    bayes = get_thing_from_file("bayes.txt")
    bayes_model = bayes.fit(dataset.data, dataset.target)
    bayes_model = get_thing_from_file("bayes_model.txt")

    url_arr = []
    # for url in get_test_articles():
    #     url_arr.append(url)
    article_arr = get_article_array(url_arr)

    docs_new = ['God is love', 'OpenGL on the GPU is fast']

    # NOTE: these two transformers are freshly constructed and therefore
    # unfitted; to transform new documents they must be the instances that
    # were fitted when bayes_model was trained (e.g. unpickled with it).
    count_vect = CountVectorizer()
    tfidf_trans = TfidfTransformer()
    x_new_counts = count_vect.transform(docs_new)
    x_new_horse = tfidf_trans.transform(x_new_counts)

    predicted = bayes_model.predict(x_new_horse)
    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, dataset.target_names[category]))
def cal_product_description_tfidf():
    # PART II: compute the tf-idf for product description
    # AllSet and search_term are module-level in the original script
    global AllSet

    print("\nBegin: compute the tf-idf for product description ...")
    product_description_data = pd.read_csv('product_descriptions.csv')

    print("\nMerge the product description into the database...")
    AllSet = pd.merge(AllSet, product_description_data, how='left', on='product_uid')

    print("\nStemming the product description ...")
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description = AllSet['product_description']

    print("\nGet the (product description vocabulary)-(search term) frequency matrix...")
    # use binary values to indicate occurrence
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)
    search_vect_descrip.fit(product_description)  # learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term)

    print("\nGet the (product description vocabulary)-(product_description) frequency matrix...")
    description_vect = CountVectorizer(stop_words='english')
    description_vect.fit_transform(product_description)  # learn the vocabulary
    description_fq_matrix = description_vect.transform(product_description)

    print("\nGet the idf matrix...")
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix)  # get idf for each vocabulary term
    tf_idf_descrip_matrix = tfidf_transformer.transform(description_fq_matrix)

    print("\nCompute the result of tf-idf for product description ...")
    tf_idf_descrip_result = []
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append(
            (np.multiply(tf_idf_descrip_matrix[index],
                         search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],
                  "product_description_tfidf": tf_idf_descrip_result}
                 ).to_csv('product_description_tfidf.csv', index=False)
def cal_product_title_tfidf():
    # PART I: compute the tf-idf for product title
    print("\nBegin: compute the tf-idf for product title ...")

    print("\nStemming product_title...")
    AllSet['product_title'] = AllSet['product_title'].map(lambda x: stem_process(x))
    product_title = AllSet['product_title']

    print("\nGet the (product title vocabulary)-(search term) frequency matrix...")
    # use binary values to indicate occurrence
    search_vect_title = CountVectorizer(stop_words='english', binary=True)
    search_vect_title.fit(product_title)  # learn the vocabulary
    search_title_fq_matrix = search_vect_title.transform(search_term)

    print("\nGet the (product title vocabulary)-(product_title) frequency matrix")
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)  # learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title)

    print("\nGet the idf matrix")
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix)  # get idf for each vocabulary term
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix)

    print("\nCompute the result of tf-idf for product title ...")
    tf_idf_title_result = []
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append(
            (np.multiply(tf_idf_title_matrix[index],
                         search_title_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],
                  "product_title_tfidf": tf_idf_title_result}
                 ).to_csv('product_title_tfidf.csv', index=False)
    return 0
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"
    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}
    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]
    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct the tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print("start the model")
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
def tfidf_score(train_set, test_set):
    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))

    # Remove all the None types from the input datasets
    # (list() is needed in Python 3, where filter() returns an iterator)
    train_set = list(filter(None, train_set))
    test_set = list(filter(None, test_set))

    vectorizer.fit_transform(train_set)
    # print("Word Index is {0} \n".format(vectorizer.vocabulary_))
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    # print("IDF scores:", tfidf.idf_)
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()

    # NOTE: mlist/cos_sim have one row per *test* document, so this loop
    # implicitly assumes train_set and test_set line up one-to-one.
    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = [mlist[count][0], cos_sim[count][0]]
        count += 1
    return tfidfscores
# See below for a simple example.
#
# **Example:**
#
# Consider a document containing 100 words, in which the word "cat" appears 3 times.
#
# The term frequency (i.e., tf) for "cat" is then (3 / 100) = 0.03. Now, assume we
# have 10 million documents and the word "cat" appears in one thousand of these.
# The inverse document frequency (i.e., idf) is then calculated as
# log(10,000,000 / 1,000) = 4, so the tf-idf weight is the product of these
# quantities: 0.03 * 4 = 0.12.
# ____
#
# Let's go ahead and see how we can do this in SciKit Learn:

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)

tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

# Let's check the IDF (inverse document frequency) of the word `"u"` and of the word `"university"`:

print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

# To transform the entire bag-of-words corpus into a TF-IDF corpus at once:

messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

# There are many ways the data can be preprocessed and vectorized. These steps
# involve feature engineering and building a "pipeline". I encourage you to check
# out SciKit Learn's documentation on dealing with text data, as well as the
# expansive collection of available papers and books on the general topic of NLP.

# ## Training a model
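# The worked example above can be verified directly. Note it uses a base-10
# log; scikit-learn's TfidfTransformer uses the natural log plus smoothing,
# so the library's idf_ values will differ from this hand calculation.
import math

tf = 3 / 100                          # "cat" appears 3 times in a 100-word doc
idf = math.log10(10_000_000 / 1_000)  # 10M docs, "cat" appears in 1,000
print(tf * idf)                       # 0.03 * 4.0 = 0.12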
    return [w for w in nopunc.split()
            if w.lower() not in stopwords.words("english")]


bow_transformer = CountVectorizer(analyzer=text_process).fit(messages["message"])
print(len(bow_transformer.vocabulary_))

messages_bow = bow_transformer.transform(messages["message"])
print("shape of the Sparse Matrix:", messages_bow.shape)

tfidf_trans = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_trans.transform(messages_bow)

spam_detect = MultinomialNB().fit(messages_tfidf, messages["label"])
print(spam_detect.predict(messages_tfidf[4])[0])

msg_train, msg_test, la_train, la_test = train_test_split(messages["message"],
                                                          messages["label"],
                                                          test_size=.3)

pipes = Pipeline([("bow", CountVectorizer(analyzer=text_process)),
                  ("tfidf", TfidfTransformer()),
                  ("classifier", MultinomialNB())])

pipes.fit(msg_train, la_train)
    return (data, target)


X, y = get_data(DATA_DIR)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10,
                                                    random_state=42)

count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)
# clf = SGDClassifier(tol=None, n_jobs=-1).fit(X_train_tfidf, y_train)

X_test_counts = count_vec.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

text_clf = Pipeline([('count_vec', CountVectorizer()),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('clf', SGDClassifier(tol=None, n_jobs=-1))]).fit(X_train, y_train)

# The pipeline's predictions overwrite the naive Bayes predictions here, so
# only text_clf is actually evaluated below.
y_pred = clf.predict(X_test_tfidf)
y_pred = text_clf.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=target_names))
# jaccard_score needs an averaging strategy for multiclass targets
print(metrics.jaccard_score(y_test, y_pred, average='weighted'))
def run_classifer(X_train, s_train, y_train, X_test, s_test, y_test):
    s_train = np.array(s_train)  # samples x features
    s_test = np.array(s_test)
    num_labels = 15
    batch_size = 100

    stemmer = sb.SnowballStemmer('english')
    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    # the tokenizer complained about not having these as stop words
    swlist += ["'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe',
               'dure', 'might', 'must', "n't", 'need', 'onc', 'onli',
               'ourselv', 'sha', 'themselv', 'veri', 'whi', 'wo', 'would',
               'yourselv']
    pubs = ['buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart',
            'fox', 'guardian', 'review', 'theatlant']
    # punctuation left in for now:
    # [':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']',
    #  '…', '?', '—', '‘', '$']
    punct = []
    swlist += pubs
    swlist += punct

    tkzr = StemTokenizer() if sys.argv[4].lower() == 'true' else None
    if sys.argv[5].lower() != 'true':
        swlist = []

    # what features are we using?
    if sys.argv[7].lower() == 'word':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train = tfidf_transformer.transform(X_train)
        X_test = tfidf_transformer.transform(X_test)
    elif sys.argv[7].lower() == 'topic':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train = lda_model.transform(X_train)
        X_test = lda_model.transform(X_test)
    elif sys.argv[7].lower() == 'style':
        X_train = csr_matrix(s_train)
        X_test = csr_matrix(s_test)
    elif sys.argv[7].lower() == 'all':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train_tf = tfidf_transformer.transform(X_train)
        X_test_tf = tfidf_transformer.transform(X_test)
        print(type(X_train_tf))
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train_lda = lda_model.transform(X_train)
        X_test_lda = lda_model.transform(X_test)
        print(type(X_train_lda))
        X_train = csr_matrix(sparse.hstack(
            [X_train_tf, csr_matrix(X_train_lda), csr_matrix(s_train)]))
        X_test = csr_matrix(sparse.hstack(
            [X_test_tf, csr_matrix(X_test_lda), csr_matrix(s_test)]))
        print(type(X_train))
    else:
        sys.exit('unknown features')

    # optional: persist the features and model for reuse, e.g.
    # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train)
    # model.save(sys.argv[6] + '.h5')

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)

    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)

    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
    print('Test accuracy:', score[1])

    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    predicted = np.argmax(y_pred, axis=1)
    p, r, fs, s = precision_recall_fscore_support(np.argmax(y_test, axis=1),
                                                  predicted)
    print(p, r, fs, s)
# Sparsity: comparing non-zero entries vs the total number of entries
sparsity = (100.0 * message_bow.nnz /
            (message_bow.shape[0] * message_bow.shape[1]))
print('sparsity: {}'.format(sparsity))

from sklearn.feature_extraction.text import TfidfTransformer

# calculating the term frequency and inverse document frequency
tfidf_transformer = TfidfTransformer().fit(message_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

# calculating TF-IDF for the whole bag of words
messages_tfidf = tfidf_transformer.transform(message_bow)

# Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])

# Detection filter using naive Bayes for label 2
print('predicted:', spam_detect_model.predict(tfidf4)[0])
print('expected:', messages.label[2])

# Model evaluation on all messages
all_predictions = spam_detect_model.predict(messages_tfidf)
# print(tf_matrix_2.shape)

# print("\nCalculating inverse document frequency (IDF) matrices")
# Each vector's component is now the idf for each term
tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
# print(tfidfTran.idf_)

# Manually verify that the IDF is correct
# print("The idf for terms that appear in one document: " + str(idf(2,1)))
# print("The idf for terms that appear in two documents: " + str(idf(2,2)))

# print("\nCreating the TF-IDF matrices")
# The transform method multiplies the tf matrix by the diagonal idf matrix,
# then divides each row of the tf-idf matrix by its Euclidean norm
tfidf_matrix = tfidfTran.transform(tf_matrix)
# print(tfidf_matrix.toarray())

# print("\nCreating the cosine similarity matrices")
# Multiply the matrix by its transpose to get the final result
cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()
print("#: {}, score: {}".format(i + 1, cos_similarity_matrix[0][1]))
group_set[i]['similarity'] = cos_similarity_matrix[0][1]

# Calculating the average and appending it to group_set
# print("\nCalculating averages")
# num_reviews = len(current_set)
# for i in range(num_reviews):
#     # i is the index of the focal review
#     # cos_similarity_matrix[i] contains the array of similarity scores
#     # for the focal review
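# A sketch of the idf() helper the commented verification lines refer to (its
# exact definition isn't shown above, so this is an assumption). With
# TfidfTransformer's defaults (smooth_idf=True), the formula is
# idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1.
import math

def idf(n_docs, doc_freq):
    """Smoothed inverse document frequency, matching sklearn's default."""
    return math.log((1 + n_docs) / (1 + doc_freq)) + 1

# e.g. with 2 documents: terms appearing in one vs. both documents
print(idf(2, 1))  # ~1.405
print(idf(2, 2))  # 1.0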
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5], [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3], [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')
    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type: the original assigned the string to
    # v3.set_params itself, which clobbers the method instead of calling it
    v3.set_params(analyzer='_invalid_analyzer_type_')
    with pytest.raises(ValueError):
        v3.build_analyzer()
    def build(self, texts: list, topics: list):
        token_pattern = r"(?u)\b\w\w+\b"
        vectorizer = CountVectorizer(
            token_pattern=token_pattern,
            stop_words=get_stop_words('russian')
        )
        X = vectorizer.fit_transform(texts)
        transformer = TfidfTransformer()
        X = transformer.fit_transform(X)

        if self._vectorizer_path is not None:
            with open(self._vectorizer_path, 'w') as f:
                for key, value in vectorizer.vocabulary_.items():
                    f.write("%s " % key)
                    f.write("%s \n" % value)

        classifier = SGDClassifier(
            loss="log",
            class_weight='balanced',
            penalty='l1',
            alpha=0.0000009,
            n_jobs=-1
        )

        if self._test_quality_iters is not None:
            for i in range(self._test_quality_iters):
                X_train, X_test, y_train, y_test = train_test_split(
                    X, topics, test_size=0.33)
                classifier.fit(X_train, y_train)
                predicted = classifier.predict(X_test)
                print('Accuracy', np.mean(predicted == y_test))

        # X_test/y_test must have been set either above or here; otherwise
        # the evaluation below has nothing to run on.
        if self._X_test is not None:
            X_test = self._X_test
            y_test = self._y_test

        classifier.fit(X, topics)
        sorted_topics = np.unique(topics)
        X_transformed = transformer.transform(vectorizer.transform(X_test))
        predicted = classifier.predict(X_transformed)
        print('Accuracy on real tests:', np.mean(predicted == y_test))

        # vocabulary = vectorizer.get_feature_names()  # for human-friendly features

        if self._tests_path is not None:
            with open(self._tests_path, 'w') as t:
                t.write("%s %s\n" % (self._tests_num, X.shape[1]))
                for index in range(len(X_test)):
                    doc = X_transformed[index]
                    probs = classifier.predict_proba(doc)
                    for item in probs[0]:
                        t.write("%s " % item)
                    t.write("\n")
                    orig_doc = X_test[index]
                    t.write("%s \n" % orig_doc)
                    for item in doc.toarray()[0]:
                        t.write("%s " % item)
                    t.write("\n")
                    print(orig_doc)
                    pred_topics = {}
                    for i in range(len(probs[0])):
                        probability = probs[0][i]
                        topic = sorted_topics[i]
                        pred_topics[topic] = probability
                    print(sorted(pred_topics.items(),
                                 key=lambda kv: kv[1], reverse=True))
                    print("______")

        # Guard on _weights_path alone: the original also wrote weights when
        # only _tests_path was set, which would crash with a None path.
        if self._weights_path is not None:
            with open(self._weights_path, 'w') as f:
                f.write("%s " % classifier.coef_.shape[0])    # number of classes
                f.write("%s \n" % classifier.coef_.shape[1])  # number of features
                for line in classifier.classes_:
                    f.write("%s \n" % line)
                for line in classifier.coef_:
                    for index, item in enumerate(line):
                        if item != 0.0:
                            f.write("%s %s " % (index, item))
                    f.write("\n")
                for item in classifier.intercept_:
                    f.write("%s " % item)

        return classifier