# Feature scaling: fit each transformer on the TRAINING set only, then
# apply the already-fitted transform to the test set. The original code
# called fit_transform on XtestPos, which re-fits the scaler on test
# data and leaks test-set statistics into preprocessing.
scaler = preprocessing.StandardScaler().fit(XtrainPos)
XtrainPos = scaler.transform(XtrainPos)
XtestPos = scaler.transform(XtestPos)

# Alternative kept for experimentation:
# scaler = preprocessing.MinMaxScaler().fit(XtrainPos)
# XtrainPos = scaler.transform(XtrainPos)
# XtestPos = scaler.transform(XtestPos)

# L2-normalize each sample after standardization (Normalizer is
# stateless per-row, but we still fit once and transform both splits
# for consistency with the pattern above).
normalizer = preprocessing.Normalizer().fit(XtrainPos)
XtrainPos = normalizer.transform(XtrainPos)
XtestPos = normalizer.transform(XtestPos)

# Classification: coordinate-descent classifier with a group-lasso
# ("l1/l2") penalty and squared hinge loss.
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   C=1,
                   alpha=1e-4,
                   tol=1e-3)
# clf = LinearSVC(penalty="l2")  # alternative baseline
clf = clf.fit(XtrainPos, YtrainPos)
print(metrics.classification_report(YtestPos, clf.predict(XtestPos)))

# Cross-validation (5 different splits) — kept for reference:
# scores = cross_validation.cross_val_score(clf_svm, posfeat, label, cv=5, scoring='f1')
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Visualization — kept for reference:
# plt.hist(XtrainPos[:, 0])
# plt.show()
terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs), metadata=[ '/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames ]) file_name = "demo_sklearn_sfs.html" open(file_name, 'wb').write(html.encode('utf-8')) print("open " + file_name) sfs = (corpus.get_scaled_f_scores('alt.atheism', beta=1) - 0.5) * 2 html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=sfs, use_term_significance=False, terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs), metadata=[ '/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames ]) file_name = "demo_sklearn_sfs_beta1.html" open(file_name, 'wb').write(html.encode('utf-8')) print("open " + file_name) newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes')) X_test = vectorizer.transform(newsgroups_test.data) pred = clf.predict(X_test) f1 = f1_score(pred, newsgroups_test.target, average='micro') print("Microaveraged F1 score", f1)
terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs), metadata = ['/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames] ) file_name = "demo_sklearn_sfs.html" open(file_name, 'wb').write(html.encode('utf-8')) print("open " + file_name) sfs = (corpus.get_scaled_f_scores('alt.atheism', beta=1) - 0.5) * 2 html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=sfs, use_term_significance=False, terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs), metadata = ['/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames] ) file_name = "demo_sklearn_sfs_beta1.html" open(file_name, 'wb').write(html.encode('utf-8')) print("open " + file_name) newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes')) X_test = vectorizer.transform(newsgroups_test.data) pred = clf.predict(X_test) f1 = f1_score(pred, newsgroups_test.target, average='micro') print("Microaveraged F1 score", f1)