Example #1
from sklearn import metrics, preprocessing
from lightning.classification import CDClassifier  # scikit-learn-contrib "lightning" package

# XtrainPos/XtestPos and YtrainPos/YtestPos are the train/test features and labels prepared earlier (not shown).
# Fit the scaler on the training split only, then apply it to both splits;
# refitting on the test set (fit_transform) would leak test-set statistics.
scale = preprocessing.StandardScaler().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)

#scale = preprocessing.MinMaxScaler()
#XtrainPos = scale.fit_transform(XtrainPos)
#XtestPos = scale.fit_transform(XtestPos)
#
# Normalizer is stateless (fit is a no-op); it rescales each sample to unit norm.
scale = preprocessing.Normalizer().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)
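
# Alternative (a sketch, not in the original example): chain both transforms in
# a scikit-learn Pipeline so the fit-on-train / transform-on-test discipline is
# handled by a single object.
#from sklearn.pipeline import Pipeline
#scale = Pipeline([("standardize", preprocessing.StandardScaler()),
#                  ("normalize", preprocessing.Normalizer())])
#XtrainPos = scale.fit_transform(XtrainPos)
#XtestPos = scale.transform(XtestPos)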

# Classification: coordinate-descent classifier from "lightning" with a
# group-sparse l1/l2 penalty and squared hinge loss.
clf = CDClassifier(penalty="l1/l2", loss="squared_hinge", multiclass=True,
                   max_iter=20, C=1, alpha=1e-4, tol=1e-3)

#clf = LinearSVC(penalty="l2")
clf = clf.fit(XtrainPos, YtrainPos)
print(metrics.classification_report(YtestPos, clf.predict(XtestPos)))
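
# Sketch (not in the original example): the l1/l2 penalty is group-sparse, so
# entire feature columns should be zeroed out; CDClassifier follows the
# scikit-learn convention of exposing the learned weights as clf.coef_
# (shape: n_classes x n_features).
import numpy as np
n_active = np.sum(np.any(clf.coef_ != 0, axis=0))
print("non-zero feature columns: %d / %d" % (n_active, clf.coef_.shape[1]))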

## Cross-validation: 5 folds with different splits
#scores = cross_validation.cross_val_score(clf_svm, posfeat, label, cv=5, scoring='f1')
#print("F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Visualization
#plt.hist(XtrainPos[:,0])
#plt.show()



Example #2
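# The original snippet starts in the middle of a produce_frequency_explorer call
# and relies on names defined earlier (newsgroups_train, vectorizer, clf, corpus,
# sfs). The setup below is a plausible sketch using scattertext's CorpusFromScikit;
# the TfidfVectorizer/LinearSVC choice and the default-beta scaled F-scores are
# assumptions, not the original code.
import scattertext as st
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC

newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
clf = LinearSVC().fit(X_train, newsgroups_train.target)

corpus = st.CorpusFromScikit(
    X=X_train,
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data
).build()
sfs = (corpus.get_scaled_f_scores('alt.atheism') - 0.5) * 2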
# The call opening below is reconstructed (the source snippet begins mid-call);
# it mirrors the beta=1 explorer further down.
html = st.produce_frequency_explorer(
    corpus,
    'alt.atheism',
    scores=sfs,
    terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs),
    metadata=[
        '/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames
    ])

file_name = "demo_sklearn_sfs.html"
with open(file_name, 'wb') as f:
    f.write(html.encode('utf-8'))
print("open " + file_name)

# beta=1 variant of the scaled F-score, rescaled from [0, 1] to [-1, 1]
sfs = (corpus.get_scaled_f_scores('alt.atheism', beta=1) - 0.5) * 2
html = st.produce_frequency_explorer(
    corpus,
    'alt.atheism',
    scores=sfs,
    use_term_significance=False,
    terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs),
    metadata=[
        '/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames
    ])

file_name = "demo_sklearn_sfs_beta1.html"
with open(file_name, 'wb') as f:
    f.write(html.encode('utf-8'))
print("open " + file_name)

newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'))
X_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(X_test)
f1 = f1_score(newsgroups_test.target, pred, average='micro')  # y_true first, then predictions
print("Microaveraged F1 score:", f1)