Python WikiTfIdfVectorizer Examples

Programming Language: Python

Namespace/Package Name: WikiTfIdfVectorizer

Examples at hotexamples.com: 2

Python WikiTfIdfVectorizer - 2 examples found. These are the top-rated real-world Python examples of WikiTfIdfVectorizer.WikiTfIdfVectorizer, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

get_cluster_list(1)

get_label_vector(1)

get_vectorized_dataset(1)

get_vectorizer(1)

vectorize(1)

Example #1

Show file

File: doc_clustering.py Project: catherineverdiergo/MS_BIGDATA2016

from WikiSampleLoader import WikiSampleLoader
from WikiTfIdfVectorizer import WikiTfIdfVectorizer
from WikiKmeans import WikiKmeans
from sklearn import metrics
# Import the stop-word list from the public ``text`` module: the standalone
# ``sklearn.feature_extraction.stop_words`` module was deprecated and later
# removed from scikit-learn, while ``text`` exports the same constant.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import os

# Script purpose: vectorize a labelled wiki sample, cluster it with K-means
# and report how well the clusters agree with the reference labels.

# Load the dataset with the loader's default settings.
wsl = WikiSampleLoader()
# Alternative dataset kept by the original author (20 newsgroups archive):
#wsl = WikiSampleLoader(file_name=os.environ['HOME']+"/scikit_learn_data/20news-bydate.pkz")

# Stop words: the standard English list enriched with tokens the original
# author chose to ignore (URL fragments and markup-like terms).
stop_w = set(ENGLISH_STOP_WORDS).union([
    'url', 'http', 'www', 'ref', 'jpg', 'file', 'com',
    'web', 'category', 'reference', 'title', 'org', 'br',
])

# Alternative vectorizer configuration kept by the original author:
#w_tf_idf = WikiTfIdfVectorizer(use_hashing=True)
w_tf_idf = WikiTfIdfVectorizer(stop_words=stop_w)
w_tf_idf.vectorize(wsl)

# Read the results through the vectorizer's public accessors rather than its
# underscore-prefixed attributes.
X = w_tf_idf.get_vectorized_dataset()
# One K-means cluster per entry of the vectorizer's cluster list.
k = len(w_tf_idf.get_cluster_list())
labels = w_tf_idf.get_label_vector()

# Run K-means on the vectorized dataset.
wkm = WikiKmeans(k)
km = wkm.apply_K_means(X)

# Show the reference labels next to the cluster assignments, then the
# agreement scores between the two labelings.
print(labels)
print(km.labels_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))

Example #2

Show file

File: doc_clustering.py Project: catherineverdiergo/MS_BIGDATA2016

op.print_help()

opts, args = op.parse_args()

# Build the sample loader from the dataset file named in the parsed options
wsl = WikiSampleLoader(file_name=opts.dataset_file)

# Stop words = standard English list + frequent wiki technical tags, so that
# clustering does not latch onto terms that carry no topical meaning
wiki_technical_tags = {
    'url', 'http', 'www', 'ref', 'jpg', 'file', 'com',
    'web', 'category', 'reference', 'title', 'org', 'br',
}
stop_w = set(ENGLISH_STOP_WORDS) | wiki_technical_tags

# Vectorizer settings come straight from the parsed options
vectorizer_options = dict(
    stop_words=stop_w,
    use_idf=opts.use_idf,
    n_features=opts.n_features,
    use_hashing=opts.use_hashing,
)
w_tf_idf = WikiTfIdfVectorizer(**vectorizer_options)
w_tf_idf.vectorize(wsl)

# Vectorized dataset
X = w_tf_idf.get_vectorized_dataset()
# K-means setup: one cluster per entry of the vectorizer's cluster list
k = len(w_tf_idf.get_cluster_list())
labels = w_tf_idf.get_label_vector()
wkm = WikiKmeans(
    k,
    verbose=opts.verbose,
    mini_batch=opts.minibatch,
    init=opts.init,
)
# Run K-means
km = wkm.apply_K_means(X)

# Debug output, disabled by the original author:
#print(labels)
#print(km.labels_)
homogeneity = metrics.homogeneity_score(labels, km.labels_)
print("Homogeneity: %0.3f" % homogeneity)