# Cluster the sample provided by WikiSampleLoader with K-means and report how
# well the resulting clusters agree with the reference labels (homogeneity,
# completeness and V-measure).
import os

from sklearn import metrics
# ENGLISH_STOP_WORDS is taken from the public ``text`` module: the former
# ``sklearn.feature_extraction.stop_words`` module was deprecated in
# scikit-learn 0.22 and removed in 0.24, while ``text`` exports the same
# frozenset on both old and new releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from WikiKmeans import WikiKmeans
from WikiSampleLoader import WikiSampleLoader
from WikiTfIdfVectorizer import WikiTfIdfVectorizer

# Load the dataset with the loader's default settings. A specific pkz file
# can be selected instead, e.g.:
# wsl = WikiSampleLoader(
#     file_name=os.environ['HOME'] + "/scikit_learn_data/20news-bydate.pkz")
wsl = WikiSampleLoader()

# Extend the stock English stop words with tokens that look like link/markup
# boilerplate ('url', 'http', 'ref', ...) so they are ignored by the
# vectorizer.
stop_w = set(ENGLISH_STOP_WORDS)
stop_w = stop_w.union(['url', 'http', 'www', 'ref', 'jpg', 'file', 'com'])
stop_w = stop_w.union(['web', 'category', 'reference', 'title', 'org', 'br'])

# Vectorize the dataset. A hashing-based variant was previously tried:
# w_tf_idf = WikiTfIdfVectorizer(use_hashing=True)
w_tf_idf = WikiTfIdfVectorizer(stop_words=stop_w)
w_tf_idf.vectorize(wsl)

# NOTE(review): the three reads below reach into non-public attributes of
# WikiTfIdfVectorizer; switch to public accessors if the class offers them.
X = w_tf_idf._X  # vectorized dataset

# K-means is asked for as many clusters as the vectorizer's cluster list has
# entries.
k = len(w_tf_idf._cluster_list)
labels = w_tf_idf._labels  # reference labels used for scoring
wkm = WikiKmeans(k)

# Run K-means; the returned object exposes the predicted ``labels_``.
km = wkm.apply_K_means(X)

# Dump both label vectors, then score the clustering against the reference.
print(labels)
print(km.labels_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# Print the usage text of ``op`` (presumably an option parser created earlier
# in the file -- confirm), then parse the arguments that were supplied.
op.print_help()
opts, args = op.parse_args()

# Build the sample loader from the file given by the ``dataset_file`` option.
wsl = WikiSampleLoader(file_name=opts.dataset_file)

# Stop words: the stock English list enriched with technical tags that occur
# frequently in wiki text, to avoid overfitting on irrelevant terms.
stop_w = set(ENGLISH_STOP_WORDS).union(
    ['url', 'http', 'www', 'ref', 'jpg', 'file', 'com'],
    ['web', 'category', 'reference', 'title', 'org', 'br'],
)

# Configure the vectorizer from the parsed options and run it on the sample.
w_tf_idf = WikiTfIdfVectorizer(
    stop_words=stop_w,
    use_idf=opts.use_idf,
    n_features=opts.n_features,
    use_hashing=opts.use_hashing,
)
w_tf_idf.vectorize(wsl)

# Vectorized dataset that will be clustered.
X = w_tf_idf.get_vectorized_dataset()

# One K-means cluster is requested per entry of the vectorizer's cluster list;
# ``labels`` is the reference labelling used for scoring.
k = len(w_tf_idf.get_cluster_list())
labels = w_tf_idf.get_label_vector()
wkm = WikiKmeans(
    k,
    verbose=opts.verbose,
    mini_batch=opts.minibatch,
    init=opts.init,
)

# Run K-means; the returned object exposes the predicted ``labels_``.
km = wkm.apply_K_means(X)

# Debug aid -- uncomment to inspect the raw label vectors:
# print(labels)
# print(km.labels_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))