stop_w = set(ENGLISH_STOP_WORDS)
# Enrich the stop-word set with frequent wiki markup tokens
stop_w = stop_w.union(['url', 'http', 'www', 'ref', 'jpg', 'file', 'com'])
stop_w = stop_w.union(['web', 'category', 'reference', 'title', 'org', 'br'])

w_tf_idf = WikiTfIdfVectorizer(stop_words=stop_w, use_idf=opts.use_idf,
                               n_features=opts.n_features,
                               use_hashing=opts.use_hashing)
w_tf_idf.vectorize(wsl)

# Get the vectorized dataset
X = w_tf_idf.get_vectorized_dataset()

# Initialize K-means with one cluster per ground-truth category
k = len(w_tf_idf.get_cluster_list())
labels = w_tf_idf.get_label_vector()
wkm = WikiKmeans(k, verbose=opts.verbose, mini_batch=opts.minibatch,
                 init=opts.init)

# Apply K-means
km = wkm.apply_K_means(X)

# Evaluate the clustering against the ground-truth labels
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %0.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

if not opts.use_hashing:
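# NOTE: WikiTfIdfVectorizer and WikiKmeans are project-specific wrappers; the
# sketch below is an assumption about what apply_K_means(X) roughly does,
# presuming it delegates to scikit-learn (it is not the wrapper's actual code):
#
#     from sklearn.cluster import KMeans, MiniBatchKMeans
#     cls = MiniBatchKMeans if opts.minibatch else KMeans
#     km = cls(n_clusters=k, init=opts.init, verbose=opts.verbose)
#     km.fit(X)
#
# Under that assumption, km.labels_ holds one cluster id per document, which
# the metric calls above compare against the ground-truth label vector.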