def DetectNonOutliersByThreshold(dic_tupple_class, avgItemsInCluster_in_a_batch):
    # For every cluster larger than the batch-average size, run an IsolationForest
    # over the TF-IDF vectors of its texts and split the items into non-outliers
    # and outliers; smaller clusters are kept unchanged.
    non_outlier_pred_true_txts_in_all_clusters = []
    outlier_pred_true_txts_in_all_clusters = []

    for label, pred_true_txts in dic_tupple_class.items():
        itemsInCluster = len(pred_true_txts)
        if itemsInCluster > avgItemsInCluster_in_a_batch:
            # print("cluster label=" + str(label) + ", " + str(itemsInCluster))
            textsArr = [pred_true_txt[2] for pred_true_txt in pred_true_txts]

            vectorizer = TfidfVectorizer(max_df=1.0,
                                         min_df=1,
                                         stop_words='english',
                                         use_idf=True,
                                         smooth_idf=True,
                                         norm='l2')
            x_train = vectorizer.fit_transform(textsArr)

            contratio = 0.3
            # Note: the 'behaviour' argument was deprecated in scikit-learn 0.22
            # and removed in 0.24, so it is omitted here.
            isf = IsolationForest(n_estimators=100,
                                  max_samples='auto',
                                  contamination=contratio,
                                  max_features=1.0,
                                  bootstrap=True,
                                  verbose=0,
                                  random_state=0)
            outlierPreds = isf.fit(x_train).predict(x_train)

            non_outlier_pred_true_txts_in_a_cluster = []
            for i in range(len(outlierPreds)):
                outlierPred = outlierPreds[i]
                if outlierPred != -1:
                    non_outlier_pred_true_txts_in_a_cluster.append(pred_true_txts[i])
                    non_outlier_pred_true_txts_in_all_clusters.append(pred_true_txts[i])
                else:
                    outlier_pred_true_txts_in_all_clusters.append(pred_true_txts[i])
        else:
            non_outlier_pred_true_txts_in_all_clusters.extend(pred_true_txts)

    dic_tupple_class_filtered = groupTxtByClass(non_outlier_pred_true_txts_in_all_clusters, False)
    printClusterEvaluation_list(non_outlier_pred_true_txts_in_all_clusters)
    print("true clusters=" + str(len(groupTxtByClass(non_outlier_pred_true_txts_in_all_clusters, True))))
    # ComputePurity(dic_tupple_class_filtered)
    return [non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters]
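# Hedged usage sketch (not part of the original pipeline): a minimal, self-contained
# example of the filtering idea above — TF-IDF vectors fed to an IsolationForest,
# whose -1 predictions mark outliers. The sample texts and the _demo_* name are
# illustrative assumptions.
def _demo_isolation_forest_on_tfidf():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.ensemble import IsolationForest

    texts = [
        'lynch highlight senior night win wmu',
        'lynch roll northern illinois win western michigan',
        'lynch husky bowl bronco',
        'gerry adam ira irrelevant mcguinness',
    ]
    vectors = TfidfVectorizer(stop_words='english').fit_transform(texts)
    # fit_predict returns -1 for outliers and 1 for inliers.
    flags = IsolationForest(n_estimators=100, contamination=0.3,
                            random_state=0).fit_predict(vectors)
    return [t for t, f in zip(texts, flags) if f != -1]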
def DetectNonOutliers(listtuple_pred_true_text):
    # Group the (pred, true, text) tuples by predicted label, compute the average
    # cluster size in this batch, then delegate the per-cluster outlier detection
    # to DetectNonOutliersByThreshold.
    printClusterEvaluation_list(listtuple_pred_true_text)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    print("true clusters=" + str(len(groupTxtByClass(listtuple_pred_true_text, True))))
    # ComputePurity(dic_tupple_class)

    totalItems = 0
    itemsInClusterList = []
    for label, pred_true_txts in dic_tupple_class.items():
        itemsInCluster = len(pred_true_txts)
        # print("itemsInCluster=" + str(itemsInCluster))
        totalItems = totalItems + itemsInCluster
        itemsInClusterList.append(itemsInCluster)

    totalClusters = len(dic_tupple_class)
    avgItemsInCluster_in_a_batch = float(totalItems) / totalClusters
    std = np.std(itemsInClusterList)
    print("totalItems=" + str(totalItems) + ",avgItemsInCluster_in_a_batch=" +
          str(avgItemsInCluster_in_a_batch) + ",std=" + str(std))

    non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters = DetectNonOutliersByThreshold(
        dic_tupple_class, avgItemsInCluster_in_a_batch)
    print("total #outliers=" + str(len(outlier_pred_true_txts_in_all_clusters)))
    # print("#non_outlier_pred_true_txts_in_all_clusters#")
    # print(non_outlier_pred_true_txts_in_all_clusters)
    # print("#outlier_pred_true_txts_in_all_clusters#")
    # print(outlier_pred_true_txts_in_all_clusters)
    # print("--Batch End--")
    return [
        non_outlier_pred_true_txts_in_all_clusters,
        outlier_pred_true_txts_in_all_clusters,
        avgItemsInCluster_in_a_batch
    ]
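# Hedged usage sketch: how DetectNonOutliers might be called on one batch of
# [pred_label, true_label, text] tuples (sample rows taken from the commented-out
# example further down in this file). The _demo_* wrapper is an illustrative
# assumption, not part of the original pipeline.
def _demo_detect_non_outliers():
    batch = [
        ['10', '20', 'gerry adam ira irrelevant mcguinness'],
        ['10', '86', 'lynch highlight senior night win wmu'],
        ['10', '86', 'lynch roll northern illinois win western michigan'],
        ['10', '86', 'lynch husky bowl bronco'],
    ]
    non_outliers, outliers, avg_items = DetectNonOutliers(batch)
    print(len(non_outliers), len(outliers), avg_items)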
def clusterByWordEmbeddingIntelligent(list_pred_true_text_ind_prevind, wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text_ind_prevind)

    dic_itemGroups = groupItemsBySingleKeyIndex(list_pred_true_text_ind_prevind, 0)
    pred_clusters = int(len(dic_itemGroups) / 1.0)  # needs to be determined carefully
    dic_group_sizes = [
        len(dic_itemGroups[x]) for x in dic_itemGroups
        if isinstance(dic_itemGroups[x], list)
    ]
    print(dic_group_sizes)
    print("#clusters=" + str(pred_clusters))

    nparr = np.array(list_pred_true_text_ind_prevind)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    word_arr = list(nparr[:, 2])
    inds = list(nparr[:, 3])

    # Build 300-dimensional sentence vectors from the word embeddings, then reduce
    # to 50 dimensions with LSA (TruncatedSVD + L2 normalization) before clustering.
    X = generate_sent_vecs_toktextdata(word_arr, wordVectorsDic, 300)
    # X = generate_sent_vecs_toktextdata_autoencoder(word_arr, wordVectorsDic, 300, pred_clusters)

    svd = TruncatedSVD(50)
    # svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    # X = X.toarray()
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(ward.labels_, trues, word_arr)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(clustering.labels_, trues, word_arr)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)
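# Hedged sketch (assumption): one common way to turn tokenized texts into fixed-size
# sentence vectors is to average their word embeddings. generate_sent_vecs_toktextdata
# is defined elsewhere and may work differently; this standalone helper only
# illustrates the general idea behind the embedding step above.
def _demo_mean_sentence_vectors(tokenized_texts, wordVectorsDic, dim=300):
    import numpy as np
    sent_vecs = []
    for tokens in tokenized_texts:
        # Keep only tokens that have an embedding; fall back to a zero vector.
        vecs = [wordVectorsDic[t] for t in tokens if t in wordVectorsDic]
        sent_vecs.append(np.mean(vecs, axis=0) if vecs else np.zeros(dim))
    return np.array(sent_vecs)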
outlier_pred_true_texts, non_outlier_pred_true_txts, avgItemsInCluster = removeOutlierConnectedComponentLexical(
    listtuple_pred_true_text)

# change pred labels
newOutlier_pred_true_txts = change_pred_label(outlier_pred_true_texts, 1000)
# end change pred labels
non_outlier_pred_true_txts.extend(newOutlier_pred_true_txts)

# print("print_by_group(outlier_pred_true_texts)")
# print_by_group(outlier_pred_true_texts)
print("print_by_group(non_outlier_pred_true_txts)")
print_by_group(non_outlier_pred_true_txts)

print("listtuple_pred_true_text")
printClusterEvaluation_list(listtuple_pred_true_text)
# print("outlier_pred_true_texts")
# printClusterEvaluation_list(outlier_pred_true_texts)
print("non_outlier_pred_true_txts")
printClusterEvaluation_list(non_outlier_pred_true_txts)

# non_outlier_pred_true_txts, outlier_pred_true_txts, avgItemsInCluster = DetectNonOutliersLexical(listtuple_pred_true_text)
# Print_list_pred_true_text(listtuple_pred_true_text)
# print("#real Batch end#")
# non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters, avgItemsInCluster_in_a_batch = DetectNonOutliers(listtuple_pred_true_text)
# clusterByTfIdfFeature(listtuple_pred_true_text)
# clusterByWordEmbeddingFeature(listtuple_pred_true_text, wordVectorsDic)
# change pred labels
def clusterByTfIdfFeature(list_pred_true_text):
    # Vectorize the texts with TF-IDF, reduce with LSA, then compare several
    # clustering algorithms (k-means, Ward, spectral, BIRCH, GMM) against the
    # true labels.
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text)

    dic_tupple_class = groupTxtByClass(list_pred_true_text, False)
    pred_clusters = len(dic_tupple_class)
    print("pred_clusters for k-means=" + str(pred_clusters))

    preds, trues, texts = split_pred_true_txt_from_list(list_pred_true_text)
    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)

    vectorizer = TfidfVectorizer(tokenizer=stem_text, max_df=0.5, min_df=2)
    # vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(texts)

    svd = TruncatedSVD(100)
    # svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    # X = X.toarray()
    X = lsa.fit_transform(X)

    km = KMeans(n_clusters=pred_clusters, init='k-means++', max_iter=100, random_state=0)
    km.fit(X)
    list_km_pred_true_text = combine_pred_true_txt_from_list(km.labels_, trues, texts)
    print("k-means")
    printClusterEvaluation_list(list_km_pred_true_text)

    ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(ward.labels_, trues, texts)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(clustering.labels_, trues, texts)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)

    brc = Birch(branching_factor=50, n_clusters=pred_clusters, threshold=0.5, compute_labels=True)
    brc.fit_predict(X)
    list_brc_pred_true_text = combine_pred_true_txt_from_list(brc.labels_, trues, texts)
    print("brc")
    printClusterEvaluation_list(list_brc_pred_true_text)

    gmm = GaussianMixture(n_components=pred_clusters, covariance_type='full')
    gmm_labels = gmm.fit_predict(X)
    list_gmm_pred_true_text = combine_pred_true_txt_from_list(gmm_labels, trues, texts)
    print("gmm")
    printClusterEvaluation_list(list_gmm_pred_true_text)
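# Hedged sketch of the core TF-IDF -> LSA -> k-means pipeline used above, in a
# self-contained form. The helper name, the default n_clusters/n_components values,
# and the local imports are illustrative assumptions, not the original code path.
def _demo_tfidf_lsa_kmeans(texts, n_clusters=2, n_components=2):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.preprocessing import Normalizer
    from sklearn.pipeline import make_pipeline
    from sklearn.cluster import KMeans

    X = TfidfVectorizer(stop_words='english').fit_transform(texts)
    # LSA: dimensionality reduction followed by L2 normalization.
    lsa = make_pipeline(TruncatedSVD(n_components), Normalizer(copy=False))
    X_reduced = lsa.fit_transform(X)
    return KMeans(n_clusters=n_clusters, init='k-means++',
                  random_state=0).fit_predict(X_reduced)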
def ClusterByHDbScan(listtuple_pred_true_text, avgItemsInCluster_in_a_batch):
    print("\nClusterByHDbScan")
    printClusterEvaluation_list(listtuple_pred_true_text)
    print(len(listtuple_pred_true_text), avgItemsInCluster_in_a_batch)

    dic_tupple_class_predicted = groupTxtByClass(listtuple_pred_true_text, False)
    numberOfClusters_predicted = len(dic_tupple_class_predicted)
    dic_tupple_class_true = groupTxtByClass(listtuple_pred_true_text, True)
    numberOfClusters_true = len(dic_tupple_class_true)
    print("numberOfClusters_true=" + str(numberOfClusters_true) +
          ", numberOfClusters_predicted=" + str(numberOfClusters_predicted))

    train_data = []
    train_predlabels = []
    train_trueLabels = []
    for pred_true_text in listtuple_pred_true_text:
        train_predlabels.append(pred_true_text[0])
        train_trueLabels.append(pred_true_text[1])
        train_data.append(pred_true_text[2])

    vectorizer = TfidfVectorizer(max_df=1.0,
                                 min_df=1,
                                 stop_words='english',
                                 use_idf=True,
                                 smooth_idf=True,
                                 norm='l2')
    X = vectorizer.fit_transform(train_data)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_svd = lsa.fit_transform(X)

    min_cluster_size_in_a_batch = int(math.ceil(avgItemsInCluster_in_a_batch))
    min_cluster_size_in_a_batch = 2  # overrides the batch-average value above

    # HDBSCAN on the raw TF-IDF matrix.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size_in_a_batch)
    clusterer.fit(X)
    X_hdbscan_labels = clusterer.labels_

    print("X-total-clusters=" + str(X_hdbscan_labels.max()))
    print("Homogeneity: %0.4f" % metrics.homogeneity_score(train_trueLabels, X_hdbscan_labels))
    print("Completeness: %0.4f" % metrics.completeness_score(train_trueLabels, X_hdbscan_labels))
    print("V-measure: %0.4f" % metrics.v_measure_score(train_trueLabels, X_hdbscan_labels))
    print("Adjusted Rand-Index: %.4f" % metrics.adjusted_rand_score(train_trueLabels, X_hdbscan_labels))
    print("nmi_score-whole-data: %0.4f" % metrics.normalized_mutual_info_score(
        train_trueLabels, X_hdbscan_labels, average_method='arithmetic'))

    # HDBSCAN and DBSCAN on the 2-dimensional LSA projection.
    clusterer_svd = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size_in_a_batch)
    clusterer_svd.fit(X_svd)
    X_svd_hdbscan_labels = clusterer_svd.labels_

    db = DBSCAN().fit(X_svd)
    X_svd_dbscan_labels = db.labels_

    print("X-svd-total-clusters=" + str(X_svd_hdbscan_labels.max()))
    print("Homogeneity: %0.4f" % metrics.homogeneity_score(train_trueLabels, X_svd_hdbscan_labels))
    print("Completeness: %0.4f" % metrics.completeness_score(train_trueLabels, X_svd_hdbscan_labels))
    print("V-measure: %0.4f" % metrics.v_measure_score(train_trueLabels, X_svd_hdbscan_labels))
    print("Adjusted Rand-Index: %.4f" % metrics.adjusted_rand_score(train_trueLabels, X_svd_hdbscan_labels))
    print("nmi_score-whole-data: %0.4f" % metrics.normalized_mutual_info_score(
        train_trueLabels, X_svd_hdbscan_labels, average_method='arithmetic'))

    print("X-svd-dbscan-total-clusters=" + str(X_svd_dbscan_labels.max()))
    print("Homogeneity: %0.4f" % metrics.homogeneity_score(train_trueLabels, X_svd_dbscan_labels))
    print("Completeness: %0.4f" % metrics.completeness_score(train_trueLabels, X_svd_dbscan_labels))
    print("V-measure: %0.4f" % metrics.v_measure_score(train_trueLabels, X_svd_dbscan_labels))
    print("Adjusted Rand-Index: %.4f" % metrics.adjusted_rand_score(train_trueLabels, X_svd_dbscan_labels))
    print("nmi_score-whole-data: %0.4f" % metrics.normalized_mutual_info_score(
        train_trueLabels, X_svd_dbscan_labels, average_method='arithmetic'))
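# Hedged sketch: counting HDBSCAN/DBSCAN clusters explicitly. labels_.max() above
# reports the highest cluster label (clusters are numbered 0..k-1 and noise points
# get -1), so the actual cluster count is max()+1 when any cluster exists. This
# helper name is an illustrative assumption.
def _count_density_clusters(labels):
    # Exclude the noise label (-1) and count distinct cluster labels.
    return len(set(labels) - {-1})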
# remove high entropy words (needs logic) from each text
# find the embedding of each text
# cluster the texts using hac + sd method
# cluster text by tf-idf feature

'''listtuple_pred_true_text=[
    ['10', '20', 'gerry adam ira irrelevant mcguinness'],
    ['10', '86', 'lynch highlight senior night win wmu'],
    ['10', '86', 'jordan lynch break ncaa record lead northern illinois western michigan'],
    ['10', '86', 'lynch roll northern illinois win western michigan'],
    ['10', '86', 'lynch husky bowl bronco']]'''

_components, new_pred_true_texts = clusterByConnectedComponent(listtuple_pred_true_text)
print_by_group(new_pred_true_texts)
printClusterEvaluation_list(new_pred_true_texts)

# non_outlier_pred_true_txts, outlier_pred_true_txts, avgItemsInCluster = DetectNonOutliersLexical(listtuple_pred_true_text)
# Print_list_pred_true_text(listtuple_pred_true_text)
# print("#real Batch end#")
# non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters, avgItemsInCluster_in_a_batch = DetectNonOutliers(listtuple_pred_true_text)
# clusterByTfIdfFeature(listtuple_pred_true_text)
# clusterByWordEmbeddingFeature(listtuple_pred_true_text, wordVectorsDic)

# change pred labels
'''newPredSeed=1000
new_outlier_pred_true_txts_in_all_clusters=[]
for pred_true_txt in outlier_pred_true_txts_in_all_clusters:
    predLabel= int(pred_true_txt[0])+newPredSeed