def DetectNonOutliersByThreshold(dic_tupple_class, avgItemsInCluster_in_a_batch):
    non_outlier_pred_true_txts_in_all_clusters = []
    outlier_pred_true_txts_in_all_clusters = []
    for label, pred_true_txts in dic_tupple_class.items():
        itemsInCluster = len(pred_true_txts)
        if itemsInCluster > avgItemsInCluster_in_a_batch:
            #print("cluster label="+str(label)+", "+str(itemsInCluster))
            #only clusters larger than the batch average are screened for outliers
            textsArr = [pred_true_txt[2] for pred_true_txt in pred_true_txts]
            vectorizer = TfidfVectorizer(max_df=1.0,
                                         min_df=1,
                                         stop_words='english',
                                         use_idf=True,
                                         smooth_idf=True,
                                         norm='l2')
            x_train = vectorizer.fit_transform(textsArr)
            contratio = 0.3
            #the 'behaviour' argument was deprecated in scikit-learn 0.22 and
            #removed in 0.24, so it is omitted here
            isf = IsolationForest(n_estimators=100,
                                  max_samples='auto',
                                  contamination=contratio,
                                  max_features=1.0,
                                  bootstrap=True,
                                  verbose=0,
                                  random_state=0)
            outlierPreds = isf.fit(x_train).predict(x_train)
            #IsolationForest labels inliers +1 and outliers -1
            for i, outlierPred in enumerate(outlierPreds):
                if outlierPred != -1:
                    non_outlier_pred_true_txts_in_all_clusters.append(pred_true_txts[i])
                else:
                    outlier_pred_true_txts_in_all_clusters.append(pred_true_txts[i])
        else:
            #small clusters are kept intact, with no outlier screening
            non_outlier_pred_true_txts_in_all_clusters.extend(pred_true_txts)
    dic_tupple_class_filtered = groupTxtByClass(
        non_outlier_pred_true_txts_in_all_clusters, False)
    printClusterEvaluation_list(non_outlier_pred_true_txts_in_all_clusters)
    print("true clusters=" + str(
        len(groupTxtByClass(non_outlier_pred_true_txts_in_all_clusters, True))))
    #ComputePurity(dic_tupple_class_filtered)

    return [non_outlier_pred_true_txts_in_all_clusters,
            outlier_pred_true_txts_in_all_clusters]
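#A minimal, self-contained sketch (hypothetical texts) of the outlier screen
#used above: TF-IDF vectors fed to an IsolationForest, which marks outliers
#with -1. contamination=0.3 mirrors the contratio setting in
#DetectNonOutliersByThreshold.
def _demo_isolation_forest_outlier_screen():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.ensemble import IsolationForest
    texts = ["lynch leads northern illinois to a win",
             "lynch breaks ncaa rushing record in the win",
             "lynch rolls past western michigan",
             "gerry adams questioned over ira case"]
    x_train = TfidfVectorizer(stop_words='english').fit_transform(texts)
    isf = IsolationForest(n_estimators=100, contamination=0.3, random_state=0)
    preds = isf.fit(x_train).predict(x_train)  #+1 = inlier, -1 = outlier
    print(list(zip(preds, texts)))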
def DetectNonOutliers(listtuple_pred_true_text):
    printClusterEvaluation_list(listtuple_pred_true_text)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    print("true clusters=" +
          str(len(groupTxtByClass(listtuple_pred_true_text, True))))
    #ComputePurity(dic_tupple_class)

    totalItems = 0
    itemsInClusterList = []
    for pred_true_txts in dic_tupple_class.values():
        itemsInCluster = len(pred_true_txts)
        #print("itemsInCluster="+str(itemsInCluster))
        totalItems += itemsInCluster
        itemsInClusterList.append(itemsInCluster)

    totalClusters = len(dic_tupple_class)
    avgItemsInCluster_in_a_batch = float(totalItems) / totalClusters
    std = np.std(itemsInClusterList)
    print("totalItems=" + str(totalItems) + ",avgItemsInCluster_in_a_batch=" +
          str(avgItemsInCluster_in_a_batch) + ",std=" + str(std))
    non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters = DetectNonOutliersByThreshold(
        dic_tupple_class, avgItemsInCluster_in_a_batch)
    print("total #outliers=" +
          str(len(outlier_pred_true_txts_in_all_clusters)))
    #print("#non_outlier_pred_true_txts_in_all_clusters#")
    #print(non_outlier_pred_true_txts_in_all_clusters)
    #print("#outlier_pred_true_txts_in_all_clusters#")
    #print(outlier_pred_true_txts_in_all_clusters)
    #print("--Batch End--")
    return [
        non_outlier_pred_true_txts_in_all_clusters,
        outlier_pred_true_txts_in_all_clusters, avgItemsInCluster_in_a_batch
    ]
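#Hypothetical smoke test for the mean-size threshold above: with this batch,
#avgItemsInCluster_in_a_batch = 4 items / 2 clusters = 2.0, so predicted
#cluster '0' (3 items) is screened by IsolationForest while cluster '1'
#(1 item) passes through untouched. Relies on this module's own helpers
#(groupTxtByClass, printClusterEvaluation_list).
def _demo_detect_non_outliers():
    batch = [['0', '5', 'late goal seals the win tonight'],
             ['0', '5', 'coach praises defense after the win'],
             ['0', '5', 'fans celebrate the home win'],
             ['1', '7', 'stock market falls sharply']]
    non_outliers, outliers, avg_size = DetectNonOutliers(batch)
    print(len(non_outliers), len(outliers), avg_size)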
def clusterByWordEmbeddingIntelligent(list_pred_true_text_ind_prevind,
                                      wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text_ind_prevind)
    dic_itemGroups = groupItemsBySingleKeyIndex(
        list_pred_true_text_ind_prevind, 0)

    #the divisor (1.0 here) is a tunable knob; needs to be determined carefully
    pred_clusters = int(len(dic_itemGroups) / 1.0)

    dic_group_sizes = [
        len(group) for group in dic_itemGroups.values()
        if isinstance(group, list)
    ]
    print(dic_group_sizes)

    print("#clusters=" + str(pred_clusters))

    nparr = np.array(list_pred_true_text_ind_prevind)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    word_arr = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    X = generate_sent_vecs_toktextdata(word_arr, wordVectorsDic, 300)
    #X=generate_sent_vecs_toktextdata_autoencoder(word_arr, wordVectorsDic, 300, pred_clusters)

    svd = TruncatedSVD(50)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X=X.toarray()
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(
        ward.labels_, trues, word_arr)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, word_arr)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)
    #assumption: rebuild the (pred, true, text) triples from the parsed columns
    #as the input for the lexical outlier-removal step
    listtuple_pred_true_text = combine_pred_true_txt_from_list(preds, trues, word_arr)
    outlier_pred_true_texts, non_outlier_pred_true_txts, avgItemsInCluster = removeOutlierConnectedComponentLexical(
        listtuple_pred_true_text)

    #change pred labels
    newOutlier_pred_true_txts = change_pred_label(outlier_pred_true_texts,
                                                  1000)
    #end change pred labels
    non_outlier_pred_true_txts.extend(newOutlier_pred_true_txts)

    #print("print_by_group(outlier_pred_true_texts)")
    #print_by_group(outlier_pred_true_texts)
    print("print_by_group(non_outlier_pred_true_txts)")
    print_by_group(non_outlier_pred_true_txts)

    print("listtuple_pred_true_text")
    printClusterEvaluation_list(listtuple_pred_true_text)
    #print("outlier_pred_true_texts")
    #printClusterEvaluation_list(outlier_pred_true_texts)
    print("non_outlier_pred_true_txts")
    printClusterEvaluation_list(non_outlier_pred_true_txts)

    #non_outlier_pred_true_txts, outlier_pred_true_txts,avgItemsInCluster = DetectNonOutliersLexical(listtuple_pred_true_text)

    #Print_list_pred_true_text(listtuple_pred_true_text)
    #print("#real Batch end#")
    #non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters,avgItemsInCluster_in_a_batch =DetectNonOutliers(listtuple_pred_true_text)

    #clusterByTfIdfFeature(listtuple_pred_true_text)
    #clusterByWordEmbeddingFeature(listtuple_pred_true_text, wordVectorsDic)

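#A small, self-contained sketch of the LSA step reused throughout this
#module: TruncatedSVD followed by L2 normalization via make_pipeline. The
#random matrix stands in for the 300-dim sentence vectors.
def _demo_lsa_pipeline():
    import numpy as np
    from sklearn.decomposition import TruncatedSVD
    from sklearn.preprocessing import Normalizer
    from sklearn.pipeline import make_pipeline
    X = np.random.RandomState(0).rand(100, 300)
    lsa = make_pipeline(TruncatedSVD(50), Normalizer(copy=False))
    X_lsa = lsa.fit_transform(X)
    print(X_lsa.shape)  #(100, 50); each row is unit-normalized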
def clusterByTfIdfFeature(list_pred_true_text):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text)
    dic_tupple_class = groupTxtByClass(list_pred_true_text, False)
    pred_clusters = len(dic_tupple_class)
    print("pred_clusters for k-means=" + str(pred_clusters))

    preds, trues, texts = split_pred_true_txt_from_list(list_pred_true_text)
    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)
    vectorizer = TfidfVectorizer(tokenizer=stem_text, max_df=0.5, min_df=2)
    #vectorizer = TfidfVectorizer(max_df=0.5,min_df=2, stop_words='english')
    X = vectorizer.fit_transform(texts)

    svd = TruncatedSVD(100)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X=X.toarray()
    X = lsa.fit_transform(X)

    km = KMeans(n_clusters=pred_clusters,
                init='k-means++',
                max_iter=100,
                random_state=0)
    km.fit(X)
    list_km_pred_true_text = combine_pred_true_txt_from_list(
        km.labels_, trues, texts)
    print("k-means")
    printClusterEvaluation_list(list_km_pred_true_text)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(
        ward.labels_, trues, texts)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, texts)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)

    brc = Birch(branching_factor=50,
                n_clusters=pred_clusters,
                threshold=0.5,
                compute_labels=True)
    brc.fit_predict(X)
    list_brc_pred_true_text = combine_pred_true_txt_from_list(
        brc.labels_, trues, texts)
    print("brc")
    printClusterEvaluation_list(list_brc_pred_true_text)

    gmm = GaussianMixture(n_components=pred_clusters, covariance_type='full')
    gmm_labels = gmm.fit_predict(X)
    list_gmm_pred_true_text = combine_pred_true_txt_from_list(
        gmm_labels, trues, texts)
    print("gmm")
    printClusterEvaluation_list(list_gmm_pred_true_text)
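#A toy run of the k-means branch above (hypothetical texts, two topics):
#TF-IDF features clustered with the same KMeans settings as
#clusterByTfIdfFeature, minus the stemming and LSA steps for brevity.
def _demo_kmeans_on_tfidf():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans
    texts = ["lynch leads northern illinois to a win",
             "lynch rolls past western michigan",
             "stock market falls sharply",
             "markets slide as stocks fall"]
    X = TfidfVectorizer(stop_words='english').fit_transform(texts)
    km = KMeans(n_clusters=2, init='k-means++', max_iter=100, random_state=0)
    km.fit(X)
    print(km.labels_)
#A preview of the density-based path in ClusterByHDbScan below: HDBSCAN on
#two well-separated Gaussian blobs; noise points, if any, get label -1.
def _demo_hdbscan_blobs():
    import numpy as np
    import hdbscan
    rng = np.random.RandomState(0)
    X = np.vstack([rng.normal(0.0, 0.1, (10, 2)),
                   rng.normal(3.0, 0.1, (10, 2))])
    labels = hdbscan.HDBSCAN(min_cluster_size=2).fit(X).labels_
    print(labels)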
def ClusterByHDbScan(listtuple_pred_true_text, avgItemsInCluster_in_a_batch):
    print("\nClusterByHDbScan")
    printClusterEvaluation_list(listtuple_pred_true_text)
    print(len(listtuple_pred_true_text), avgItemsInCluster_in_a_batch)

    dic_tupple_class_predicted = groupTxtByClass(listtuple_pred_true_text,
                                                 False)
    numberOfClusters_predicted = len(dic_tupple_class_predicted)

    dic_tupple_class_true = groupTxtByClass(listtuple_pred_true_text, True)
    numberOfClusters_true = len(dic_tupple_class_true)

    print("numberOfClusters_true=" + str(numberOfClusters_true) +
          ", numberOfClusters_predicted=" + str(numberOfClusters_predicted))

    train_data = []
    train_predlabels = []
    train_trueLabels = []

    for pred_true_text in listtuple_pred_true_text:
        train_predlabels.append(pred_true_text[0])
        train_trueLabels.append(pred_true_text[1])
        train_data.append(pred_true_text[2])

    vectorizer = TfidfVectorizer(max_df=1.0,
                                 min_df=1,
                                 stop_words='english',
                                 use_idf=True,
                                 smooth_idf=True,
                                 norm='l2')
    X = vectorizer.fit_transform(train_data)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_svd = lsa.fit_transform(X)

    min_cluster_size_in_a_batch = int(math.ceil(avgItemsInCluster_in_a_batch))
    #override the batch-average size with a fixed minimum of 2
    min_cluster_size_in_a_batch = 2

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size_in_a_batch)
    clusterer.fit(X)
    X_hdbscan_labels = clusterer.labels_

    print("X-total-clusters=" + str(X_hdbscan_labels.max()))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(train_trueLabels, X_hdbscan_labels))
    print("Completeness: %0.4f" %
          metrics.completeness_score(train_trueLabels, X_hdbscan_labels))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(train_trueLabels, X_hdbscan_labels))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(train_trueLabels, X_hdbscan_labels))
    print("nmi_score-whole-data:   %0.4f" %
          metrics.normalized_mutual_info_score(
              train_trueLabels, X_hdbscan_labels, average_method='arithmetic'))

    clusterer_svd = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size_in_a_batch)
    clusterer_svd.fit(X_svd)
    X_svd_hdbscan_labels = clusterer_svd.labels_

    db = DBSCAN().fit(X_svd)
    X_svd_dbscan_labels = db.labels_

    print("X-svd-total-clusters=" + str(X_svd_hdbscan_labels.max()))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(train_trueLabels, X_svd_hdbscan_labels))
    print("Completeness: %0.4f" %
          metrics.completeness_score(train_trueLabels, X_svd_hdbscan_labels))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(train_trueLabels, X_svd_hdbscan_labels))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(train_trueLabels, X_svd_hdbscan_labels))
    print("nmi_score-whole-data:   %0.4f" %
          metrics.normalized_mutual_info_score(train_trueLabels,
                                               X_svd_hdbscan_labels,
                                               average_method='arithmetic'))

    print("X-svd-dbscan-total-clusters=" + str(X_svd_dbscan_labels.max()))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(train_trueLabels, X_svd_dbscan_labels))
    print("Completeness: %0.4f" %
          metrics.completeness_score(train_trueLabels, X_svd_dbscan_labels))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(train_trueLabels, X_svd_dbscan_labels))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(train_trueLabels, X_svd_dbscan_labels))
    print("nmi_score-whole-data:   %0.4f" %
          metrics.normalized_mutual_info_score(train_trueLabels,
                                               X_svd_dbscan_labels,
                                               average_method='arithmetic'))
    #remove high entropy words (needs logic) from each text
    #find the embedding of each text
    #cluster the texts using hac + sd method
    #cluster text by tf-idf feature

    '''listtuple_pred_true_text=[
    ['10', '20', 'gerry adam ira irrelevant mcguinness'],
    ['10', '86', 'lynch highlight senior night win wmu'],
    ['10', '86', 'jordan lynch break ncaa record lead northern illinois western michigan'],
    ['10', '86', 'lynch roll northern illinois win western michigan'],
    ['10', '86', 'lynch husky bowl bronco']]'''

    _components, new_pred_true_texts = clusterByConnectedComponent(listtuple_pred_true_text)
    print_by_group(new_pred_true_texts)
    printClusterEvaluation_list(new_pred_true_texts)

    #non_outlier_pred_true_txts, outlier_pred_true_txts,avgItemsInCluster = DetectNonOutliersLexical(listtuple_pred_true_text)

    #Print_list_pred_true_text(listtuple_pred_true_text)
    #print("#real Batch end#")
    #non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters,avgItemsInCluster_in_a_batch =DetectNonOutliers(listtuple_pred_true_text)

    #clusterByTfIdfFeature(listtuple_pred_true_text)
    #clusterByWordEmbeddingFeature(listtuple_pred_true_text, wordVectorsDic)

    #change pred labels
    '''newPredSeed=1000
    new_outlier_pred_true_txts_in_all_clusters=[]
    for pred_true_txt in outlier_pred_true_txts_in_all_clusters:
       predLabel= int(pred_true_txt[0])+newPredSeed