def DetectNonOutliers(listtuple_pred_true_text):
    printClusterEvaluation_list(listtuple_pred_true_text)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    print("true clusters=" +
          str(len(groupTxtByClass(listtuple_pred_true_text, True))))
    #ComputePurity(dic_tupple_class)

    totalItems = 0
    itemsInClusterList = []
    for label, pred_true_txts in dic_tupple_class.items():
        itemsInCluster = len(pred_true_txts)
        #print("itemsInCluster="+str(itemsInCluster))
        totalItems = totalItems + itemsInCluster
        itemsInClusterList.append(itemsInCluster)

    totalClusters = len(dic_tupple_class)
    avgItemsInCluster_in_a_batch = float(totalItems) / totalClusters
    std = np.std(itemsInClusterList)
    print("totalItems=" + str(totalItems) + ",avgItemsInCluster_in_a_batch=" +
          str(avgItemsInCluster_in_a_batch) + ",std=" + str(std))
    non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters = DetectNonOutliersByThreshold(
        dic_tupple_class, avgItemsInCluster_in_a_batch)
    print("total #outliers=" +
          str(len(outlier_pred_true_txts_in_all_clusters)))
    #print("#non_outlier_pred_true_txts_in_all_clusters#")
    #print(non_outlier_pred_true_txts_in_all_clusters)
    #print("#outlier_pred_true_txts_in_all_clusters#")
    #print(outlier_pred_true_txts_in_all_clusters)
    #print("--Batch End--")
    return [
        non_outlier_pred_true_txts_in_all_clusters,
        outlier_pred_true_txts_in_all_clusters, avgItemsInCluster_in_a_batch
    ]
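# groupTxtByClass is called throughout this listing but never defined in it.
# A minimal sketch consistent with the call sites (assumptions: each tuple is
# [pred_label, true_label, text, ...] and the boolean flag selects grouping by
# the true label instead of the predicted one):
def groupTxtByClass_sketch(listtuple_pred_true_text, isByTrueLabel):
    dic_tupple_class = {}
    index = 1 if isByTrueLabel else 0
    for pred_true_text in listtuple_pred_true_text:
        label = str(pred_true_text[index])
        dic_tupple_class.setdefault(label, []).append(pred_true_text)
    return dic_tupple_class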
def DetectNonOutliersByThreshold(dic_tupple_class, avgItemsInCluster_in_a_batch):
    non_outlier_pred_true_txts_in_all_clusters = []
    outlier_pred_true_txts_in_all_clusters = []
    for label, pred_true_txts in dic_tupple_class.items():
        itemsInCluster = len(pred_true_txts)
        if itemsInCluster > avgItemsInCluster_in_a_batch:
            #print("cluster label="+str(label)+", "+str(itemsInCluster))
            textsArr = []
            for pred_true_txt in pred_true_txts:
                textsArr.append(pred_true_txt[2])
            vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', use_idf=True, smooth_idf=True, norm='l2')
            x_train = vectorizer.fit_transform(textsArr)
            contratio = 0.3
            # note: older scikit-learn accepted behaviour='new' here; that
            # parameter was deprecated in 0.22 and removed in 0.24
            isf = IsolationForest(n_estimators=100, max_samples='auto', contamination=contratio, max_features=1.0, bootstrap=True, verbose=0, random_state=0)
            outlierPreds = isf.fit(x_train).predict(x_train)
            non_outlier_pred_true_txts_in_a_cluster = []
            for i in range(len(outlierPreds)):
                outlierPred = outlierPreds[i]
                if outlierPred != -1:
                    non_outlier_pred_true_txts_in_a_cluster.append(pred_true_txts[i])
                    non_outlier_pred_true_txts_in_all_clusters.append(pred_true_txts[i])
                else:
                    outlier_pred_true_txts_in_all_clusters.append(pred_true_txts[i])
        else:
            non_outlier_pred_true_txts_in_all_clusters.extend(pred_true_txts)
    dic_tupple_class_filteres = groupTxtByClass(non_outlier_pred_true_txts_in_all_clusters, False)
    printClusterEvaluation_list(non_outlier_pred_true_txts_in_all_clusters)
    print("true clusters=" + str(len(groupTxtByClass(non_outlier_pred_true_txts_in_all_clusters, True))))
    #ComputePurity(dic_tupple_class_filteres)

    return [non_outlier_pred_true_txts_in_all_clusters, outlier_pred_true_txts_in_all_clusters]
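# Standalone illustration of the per-cluster outlier step above: TF-IDF
# vectors fed to an IsolationForest, whose predict() returns -1 for outliers
# and 1 for inliers (the toy texts below are invented for the example).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest

_toy_texts = ["stock market falls", "stock prices rise today",
              "market rally continues", "giraffes love acacia leaves"]
_toy_X = TfidfVectorizer(stop_words='english').fit_transform(_toy_texts)
_toy_isf = IsolationForest(n_estimators=100, contamination=0.25, random_state=0)
_toy_flags = _toy_isf.fit(_toy_X).predict(_toy_X)  # one of the four is flagged -1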
def GenerateTrainTest2_Percentage(percentTrainData):
    trainDataRatio = 1.0

    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    perct_tdata = percentTrainData / 100
    goodAmount_txts = int(perct_tdata *
                          (len(listtuple_pred_true_text) / numberOfClusters))
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    # write the texts of each group to files under textsperlabelDir
    WriteTextsOfEachGroup(textsperlabelDir, dic_tupple_class)
    dic_label_outliers = Gen_WriteOutliersEachGroup(textsperlabelDir,
                                                    numberOfClusters)

    train_pred_true_txts = []
    test_pred_true_txts = []

    for label, pred_true_txt in dic_tupple_class.items():
        outlierpreds = dic_label_outliers[str(label)]
        pred_true_txts = dic_tupple_class[str(label)]

        if len(outlierpreds) != len(pred_true_txts):
            print("Size not match for=" + str(label))

        outLiers_pred_true_txt = []
        count = -1
        for outPred in outlierpreds:
            outPred = str(outPred)
            count = count + 1
            if outPred == "-1":
                outLiers_pred_true_txt.append(pred_true_txts[count])

        test_pred_true_txts.extend(outLiers_pred_true_txt)
        # remove outlier instances from pred_true_txts
        pred_true_txts_good = [
            e for e in pred_true_txts if e not in outLiers_pred_true_txt
        ]
        dic_tupple_class[str(label)] = pred_true_txts_good

    for label, pred_true_txt in dic_tupple_class.items():
        pred_true_txts = dic_tupple_class[str(label)]
        pred_true_txt_subs = []
        numTrainGoodTexts = int(perct_tdata * len(pred_true_txts))  # computed but unused below
        if len(pred_true_txts) > goodAmount_txts:
            pred_true_txt_subs.extend(pred_true_txts[0:goodAmount_txts])
            test_pred_true_txts.extend(pred_true_txts[goodAmount_txts:])
        else:
            pred_true_txt_subs.extend(pred_true_txts)
        train_pred_true_txts.extend(pred_true_txt_subs)

    trainDataRatio = len(train_pred_true_txts) / len(train_pred_true_txts +
                                                     test_pred_true_txts)
    #print("trainDataRatio="+str(trainDataRatio))
    if trainDataRatio <= maxTrainRatio:
        WriteTrainTestInstances(trainFile, train_pred_true_txts)
        WriteTrainTestInstances(testFile, test_pred_true_txts)

    return trainDataRatio
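# ReadPredTrueText is not included in this listing; a minimal sketch, assuming
# the train/test file holds one instance per line in the form
# "predLabel<TAB>trueLabel<TAB>text" (the exact on-disk format is a guess):
def ReadPredTrueText_sketch(fileName):
    listtuple_pred_true_text = []
    with open(fileName, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) >= 3:
                listtuple_pred_true_text.append([parts[0], parts[1], parts[2]])
    return listtuple_pred_true_text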
def generateTrainTestTxtsByOutliers(dic_tuple_class, dic_list_outliers_class1,
                                    maxItemsInEachClass):
    trainTup_pred_true_txt = []
    testTup_pred_true_txt = []

    for key, value in dic_tuple_class.items():
        if key not in dic_list_outliers_class1 or len(value) != len(
                dic_list_outliers_class1[key]):
            print("miss match=" + key)
            continue

        rest_traintupTPTxt = []
        outliers = dic_list_outliers_class1[key]
        print("collections.Counter=" + str(collections.Counter(outliers)) +
              ", key=" + key + ", len(outliers)=" + str(len(outliers)) +
              ", len(value)=" + str(len(value)))
        count = -1
        for tup_pred_true_text in value:
            count = count + 1
            #print("outliers[count]="+str(outliers[count]))
            if outliers[count] == -1:
                testTup_pred_true_txt.append(tup_pred_true_text)
            else:
                rest_traintupTPTxt.append(tup_pred_true_text)

        print("len(rest_traintupTPTxt)=" + str(len(rest_traintupTPTxt)) +
              ",maxItemsInEachClass=" + str(maxItemsInEachClass))
        if len(rest_traintupTPTxt) > maxItemsInEachClass:
            trainTup_pred_true_txt.extend(
                rest_traintupTPTxt[0:maxItemsInEachClass])
            testTup_pred_true_txt.extend(
                rest_traintupTPTxt[maxItemsInEachClass:])
        else:
            trainTup_pred_true_txt.extend(rest_traintupTPTxt)

    print("after remove outlier, max items=" + str(maxItemsInEachClass) +
          ", total=" +
          str(len(trainTup_pred_true_txt + testTup_pred_true_txt)))
    groupTxtByClass(trainTup_pred_true_txt + testTup_pred_true_txt, False)
    return [trainTup_pred_true_txt, testTup_pred_true_txt]
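# Toy illustration of the contract above (all labels and texts invented;
# assumes the helpers and imports used by the function are available):
# dic_tuple_class maps a label to its instances, dic_list_outliers_class1 maps
# the same label to a parallel list of -1 (outlier) / 1 (inlier) flags.
_toy_classes = {"0": [["0", "a", "t1"], ["0", "a", "t2"], ["0", "b", "t3"]]}
_toy_outlier_flags = {"0": [1, -1, 1]}
_train, _test = generateTrainTestTxtsByOutliers(_toy_classes, _toy_outlier_flags, 2)
# -> _train gets the two inliers t1 and t3; _test gets the outlier t2.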
Example #5
def Evaluate(listtuple_pred_true_text):
    print("evaluate total texts=" + str(len(listtuple_pred_true_text)))
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data:   %0.8f" % score)

    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data:   %0.8f" % score)

    score = metrics.v_measure_score(trues, preds)
    print("v_measure_score-whole-data:   %0.4f" % score)

    nmi_score = metrics.normalized_mutual_info_score(
        trues, preds, average_method='arithmetic')
    print("nmi_score-whole-data:   %0.8f" % nmi_score)

    # score=metrics.adjusted_mutual_info_score(trues, preds)
    # print ("adjusted_mutual_info_score-whole-data:   %0.4f" % score)

    # score=metrics.adjusted_rand_score(trues, preds)
    # print ("adjusted_rand_score-whole-data:   %0.4f" % score)

    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    dic_tupple_class_true = groupTxtByClass(listtuple_pred_true_text, True)
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    purity = ComputePurity(dic_tupple_class)
    '''print("---Pred distribution")
 for key,value in dic_tupple_class.items():
   print(key, len(value))
 print("---True distribution")
 for key,value in dic_tupple_class_true.items():
   print(key, len(value))'''
    return [purity, nmi_score]
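# Quick sanity check for Evaluate with invented labels: predictions that match
# the true partition exactly should yield purity 1.0 and NMI 1.0 (assumes
# groupTxtByClass and ComputePurity from this listing are available).
_demo = [["0", "x", "doc1"], ["0", "x", "doc2"], ["1", "y", "doc3"]]
_demo_purity, _demo_nmi = Evaluate(_demo)  # -> 1.0, 1.0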
def EvaluateByPurity(traintestFile):
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    score = metrics.homogeneity_score(trues, preds)
    print("purity_score-whole-data:   %0.4f" % score)
    score = metrics.normalized_mutual_info_score(trues, preds)
    print("nmi_score-whole-data:   %0.4f" % score)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    ComputePurity(dic_tupple_class)
Example #7
def removeOutlierConnectedComponentLexicalByItem(listtuple, batchDocs,
                                                 maxPredLabel):

    outliers = []
    non_outliers = []
    avgItemsInCluster = 0

    dic_tupple_class = groupTxtByClass(listtuple, False)

    for label, items in dic_tupple_class.items():
        #_components,newPred_items=clusterByConnectedComponentByItem(items)
        print("to do")  # placeholder: per-cluster splitting not implemented yet

    return [outliers, non_outliers, avgItemsInCluster, maxPredLabel]
Example #8
def printClusterEvaluation_list(listtuple_pred_true_text):
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data:   %0.4f" % score)
    score = metrics.normalized_mutual_info_score(trues,
                                                 preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data:   %0.4f" % score)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    ComputePurity(dic_tupple_class)
def EvaluateByPurity(traintestFile):
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    #score = metrics.homogeneity_score(trues, preds)
    #print ("homogeneity_score-whole-data:   %0.4f" % score)
    score = metrics.normalized_mutual_info_score(trues,
                                                 preds,
                                                 average_method='arithmetic')
    #print ("nmi_score-whole-data:   %0.6f" % score)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    acc = ComputePurity(dic_tupple_class)
    print("acc", acc, "nmi", score)
def comPrehensive_GenerateTrainTestTxtsByOutliersTfIDf_varoutlier(
        listtuple_pred_true_text, maxItemsInEachClass, avgItemPercluster):
    trainTup_pred_true_txt = []
    testTup_pred_true_txt = []

    dic_tuple_class = groupTxtByClass(listtuple_pred_true_text, False)

    dic_list_outliers_class = {}

    for key, value in dic_tuple_class.items():
        txt_datas = []
        for tup_pred_true_text in value:
            txt_datas.append(tup_pred_true_text[2])

        outlierratio = len(value) / avgItemPercluster * 0.3
        print("outlierratio=" + str(outlierratio))
        if outlierratio > 0.4:
            outlierratio = 0.4

        contratio = outlierratio  #0.1
        print(len(txt_datas))
        vectorizer = TfidfVectorizer(max_df=1.0,
                                     min_df=1,
                                     stop_words='english',
                                     use_idf=True,
                                     smooth_idf=True,
                                     norm='l2')
        x_train = vectorizer.fit_transform(txt_datas)
        isf = IsolationForest(n_estimators=100,
                              max_samples='auto',
                              contamination=contratio,
                              max_features=1.0,
                              bootstrap=True,
                              verbose=0,
                              random_state=0)
        outlierPreds = isf.fit(x_train).predict(x_train)
        print(len(outlierPreds))
        dic_list_outliers_class[key] = outlierPreds

    trainTup_pred_true_txt, testTup_pred_true_txt = generateTrainTestTxtsByOutliers(
        dic_tuple_class, dic_list_outliers_class, maxItemsInEachClass)
    print("#trainTup_pred_true_txt=" + str(len(trainTup_pred_true_txt)) +
          ", #testTup_pred_true_txt=" + str(len(testTup_pred_true_txt)))
    return [trainTup_pred_true_txt, testTup_pred_true_txt]
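# Worked example of the contamination formula above: a cluster holding 20
# items when avgItemPercluster is 10 gets outlierratio = 20 / 10 * 0.3 = 0.6,
# which the cap reduces to 0.4; a cluster of 5 items gets 5 / 10 * 0.3 = 0.15
# and is left uncapped. The 0.4 cap also keeps the value inside
# IsolationForest's valid contamination range of (0, 0.5].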
def EvaluateByPurity(traintestFile):
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    score = metrics.homogeneity_score(trues, preds)
    last_homogeneity = score
    print("homogeneity_score-whole-data:   %0.4f" % score)
    #score = metrics.normalized_mutual_info_score(trues, preds, average_method='arithmetic')
    score = metrics.normalized_mutual_info_score(trues, preds)
    last_nmi = score
    print("nmi_score-whole-data:   %0.4f" % score)
    last_completeness = metrics.completeness_score(trues, preds)
    print("completeness=" + str(last_completeness))
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    return ComputePurity(dic_tupple_class)
Example #12
def outlierBySmallestGroups(newPred_OldPred_true_texts):
    outliersInCluster = []
    non_outliersInCluster = []
    rows = len(newPred_OldPred_true_texts)

    #np.concatenate((A[:,0].reshape(2,1), A[:,2:4]),axis=1)
    np_arr = np.array(newPred_OldPred_true_texts)
    newPred_true_texts = np.concatenate(
        (np_arr[:, 0].reshape(rows, 1), np_arr[:, 2:4]), axis=1).tolist()

    dic_tupple_class = groupTxtByClass(newPred_true_texts, False)

    maxGroupSize = -10
    newPredMaxLabel = ""
    totalGroups = len(dic_tupple_class)
    #print("totalGroups by connComp="+str(totalGroups))

    for label, pred_true_txts in dic_tupple_class.items():
        groupSize = len(pred_true_txts)
        if maxGroupSize < groupSize:
            newPredMaxLabel = label
            maxGroupSize = groupSize

    for newPred_OldPred_true_text in newPred_OldPred_true_texts:
        newPredLabel = newPred_OldPred_true_text[0]
        oldPredLabel = newPred_OldPred_true_text[1]
        trueLabel = newPred_OldPred_true_text[2]
        text = newPred_OldPred_true_text[3]
        OldPred_true_text = [oldPredLabel, trueLabel, text]
        if str(newPredLabel) == str(newPredMaxLabel):
            non_outliersInCluster.append(OldPred_true_text)
        else:
            outliersInCluster.append(OldPred_true_text)
    '''if totalGroups > 1:
        # split the group into outliers/non-outliers
        non_outliersInCluster = dic_tupple_class[newPredMaxLabel]
        for label, pred_true_txts in dic_tupple_class.items():
            if label != newPredMaxLabel:
                outliersInCluster.extend(pred_true_txts)
    else:
        non_outliersInCluster = newPred_OldPred_true_texts'''

    return [outliersInCluster, non_outliersInCluster]
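# Standalone illustration of the column trick above: dropping column 1 (the
# old predicted label) from [newPred, oldPred, true, text] rows (toy data).
import numpy as np

_toy_rows = np.array([["5", "2", "x", "t1"], ["5", "3", "y", "t2"]])
_toy_kept = np.concatenate(
    (_toy_rows[:, 0].reshape(2, 1), _toy_rows[:, 2:4]), axis=1).tolist()
# -> [['5', 'x', 't1'], ['5', 'y', 't2']]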
def ComputePurity(dic_tupple_class):
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems = totalItems + len(pred_true_txts)
        dic_tupple_class_originalLabel = groupTxtByClass(pred_true_txts, True)
        maxMemInGroupSize = minIntVal
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items(
        ):
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel

        maxGroupSizeSum = maxGroupSizeSum + maxMemInGroupSize

    acc = maxGroupSizeSum / totalItems
    #print("acc whole data="+str(acc))
    return acc
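# Worked example for ComputePurity: predicted cluster A = {x, x, y} and
# predicted cluster B = {y, y} have majority counts 2 and 2, so
# purity = (2 + 2) / 5 = 0.8.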
def ComputePurity(dic_tupple_class):
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems = totalItems + len(pred_true_txts)
        dic_tupple_class_originalLabel = groupTxtByClass(pred_true_txts, True)
        maxMemInGroupSize = minIntVal
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items():
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel

        maxGroupSizeSum = maxGroupSizeSum + maxMemInGroupSize

    purity = maxGroupSizeSum / totalItems
    print("purity majority whole data=" + str(purity))
    last_purity_custom = purity  # local; has no effect outside this function
    return purity
Example #15
def extrcatLargeClusterItems(listtuple):
    dic_tupple_class = groupTxtByClass(listtuple, False)
    itemCounts = []
    items_to_cluster = []
    items_to_not_cluster = []
    for label, tuples in dic_tupple_class.items():
        #if len(tuples)<3:
        #  continue
        itemCounts.append(len(tuples))
    std = statistics.stdev(itemCounts)
    mean = statistics.mean(itemCounts)

    for label, tuples in dic_tupple_class.items():
        no_items = len(tuples)
        if no_items >= mean + 1.2 * std:
            items_to_cluster.extend(tuples)
        else:
            items_to_not_cluster.extend(tuples)

    return [items_to_cluster, items_to_not_cluster]
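# Worked example for the threshold above: cluster sizes [2, 3, 20] give
# mean = 8.33 and sample stdev = 10.12, so only groups with at least
# 8.33 + 1.2 * 10.12 = 20.5 items would be re-clustered; here none qualify
# and everything lands in items_to_not_cluster.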
Example #16
def ComputePurity(dic_tupple_class):
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems = totalItems + len(pred_true_txts)
        # print("pred label="+label+", #texts="+str(len(pred_true_txts)))
        dic_tupple_class_originalLabel = groupTxtByClass(pred_true_txts, True)
        maxMemInGroupSize = -1000000
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items(
        ):
            # print("orgLabel label="+orgLabel+", #texts="+str(len(org_pred_true_txts)))
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel

        # print("\n")
        # print(str(label)+" purity="+str(maxMemInGroupSize/len(pred_true_txts))+", items="+str(len(pred_true_txts))+", max match#="+str(maxMemInGroupSize))
        # print_by_group(pred_true_txts)
        maxGroupSizeSum = maxGroupSizeSum + maxMemInGroupSize

    purity = maxGroupSizeSum / float(totalItems)
    print("purity majority whole data=" + str(purity))
    return purity
def EvaluateByPurity(traintestFile):
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    ComputePurity(dic_tupple_class)
def ClusterByHDbScan(listtuple_pred_true_text, avgItemsInCluster_in_a_batch):
    print("\nClusterByHDbScan")
    printClusterEvaluation_list(listtuple_pred_true_text)
    print(len(listtuple_pred_true_text), avgItemsInCluster_in_a_batch)

    dic_tupple_class_predicted = groupTxtByClass(listtuple_pred_true_text,
                                                 False)
    numberOfClusters_predicted = len(dic_tupple_class_predicted)

    dic_tupple_class_true = groupTxtByClass(listtuple_pred_true_text, True)
    numberOfClusters_true = len(dic_tupple_class_true)

    print("numberOfClusters_true=" + str(numberOfClusters_true) +
          ", numberOfClusters_predicted=" + str(numberOfClusters_predicted))

    train_data = []
    train_predlabels = []
    train_trueLabels = []

    for pred_true_text in listtuple_pred_true_text:
        train_predlabels.append(pred_true_text[0])
        train_trueLabels.append(pred_true_text[1])
        train_data.append(pred_true_text[2])

    vectorizer = TfidfVectorizer(max_df=1.0,
                                 min_df=1,
                                 stop_words='english',
                                 use_idf=True,
                                 smooth_idf=True,
                                 norm='l2')
    X = vectorizer.fit_transform(train_data)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_svd = lsa.fit_transform(X)

    min_cluster_size_in_a_batch = int(math.ceil(avgItemsInCluster_in_a_batch))
    # note: the computed batch-average value is immediately overridden here
    min_cluster_size_in_a_batch = 2

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size_in_a_batch)
    clusterer.fit(X)
    X_hdbscan_labels = clusterer.labels_

    print("X-total-clusters=" + str(X_hdbscan_labels.max()))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(train_trueLabels, X_hdbscan_labels))
    print("Completeness: %0.4f" %
          metrics.completeness_score(train_trueLabels, X_hdbscan_labels))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(train_trueLabels, X_hdbscan_labels))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(train_trueLabels, X_hdbscan_labels))
    print("nmi_score-whole-data:   %0.4f" %
          metrics.normalized_mutual_info_score(
              train_trueLabels, X_hdbscan_labels, average_method='arithmetic'))

    clusterer_svd = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size_in_a_batch)
    clusterer_svd.fit(X_svd)
    X_svd_hdbscan_labels = clusterer_svd.labels_

    db = DBSCAN().fit(X_svd)
    X_svd_dbscan_labels = db.labels_

    print("X-svd-total-clusters=" + str(X_svd_hdbscan_labels.max()))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(train_trueLabels, X_svd_hdbscan_labels))
    print("Completeness: %0.4f" %
          metrics.completeness_score(train_trueLabels, X_svd_hdbscan_labels))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(train_trueLabels, X_svd_hdbscan_labels))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(train_trueLabels, X_svd_hdbscan_labels))
    print("nmi_score-whole-data:   %0.4f" %
          metrics.normalized_mutual_info_score(train_trueLabels,
                                               X_svd_hdbscan_labels,
                                               average_method='arithmetic'))

    print("X-svd-dbscan-total-clusters=" + str(X_svd_dbscan_labels.max()))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(train_trueLabels, X_svd_dbscan_labels))
    print("Completeness: %0.4f" %
          metrics.completeness_score(train_trueLabels, X_svd_dbscan_labels))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(train_trueLabels, X_svd_dbscan_labels))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(train_trueLabels, X_svd_dbscan_labels))
    print("nmi_score-whole-data:   %0.4f" %
          metrics.normalized_mutual_info_score(train_trueLabels,
                                               X_svd_dbscan_labels,
                                               average_method='arithmetic'))
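# Minimal HDBSCAN sanity check on dense toy points. labels_ uses -1 for noise
# and 0-based cluster ids, so the cluster count is labels_.max() + 1, which is
# why the prints above add 1.
import numpy as np
import hdbscan

_toy_pts = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
                     [5.0, 5.0], [5.1, 5.0], [5.0, 5.1]])
_toy_labels = hdbscan.HDBSCAN(min_cluster_size=2).fit(_toy_pts).labels_
# -> two clusters, e.g. [0 0 0 1 1 1]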
def clusteringDCT(pred_true_txt_ind_prevPreds, wordVectorsDic, batchDocs,
                  maxPredLabel):
    print("#m-stream-cleaned")
    Evaluate(pred_true_txt_ind_prevPreds)

    pred_true_text_ind_prevPreds_to_cluster, pred_true_text_ind_prevPreds_to_not_cluster = extrcatLargeClusterItems(
        pred_true_txt_ind_prevPreds)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    '''minPredToC, maxPredToC, minTrueToC, maxTrueToC = findMinMaxLabel(pred_true_text_ind_prevPreds_to_cluster)
    print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_cluster)")
    print(minPredToC, maxPredToC, minTrueToC, maxTrueToC)

    minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC = findMinMaxLabel(pred_true_text_ind_prevPreds_to_not_cluster)
    print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_not_cluster)")
    print(minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC)'''

    all_pred_clusters = len(groupTxtByClass(pred_true_txt_ind_prevPreds,
                                            False))
    pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_cluster, False))
    non_pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_not_cluster, False))

    print("#clusters=" + str(pred_clusters))
    print("#not clusters=" + str(non_pred_clusters))
    print("this clustering with embedding DCT")
    pred_clusters = non_pred_clusters - pred_clusters
    print("#update clusters=" + str(pred_clusters))

    nparr = np.array(pred_true_text_ind_prevPreds_to_cluster)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    texts = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    prevPreds = list(nparr[:, 4])

    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)
    '''dicDocFreq = getDocFreq(texts)
    dctCoffs = 1
    X = generate_sent_vecs_toktextdata_DCT(texts, wordVectorsDic, 300, dctCoffs)
    #vectorizer = TfidfVectorizer(tokenizer=stem_text,max_df=0.5,min_df=1)
    #vectorizer = TfidfVectorizer(max_df=0.5,min_df=2, stop_words='english')
    #X = vectorizer.fit_transform(texts)'''
    '''svd = TruncatedSVD(50)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X = X.toarray()
    X = lsa.fit_transform(X)'''
    '''km = KMeans(n_clusters=pred_clusters, init='k-means++', max_iter=100, random_state=0)
    km.fit(X)
    list_km_pred_true_text = combine_pred_true_txt_from_list(km.labels_, trues, texts)
    print("#k-means")
    Evaluate(list_km_pred_true_text)'''
    '''ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(ward.labels_, trues, texts)
    print("#hr-ward-DCT")
    print(min(ward.labels_), max(ward.labels_))
    pred_true_text_ind_prevPreds_to_not_cluster_hr = change_pred_label(pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_hr_pred_true_text)
    Evaluate(list_hr_pred_true_text + pred_true_text_ind_prevPreds_to_not_cluster_hr)'''

    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)
    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text_ind_prevPred = np.column_stack(
        (ward.labels_, trues, texts, inds, prevPreds)).tolist()
    print("#hr-ward-AVG")
    pred_true_text_ind_prevPreds_to_not_cluster_hr = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_hr_pred_true_text_ind_prevPred)
    Evaluate(list_hr_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_hr)
    #print_by_group(list_hr_pred_true_text+pred_true_text_ind_prevPreds_to_not_cluster_hr)

    print("#spectral-avg")
    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text_ind_prevPred = np.column_stack(
        (clustering.labels_, trues, texts, inds, prevPreds)).tolist()
    pred_true_text_ind_prevPreds_to_not_cluster_spec = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_sp_pred_true_text_ind_prevPred)
    Evaluate(list_sp_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_spec)
def ObtainNumberOfClusters(isByTrueLabel, listtuple_pred_true_text):
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, isByTrueLabel)
    num_clusters = len(dic_tupple_class)
    return num_clusters
Example #21
def print_by_group(listtuple_pred_true_text):
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    for label, pred_true_txts in sorted(dic_tupple_class.items()):
        Print_list_pred_true_text(pred_true_txts)
def clusterByTfIdfFeature(list_pred_true_text):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text)
    dic_tupple_class = groupTxtByClass(list_pred_true_text, False)
    pred_clusters = len(dic_tupple_class)
    print("pred_clusters for k-means=" + str(pred_clusters))

    preds, trues, texts = split_pred_true_txt_from_list(list_pred_true_text)
    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)
    vectorizer = TfidfVectorizer(tokenizer=stem_text, max_df=0.5, min_df=2)
    #vectorizer = TfidfVectorizer(max_df=0.5,min_df=2, stop_words='english')
    X = vectorizer.fit_transform(texts)

    svd = TruncatedSVD(100)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X=X.toarray()
    X = lsa.fit_transform(X)

    km = KMeans(n_clusters=pred_clusters,
                init='k-means++',
                max_iter=100,
                random_state=0)
    km.fit(X)
    list_km_pred_true_text = combine_pred_true_txt_from_list(
        km.labels_, trues, texts)
    print("k-means")
    printClusterEvaluation_list(list_km_pred_true_text)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(
        ward.labels_, trues, texts)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, texts)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)

    brc = Birch(branching_factor=50,
                n_clusters=pred_clusters,
                threshold=0.5,
                compute_labels=True)
    brc.fit_predict(X)
    list_brc_pred_true_text = combine_pred_true_txt_from_list(
        brc.labels_, trues, texts)
    print("brc")
    printClusterEvaluation_list(list_brc_pred_true_text)

    gmm = GaussianMixture(n_components=pred_clusters, covariance_type='full')
    gmm_labels = gmm.fit_predict(X)
    list_gmm_pred_true_text = combine_pred_true_txt_from_list(
        gmm_labels, trues, texts)
    print("gmm")
    printClusterEvaluation_list(list_gmm_pred_true_text)
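# End-to-end toy run of the TF-IDF -> SVD -> k-means path used above, with
# invented texts (TruncatedSVD's n_components must stay below the number of
# TF-IDF features).
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

_toy_texts = ["apple banana fruit", "banana fruit salad",
              "linux kernel patch", "kernel driver update"]
_toy_V = TfidfVectorizer().fit_transform(_toy_texts)
_toy_lsa = make_pipeline(TruncatedSVD(2), Normalizer(copy=False))
_toy_X = _toy_lsa.fit_transform(_toy_V)
_toy_pred = KMeans(n_clusters=2, init='k-means++', random_state=0).fit_predict(_toy_X)
# -> e.g. [0 0 1 1]: the fruit texts and the kernel texts separate cleanly.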