def GenerateTrainTest2_Percentage(percentTrainData, traintestFile, numberOfClusters, textsperlabelDir, trainFile, testFile):
 trainDataRatio = 1.0
		
 listtuple_pred_true_text = ReadPredTrueText(traintestFile)
 perct_tdata = percentTrainData / 100.0
 goodAmount_txts = int(perct_tdata * (len(listtuple_pred_true_text) / numberOfClusters))
 dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
 # write the texts of each group into separate files under textsperlabelDir
 labelNames = WriteTextsOfEachGroup(textsperlabelDir, dic_tupple_class)
 dic_label_outliers = Gen_WriteOutliersEachGroup(textsperlabelDir, numberOfClusters, labelNames)

 train_pred_true_txts = []
 test_pred_true_txts = []

 for label, pred_true_txt in dic_tupple_class.items():
  outlierpreds = dic_label_outliers[str(label)]
  pred_true_txts = dic_tupple_class[str(label)]

  if len(outlierpreds) != len(pred_true_txts):
   print("Size mismatch for label=" + str(label))
  
  outLiers_pred_true_txt = []
  count = -1
  for outPred in outlierpreds:
   outPred = str(outPred)
   count=count+1
   if outPred=="-1":
    outLiers_pred_true_txt.append(pred_true_txts[count])

  test_pred_true_txts.extend(outLiers_pred_true_txt)
  # remove outlier instances from pred_true_txts
  pred_true_txts_good = [e for e in pred_true_txts if e not in outLiers_pred_true_txt]
  dic_tupple_class[str(label)]=pred_true_txts_good

  
 for label, pred_true_txt in dic_tupple_class.items():
  pred_true_txts = dic_tupple_class[str(label)] 
  pred_true_txt_subs= []
  numTrainGoodTexts=int(perct_tdata*len(pred_true_txts))
  if len(pred_true_txts) > goodAmount_txts:
   pred_true_txt_subs.extend(pred_true_txts[0:goodAmount_txts])
   test_pred_true_txts.extend(pred_true_txts[goodAmount_txts:len(pred_true_txts)]) 
  else:
   pred_true_txt_subs.extend(pred_true_txts)
  train_pred_true_txts.extend(pred_true_txt_subs)
 
 trainDataRatio = float(len(train_pred_true_txts))/float(len(train_pred_true_txts+test_pred_true_txts))
 print("trainDataRatio="+str(trainDataRatio))
 #if trainDataRatio<=maxTrainRatio:
 writePredTrueTexts(trainFile,train_pred_true_txts)
 writePredTrueTexts(testFile,test_pred_true_txts) 
   		
 return trainDataRatio
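A minimal usage sketch for the function above; every path and parameter value below is a hypothetical placeholder chosen for illustration, not taken from the project.

# Hypothetical example call; all paths and values are placeholders.
ratio = GenerateTrainTest2_Percentage(
    percentTrainData=70,                               # keep 70% of each cluster for training
    traintestFile="result/batchId_PredTrueText1",      # pred/true/text records, one per line (assumed)
    numberOfClusters=20,
    textsperlabelDir="result/textsperlabel/",
    trainFile="result/train_pred_true_text.txt",
    testFile="result/test_pred_true_text.txt")
print("achieved train ratio", ratio)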
Example #2
    def run_MStream(self, documentSet, outputPath, wordList, AllBatchNum):
        self.D_All = documentSet.D  # The total number of documents
        self.z = {}  # Cluster assignment of each document       (documentID -> clusterID)
        self.m_z = {}  # The number of documents in cluster z    (clusterID -> number of documents)
        self.n_z = {}  # The number of words in cluster z        (clusterID -> number of words)
        self.n_zv = {}  # Occurrences of word v in cluster z     (n_zv[clusterID][wordID] = count)
        self.currentDoc = 0  # Start point of the next batch
        self.startDoc = 0  # Start point of this batch
        self.D = 0  # The number of documents processed so far (may not be used)
        self.K_current = copy.deepcopy(self.K)  # the number of clusters currently containing documents
        # self.BatchSet = {}  # No need to store information of each batch
        self.word_current = {}  # Word-ID list of each batch
        self.f_zs = {}  # feature -> cluster z, e.g. t1 = {w1, w2}: w1 -> c1, w2 -> c1

        # Get batchNum2tweetID by AllBatchNum
        self.getAveBatch(documentSet, AllBatchNum)
        print("batchNum2tweetID is ", self.batchNum2tweetID)

        t11 = datetime.now()

        while self.currentDoc < self.D_All:
            print("Batch", self.batchNum)
            if self.batchNum not in self.batchNum2tweetID:
                break
            dic_docId__cluster = self.customInitialize(documentSet)
            # self.intialize(documentSet)
            # self.gibbsSampling(documentSet)
            self.gibbsSamplingPartial(documentSet, dic_docId__cluster)
            print("\tGibbs sampling successful! Start to saving results.")
            self.output(documentSet, outputPath, wordList, self.batchNum - 1)
            print("\tSaving successful!")
            self.deleteClusters()

            # print(self.f_zs)

        t12 = datetime.now()
        t_diff = t12 - t11
        print("total time diff secs=", t_diff.seconds)

        listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
        Evaluate(listtuple_pred_true_text)
        clusterWriter.close()
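The dictionaries set up at the top of run_MStream are the model's sufficient statistics. As an illustration only, and not the project's Gibbs sampler, the sketch below shows the count updates those comments describe when a single document (a list of word IDs) is added to or removed from a cluster.

# Illustrative bookkeeping for m_z, n_z and n_zv; plain dicts stand in for the class attributes.
def add_doc_to_cluster(doc_words, cluster_id, m_z, n_z, n_zv):
    m_z[cluster_id] = m_z.get(cluster_id, 0) + 1                 # one more document in the cluster
    n_z[cluster_id] = n_z.get(cluster_id, 0) + len(doc_words)    # its words join the cluster total
    word_counts = n_zv.setdefault(cluster_id, {})
    for w in doc_words:
        word_counts[w] = word_counts.get(w, 0) + 1               # per-word occurrence counts

def remove_doc_from_cluster(doc_words, cluster_id, m_z, n_z, n_zv):
    m_z[cluster_id] -= 1
    n_z[cluster_id] -= len(doc_words)
    for w in doc_words:
        n_zv[cluster_id][w] -= 1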
"/home/owner/PhD/MStream-master/MStream/result/NewsPredTueTextMStream_WordArr.txt",
"/home/owner/PhD/MStream-master/MStream/result/NewsPredTueTextMStreamSemantic_WordArr.txt"'''

predTrueTextfiles = [
    "/home/owner/PhD/MStream-master/MStream/result/batchId_PredTrueText1"
]

merged_new_label_pred_true_txts_all_batch = []
non_outlier_pred_true_txts_all_batch = []
mstream_pred_true_txts_all_batch = []

gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"
#wordVectorsDic = extractAllWordVecs(gloveFile, 300)

for predTrueTextfile in predTrueTextfiles:
    listtuple_pred_true_text = ReadPredTrueText(predTrueTextfile)

    #remove outliers from each cluster by connected components
    #assign those outliers to the clusters based on common words
    #find out the entropy of each word using cluster distribution
    #remove high entropy words (needs logic) from each text
    #find the embedding of each text
    #cluster the texts using hac + sd method
    #cluster text by tf-idf feature

    outlier_pred_true_texts, non_outlier_pred_true_txts, avgItemsInCluster = removeOutlierConnectedComponentLexical(
        listtuple_pred_true_text)

    #change pred labels
    newOutlier_pred_true_txts = change_pred_label(outlier_pred_true_texts,
                                                  1000)
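One step listed in the comment block above is computing the entropy of each word from its distribution over clusters. The sketch below is a minimal, assumption-laden version of that computation: it assumes each grouped item is a [pred_label, true_label, text] list whose text is a whitespace-separated string, which may differ from the project's actual record layout.

import math
from collections import defaultdict

def word_cluster_entropy(dic_tupple_class):
    # Count how often each word appears in each cluster.
    word_clusterCounts = defaultdict(lambda: defaultdict(int))
    for label, pred_true_txts in dic_tupple_class.items():
        for item in pred_true_txts:
            text = item[2]  # assumed layout: [pred_label, true_label, text]
            for word in str(text).split():
                word_clusterCounts[word][label] += 1
    # Entropy of each word's cluster distribution; high entropy = word spread over many clusters.
    word_entropy = {}
    for word, counts in word_clusterCounts.items():
        total = float(sum(counts.values()))
        word_entropy[word] = -sum((c / total) * math.log(c / total, 2)
                                  for c in counts.values())
    return word_entropy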
c_bitermsFreqs = {}  # assumed missing in this excerpt; it is passed to cluster_biterm below
c_totalBiterms = {}
c_wordsFreqs = {}
c_totalWords = {}
c_txtIds = {}
c_clusterVecs = {}
txtId_txt = {}
last_txtId = 0
max_c_id = 0
dic_clus__id = {}

dic_biterm__clusterIds = {}



f = open(resultFile, 'w')

t11=datetime.now()

(c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs,
 txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterIds) = cluster_biterm(
    f, list_pred_true_words_index, c_bitermsFreqs, c_totalBiterms, c_wordsFreqs,
    c_totalWords, c_txtIds, c_clusterVecs, txtId_txt, last_txtId, max_c_id,
    wordVectorsDic, dic_clus__id, dic_biterm__clusterIds)


t12=datetime.now()	  
t_diff = t12-t11
print("total time diff secs=",t_diff.seconds)  

f.close()
  
listtuple_pred_true_text=ReadPredTrueText(resultFile, ignoreMinusOne)

print('result for', dataset)
Evaluate_old(listtuple_pred_true_text)  
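The c_bitermsFreqs / c_totalBiterms structures suggest that cluster_biterm keeps per-cluster biterm (unordered word-pair) counts. The helper below is only a hypothetical illustration of how biterms can be extracted from a short text; it is not taken from the project's code.

from itertools import combinations

def extract_biterms(words):
    # A biterm is an unordered pair of distinct words co-occurring in the same short text;
    # sorting each pair makes (a, b) and (b, a) count as the same biterm.
    return [tuple(sorted(pair)) for pair in combinations(set(words), 2)]

# extract_biterms(["cheap", "flight", "deal"]) -> pairs such as ('cheap', 'flight'),
# ('cheap', 'deal'), ('deal', 'flight'); the list order may vary because a set is used.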
Example #5
def printClusterEvaluation_file(pred_true_text_file):
    listtuple_pred_true_text = ReadPredTrueText(pred_true_text_file)
    printClusterEvaluation_list(listtuple_pred_true_text)
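A hypothetical call, reusing the pred/true/text result path that appears in the later examples:

printClusterEvaluation_file("result/batchId_PredTrueText1")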
Example #6
    def run_MStreamF(self, documentSet, outputPath, wordList, AllBatchNum):
        self.D_All = documentSet.D  # The total number of documents
        self.z = {}  # Cluster assignment of each document       (documentID -> clusterID)
        self.m_z = {}  # The number of documents in cluster z    (clusterID -> number of documents)
        self.n_z = {}  # The number of words in cluster z        (clusterID -> number of words)
        self.n_zv = {}  # Occurrences of word v in cluster z     (n_zv[clusterID][wordID] = count)
        self.currentDoc = 0  # Start point of the next batch
        self.startDoc = 0  # Start point of this batch
        self.D = 0  # The number of documents processed so far
        self.K_current = copy.deepcopy(self.K)  # the number of clusters currently containing documents
        self.BatchSet = {}  # Stores the statistics of each batch
        self.word_current = {}  # Word-ID list of each batch
        self.f_zs = {}  # feature -> list of clusters, e.g. t1 = {w1, w2}: w1 -> c1, w2 -> c1

        # Get batchNum2tweetID by AllBatchNum
        self.getAveBatch(documentSet, AllBatchNum)
        print("batchNum2tweetID is ", self.batchNum2tweetID)

        t11 = datetime.now()
        while self.currentDoc < self.D_All:
            print("Batch", self.batchNum)
            if self.batchNum not in self.batchNum2tweetID:
                break
            if self.batchNum <= self.Max_Batch:
                self.BatchSet[self.batchNum] = {}
                self.BatchSet[self.batchNum]['D'] = copy.deepcopy(self.D)
                self.BatchSet[self.batchNum]['z'] = copy.deepcopy(self.z)
                self.BatchSet[self.batchNum]['m_z'] = copy.deepcopy(self.m_z)
                self.BatchSet[self.batchNum]['n_z'] = copy.deepcopy(self.n_z)
                self.BatchSet[self.batchNum]['n_zv'] = copy.deepcopy(self.n_zv)

                dic_docId__cluster = self.customInitialize(documentSet)
                # self.intialize(documentSet)
                self.gibbsSamplingPartial(documentSet, dic_docId__cluster)

                # self.gibbsSampling(documentSet)
            else:
                # remove influence of batch earlier than Max_Batch
                self.D -= self.BatchSet[self.batchNum - self.Max_Batch]['D']
                for cluster in self.m_z:
                    if cluster in self.BatchSet[self.batchNum - self.Max_Batch]['m_z']:
                        self.m_z[cluster] -= self.BatchSet[self.batchNum - self.Max_Batch]['m_z'][cluster]
                        self.n_z[cluster] -= self.BatchSet[self.batchNum - self.Max_Batch]['n_z'][cluster]
                        for word in self.n_zv[cluster]:
                            if word in self.BatchSet[self.batchNum - self.Max_Batch]['n_zv'][cluster]:
                                self.n_zv[cluster][word] -= \
                                    self.BatchSet[self.batchNum - self.Max_Batch]['n_zv'][cluster][word]
                for cluster in range(self.K):
                    self.checkEmpty(cluster)
                self.BatchSet.pop(self.batchNum - self.Max_Batch)
                self.BatchSet[self.batchNum] = {}
                self.BatchSet[self.batchNum]['D'] = copy.deepcopy(self.D)
                self.BatchSet[self.batchNum]['z'] = copy.deepcopy(self.z)
                self.BatchSet[self.batchNum]['m_z'] = copy.deepcopy(self.m_z)
                self.BatchSet[self.batchNum]['n_z'] = copy.deepcopy(self.n_z)
                self.BatchSet[self.batchNum]['n_zv'] = copy.deepcopy(self.n_zv)

                dic_docId__cluster = self.customInitialize(documentSet)
                # self.intialize(documentSet)

                # self.gibbsSampling(documentSet)
                self.gibbsSamplingPartial(documentSet, dic_docId__cluster)
            # get influence of only the current batch (remove other influence)
            self.BatchSet[self.batchNum - 1]['D'] = self.D - self.BatchSet[self.batchNum - 1]['D']
            for cluster in self.m_z:
                if cluster not in self.BatchSet[self.batchNum - 1]['m_z']:
                    self.BatchSet[self.batchNum - 1]['m_z'][cluster] = 0
                if cluster not in self.BatchSet[self.batchNum - 1]['n_z']:
                    self.BatchSet[self.batchNum - 1]['n_z'][cluster] = 0
                self.BatchSet[self.batchNum - 1]['m_z'][cluster] = self.m_z[cluster] - \
                                                                   self.BatchSet[self.batchNum - 1]['m_z'][cluster]
                self.BatchSet[self.batchNum - 1]['n_z'][cluster] = self.n_z[cluster] - \
                                                                   self.BatchSet[self.batchNum - 1]['n_z'][cluster]
                if cluster not in self.BatchSet[self.batchNum - 1]['n_zv']:
                    self.BatchSet[self.batchNum - 1]['n_zv'][cluster] = {}
                for word in self.n_zv[cluster]:
                    if word not in self.BatchSet[self.batchNum - 1]['n_zv'][cluster]:
                        self.BatchSet[self.batchNum - 1]['n_zv'][cluster][word] = 0
                    self.BatchSet[self.batchNum - 1]['n_zv'][cluster][word] = \
                        self.n_zv[cluster][word] - self.BatchSet[self.batchNum - 1]['n_zv'][cluster][word]
            print("\tGibbs sampling successful! Start to saving results.")
            self.output(documentSet, outputPath, wordList, self.batchNum - 1)
            print("\tSaving successful!")
            # print(self.f_zs)
            self.deleteClusters()

        t12 = datetime.now()
        t_diff = t12 - t11
        print("total time diff secs=", t_diff.seconds)

        listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
        Evaluate_old(listtuple_pred_true_text)
        clusterWriter.close()
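The else-branch above implements a sliding window over batches: before a new batch is processed, the counts contributed by the batch that falls out of the window (batchNum - Max_Batch) are subtracted from the global statistics. The standalone sketch below condenses that subtraction onto plain dictionaries; it mirrors the code above but is not part of the class.

def subtract_batch_influence(old_batch, D, m_z, n_z, n_zv):
    # old_batch is one per-batch snapshot as stored in BatchSet,
    # holding 'D', 'm_z', 'n_z' and 'n_zv' saved at the start of that batch.
    D -= old_batch['D']
    for cluster in m_z:
        if cluster in old_batch['m_z']:
            m_z[cluster] -= old_batch['m_z'][cluster]
            n_z[cluster] -= old_batch['n_z'][cluster]
            for word in n_zv[cluster]:
                if word in old_batch['n_zv'][cluster]:
                    n_zv[cluster][word] -= old_batch['n_zv'][cluster][word]
    return D  # the caller keeps the reduced document count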
Example #7
import numpy as np
from groupTxt_ByClass import groupItemsBySingleKeyIndex
from word_vec_extractor import extractAllWordVecs
from read_pred_true_text import ReadPredTrueText
from sent_vecgenerator import generate_sent_vecs_toktextdata
from general_util import print_by_group
from sklearn.cluster import SpectralClustering
from txt_process_util import RemoveHighClusterEntropyWordsIndex
from sklearn.feature_extraction.text import TfidfVectorizer
from cluster_file_connected_component import clusterByConnectedComponentIndex
from cluster_file_leadNonOverlapWords import clusterByLeadingOnOverlappingWords
#from general_util import Print_list_pred_true_text

gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"

listtuple_pred_true_text = ReadPredTrueText("result/batchId_PredTrueText1")
newList = []
i = -1
for pred_true_text in listtuple_pred_true_text:
    i = i + 1
    newList.append(pred_true_text + [i, i])
listtuple_pred_true_text = newList

listtuple_pred_true_text = RemoveHighClusterEntropyWordsIndex(
    listtuple_pred_true_text)

dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)

#wordVectorsDic = extractAllWordVecs(gloveFile, 300)

for label, cluster_pred_true_txt_inds in dic_tupple_class.items():
    pass  # per-group processing; the loop body is not shown in this excerpt
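Judging only from the imports at the top of this example (TfidfVectorizer, SpectralClustering), one plausible, purely illustrative per-group step is to re-cluster each group's texts on tf-idf features. Nothing below is taken from the source; the author may instead use generate_sent_vecs_toktextdata or clusterByConnectedComponentIndex here.

# Hypothetical per-group helper; assumed item layout is [pred_label, true_label, text, ...].
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering

def cluster_group_by_tfidf(cluster_pred_true_txt_inds, n_subclusters=2):
    texts = [str(item[2]) for item in cluster_pred_true_txt_inds]
    tfidf = TfidfVectorizer().fit_transform(texts)
    model = SpectralClustering(n_clusters=n_subclusters,
                               assign_labels="discretize", random_state=0)
    return model.fit_predict(tfidf.toarray())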
Example #8
                h_count = max_hit + 100
                break
            # if h_count > max_hit:
            #    break

        # if h_count > max_hit:
        #    break

        if cluscount > 1000:
            break

    if not found:
        print('not\t' + str(h_count) + '\t' + str(test_oCPost.soPostId) + '\t' +
              str(test_oCPost.tagWords) + '\t' + str(test_oCPost.trueLabel))


listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)  # before 0
# print(dic_tupple_class)
dic_term_clusterIds, dic_cluster_ftrs, dic_cluster_size = createTermToClsuetrId(dic_tupple_class)





#############test
test_list_CPost = readStackOverflowDataSetTagTitleBody(testFile)
# print(test_list_CPost)
for oCPost in test_list_CPost:
    terms = oCPost.tagWords
    test_term_dict = Counter(terms)
    test_term_size = len(terms)
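The truncated test loop above builds a Counter over each test post's tag words, and dic_term_clusterIds maps a term to the clusters that contain it. The sketch below shows one plausible way to turn those two pieces into a ranked list of candidate clusters; it is an assumption for illustration, not the scoring the source actually uses.

from collections import Counter

def score_clusters_by_shared_terms(terms, dic_term_clusterIds):
    # Count, for every candidate cluster, how many of the post's distinct tag words point to it.
    cluster_scores = Counter()
    for term in set(terms):
        for cluster_id in dic_term_clusterIds.get(term, []):
            cluster_scores[cluster_id] += 1
    return cluster_scores.most_common()  # [(cluster_id, shared_term_count), ...] best first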