def GenerateTrainTest2_Percentage(percentTrainData, traintestFile, numberOfClusters, textsperlabelDir, trainFile, testFile):
    trainDataRatio = 1.0
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    perct_tdata = percentTrainData / 100
    goodAmount_txts = int(perct_tdata * (len(listtuple_pred_true_text) / numberOfClusters))
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    # write texts of each group in textsperlabelDir
    labelNames = WriteTextsOfEachGroup(textsperlabelDir, dic_tupple_class)
    dic_label_outliers = Gen_WriteOutliersEachGroup(textsperlabelDir, numberOfClusters, labelNames)

    train_pred_true_txts = []
    test_pred_true_txts = []

    for label, pred_true_txt in dic_tupple_class.items():
        outlierpreds = dic_label_outliers[str(label)]
        pred_true_txts = dic_tupple_class[str(label)]
        if len(outlierpreds) != len(pred_true_txts):
            print("Size not match for=" + str(label))
        outLiers_pred_true_txt = []
        count = -1
        for outPred in outlierpreds:
            outPred = str(outPred)
            count = count + 1
            if outPred == "-1":
                outLiers_pred_true_txt.append(pred_true_txts[count])
        test_pred_true_txts.extend(outLiers_pred_true_txt)
        # remove outlier instances from pred_true_txts
        pred_true_txts_good = [e for e in pred_true_txts if e not in outLiers_pred_true_txt]
        dic_tupple_class[str(label)] = pred_true_txts_good

    for label, pred_true_txt in dic_tupple_class.items():
        pred_true_txts = dic_tupple_class[str(label)]
        pred_true_txt_subs = []
        numTrainGoodTexts = int(perct_tdata * len(pred_true_txts))  # currently unused
        if len(pred_true_txts) > goodAmount_txts:
            pred_true_txt_subs.extend(pred_true_txts[0:goodAmount_txts])
            test_pred_true_txts.extend(pred_true_txts[goodAmount_txts:len(pred_true_txts)])
        else:
            pred_true_txt_subs.extend(pred_true_txts)
        train_pred_true_txts.extend(pred_true_txt_subs)

    trainDataRatio = float(len(train_pred_true_txts)) / float(len(train_pred_true_txts + test_pred_true_txts))
    print("trainDataRatio=" + str(trainDataRatio))
    # if trainDataRatio <= maxTrainRatio:
    writePredTrueTexts(trainFile, train_pred_true_txts)
    writePredTrueTexts(testFile, test_pred_true_txts)
    return trainDataRatio
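# Hedged usage sketch (not part of the original repo): all paths and numbers
# below are hypothetical placeholders, and ReadPredTrueText/groupTxtByClass/
# writePredTrueTexts are assumed to be imported as elsewhere in this codebase.
# The call keeps roughly 70% of each cluster for training, routes the rest plus
# the "-1"-flagged outliers to the test file, and returns the achieved ratio.
achievedRatio = GenerateTrainTest2_Percentage(
    percentTrainData=70,
    traintestFile="result/batchId_PredTrueText1",
    numberOfClusters=20,
    textsperlabelDir="result/textsperlabel/",
    trainFile="result/train_pred_true_text.txt",
    testFile="result/test_pred_true_text.txt")
print("achieved train ratio:", achievedRatio)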
def run_MStream(self, documentSet, outputPath, wordList, AllBatchNum):
    self.D_All = documentSet.D  # The whole number of documents
    self.z = {}  # Cluster assignments of each document (documentID -> clusterID)
    self.m_z = {}  # The number of documents in cluster z (clusterID -> number of documents)
    self.n_z = {}  # The number of words in cluster z (clusterID -> number of words)
    self.n_zv = {}  # The number of occurrences of word v in cluster z (n_zv[clusterID][wordID] = number)
    self.currentDoc = 0  # Store start point of next batch
    self.startDoc = 0  # Store start point of this batch
    self.D = 0  # The number of documents currently; may not be used
    self.K_current = copy.deepcopy(self.K)  # The number of clusters containing documents currently
    # self.BatchSet = {}  # No need to store information of each batch
    self.word_current = {}  # Store word-IDs' list of each batch
    self.f_zs = {}  # feature to cluster z, e.g. t1 = {w1, w2}, w1 -> c1, w2 -> c1

    # Get batchNum2tweetID by AllBatchNum
    self.getAveBatch(documentSet, AllBatchNum)
    print("batchNum2tweetID is ", self.batchNum2tweetID)

    t11 = datetime.now()
    while self.currentDoc < self.D_All:
        print("Batch", self.batchNum)
        if self.batchNum not in self.batchNum2tweetID:
            break
        dic_docId__cluster = self.customInitialize(documentSet)  # self.intialize(documentSet)
        # self.gibbsSampling(documentSet)
        self.gibbsSamplingPartial(documentSet, dic_docId__cluster)
        print("\tGibbs sampling successful! Start to save results.")
        self.output(documentSet, outputPath, wordList, self.batchNum - 1)
        print("\tSaving successful!")
        self.deleteClusters()
        # print(self.f_zs)
    t12 = datetime.now()
    t_diff = t12 - t11
    print("total time diff secs=", t_diff.seconds)

    listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
    Evaluate(listtuple_pred_true_text)
    clusterWriter.close()

"/home/owner/PhD/MStream-master/MStream/result/NewsPredTueTextMStream_WordArr.txt",
"/home/owner/PhD/MStream-master/MStream/result/NewsPredTueTextMStreamSemantic_WordArr.txt"'''

predTrueTextfiles = [
    "/home/owner/PhD/MStream-master/MStream/result/batchId_PredTrueText1"
]

merged_new_label_pred_true_txts_all_batch = []
non_outlier_pred_true_txts_all_batch = []
mstream_pred_true_txts_all_batch = []

gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"
# wordVectorsDic = extractAllWordVecs(gloveFile, 300)

for predTrueTextfile in predTrueTextfiles:
    listtuple_pred_true_text = ReadPredTrueText(predTrueTextfile)
    # remove outliers from each cluster by connected components
    # assign those outliers to the clusters based on common words
    # find out the entropy of each word using the cluster distribution
    # remove high-entropy words (needs logic) from each text
    # find the embedding of each text
    # cluster the texts using the hac + sd method
    # cluster texts by tf-idf features
    outlier_pred_true_texts, non_outlier_pred_true_txts, avgItemsInCluster = removeOutlierConnectedComponentLexical(
        listtuple_pred_true_text)
    # change pred labels
    newOutlier_pred_true_txts = change_pred_label(outlier_pred_true_texts, 1000)
c_totalBiterms = {}
c_wordsFreqs = {}
c_totalWords = {}
c_txtIds = {}
c_clusterVecs = {}
txtId_txt = {}
last_txtId = 0
max_c_id = 0
dic_clus__id = {}
dic_biterm__clusterIds = {}

f = open(resultFile, 'w')

t11 = datetime.now()
c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs, \
    txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterIds = cluster_biterm(
        f, list_pred_true_words_index, c_bitermsFreqs, c_totalBiterms, c_wordsFreqs,
        c_totalWords, c_txtIds, c_clusterVecs, txtId_txt, last_txtId, max_c_id,
        wordVectorsDic, dic_clus__id, dic_biterm__clusterIds)
t12 = datetime.now()
t_diff = t12 - t11
print("total time diff secs=", t_diff.seconds)

f.close()

listtuple_pred_true_text = ReadPredTrueText(resultFile, ignoreMinusOne)
print('result for', dataset)
Evaluate_old(listtuple_pred_true_text)
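# Hedged sketch of biterm extraction, to make the c_bitermsFreqs /
# dic_biterm__clusterIds bookkeeping above easier to follow. A biterm is taken
# here to be an unordered pair of distinct words co-occurring in one short
# text; the helper name and the "w1 w2" key format are illustrative only and
# may differ from what cluster_biterm actually uses internally.
from itertools import combinations

def extract_biterms_sketch(words):
    # sort first so "flight cheap" and "cheap flight" yield the same keys
    return [" ".join(pair) for pair in combinations(sorted(set(words)), 2)]

# extract_biterms_sketch(["cheap", "flight", "deal"])
# -> ['cheap deal', 'cheap flight', 'deal flight']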
def printClusterEvaluation_file(pred_true_text_file):
    listtuple_pred_true_text = ReadPredTrueText(pred_true_text_file)
    printClusterEvaluation_list(listtuple_pred_true_text)
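# Hedged usage sketch: the path is a placeholder for any pred/true/text file
# produced by this pipeline (the batch result files used elsewhere here follow
# the same format).
printClusterEvaluation_file("result/batchId_PredTrueText1")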
def run_MStreamF(self, documentSet, outputPath, wordList, AllBatchNum):
    self.D_All = documentSet.D  # The whole number of documents
    self.z = {}  # Cluster assignments of each document (documentID -> clusterID)
    self.m_z = {}  # The number of documents in cluster z (clusterID -> number of documents)
    self.n_z = {}  # The number of words in cluster z (clusterID -> number of words)
    self.n_zv = {}  # The number of occurrences of word v in cluster z (n_zv[clusterID][wordID] = number)
    self.currentDoc = 0  # Store start point of next batch
    self.startDoc = 0  # Store start point of this batch
    self.D = 0  # The number of documents currently
    self.K_current = copy.deepcopy(self.K)  # The number of clusters containing documents currently
    self.BatchSet = {}  # Store information of each batch
    self.word_current = {}  # Store word-IDs' list of each batch
    self.f_zs = {}  # feature to list of clusters, e.g. t1 = {w1, w2}, w1 -> c1, w2 -> c1

    # Get batchNum2tweetID by AllBatchNum
    self.getAveBatch(documentSet, AllBatchNum)
    print("batchNum2tweetID is ", self.batchNum2tweetID)

    t11 = datetime.now()
    while self.currentDoc < self.D_All:
        print("Batch", self.batchNum)
        if self.batchNum not in self.batchNum2tweetID:
            break
        if self.batchNum <= self.Max_Batch:
            self.BatchSet[self.batchNum] = {}
            self.BatchSet[self.batchNum]['D'] = copy.deepcopy(self.D)
            self.BatchSet[self.batchNum]['z'] = copy.deepcopy(self.z)
            self.BatchSet[self.batchNum]['m_z'] = copy.deepcopy(self.m_z)
            self.BatchSet[self.batchNum]['n_z'] = copy.deepcopy(self.n_z)
            self.BatchSet[self.batchNum]['n_zv'] = copy.deepcopy(self.n_zv)
            dic_docId__cluster = self.customInitialize(documentSet)  # self.intialize(documentSet)
            self.gibbsSamplingPartial(documentSet, dic_docId__cluster)  # self.gibbsSampling(documentSet)
        else:
            # remove influence of the batch earlier than Max_Batch
            self.D -= self.BatchSet[self.batchNum - self.Max_Batch]['D']
            for cluster in self.m_z:
                if cluster in self.BatchSet[self.batchNum - self.Max_Batch]['m_z']:
                    self.m_z[cluster] -= self.BatchSet[self.batchNum - self.Max_Batch]['m_z'][cluster]
                    self.n_z[cluster] -= self.BatchSet[self.batchNum - self.Max_Batch]['n_z'][cluster]
                    for word in self.n_zv[cluster]:
                        if word in self.BatchSet[self.batchNum - self.Max_Batch]['n_zv'][cluster]:
                            self.n_zv[cluster][word] -= \
                                self.BatchSet[self.batchNum - self.Max_Batch]['n_zv'][cluster][word]
            for cluster in range(self.K):
                self.checkEmpty(cluster)
            self.BatchSet.pop(self.batchNum - self.Max_Batch)
            self.BatchSet[self.batchNum] = {}
            self.BatchSet[self.batchNum]['D'] = copy.deepcopy(self.D)
            self.BatchSet[self.batchNum]['z'] = copy.deepcopy(self.z)
            self.BatchSet[self.batchNum]['m_z'] = copy.deepcopy(self.m_z)
            self.BatchSet[self.batchNum]['n_z'] = copy.deepcopy(self.n_z)
            self.BatchSet[self.batchNum]['n_zv'] = copy.deepcopy(self.n_zv)
            dic_docId__cluster = self.customInitialize(documentSet)  # self.intialize(documentSet)
            # self.gibbsSampling(documentSet)
            self.gibbsSamplingPartial(documentSet, dic_docId__cluster)
        # get influence of only the current batch (remove other influence)
        self.BatchSet[self.batchNum - 1]['D'] = self.D - self.BatchSet[self.batchNum - 1]['D']
        for cluster in self.m_z:
            if cluster not in self.BatchSet[self.batchNum - 1]['m_z']:
                self.BatchSet[self.batchNum - 1]['m_z'][cluster] = 0
            if cluster not in self.BatchSet[self.batchNum - 1]['n_z']:
                self.BatchSet[self.batchNum - 1]['n_z'][cluster] = 0
            self.BatchSet[self.batchNum - 1]['m_z'][cluster] = \
                self.m_z[cluster] - self.BatchSet[self.batchNum - 1]['m_z'][cluster]
            self.BatchSet[self.batchNum - 1]['n_z'][cluster] = \
                self.n_z[cluster] - self.BatchSet[self.batchNum - 1]['n_z'][cluster]
            if cluster not in self.BatchSet[self.batchNum - 1]['n_zv']:
                self.BatchSet[self.batchNum - 1]['n_zv'][cluster] = {}
            for word in self.n_zv[cluster]:
                if word not in self.BatchSet[self.batchNum - 1]['n_zv'][cluster]:
                    self.BatchSet[self.batchNum - 1]['n_zv'][cluster][word] = 0
                self.BatchSet[self.batchNum - 1]['n_zv'][cluster][word] = \
                    self.n_zv[cluster][word] - self.BatchSet[self.batchNum - 1]['n_zv'][cluster][word]
        print("\tGibbs sampling successful! Start to save results.")
        self.output(documentSet, outputPath, wordList, self.batchNum - 1)
        print("\tSaving successful!")
        # print(self.f_zs)
        self.deleteClusters()
    t12 = datetime.now()
    t_diff = t12 - t11
    print("total time diff secs=", t_diff.seconds)

    listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
    Evaluate_old(listtuple_pred_true_text)
    clusterWriter.close()
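# Hedged miniature of the sliding-window "forgetting" bookkeeping used in
# run_MStreamF above, with plain dicts instead of the model state. Per-batch
# deltas are stored, and once a batch falls outside the Max_Batch window its
# delta is subtracted again so the live counts reflect only recent batches.
# All names here are illustrative, not taken from the repo.
live_doc_counts = {}   # clusterID -> number of documents currently counted
batch_deltas = {}      # batchNum  -> {clusterID: documents added in that batch}
MAX_BATCH_WINDOW = 2   # hypothetical window size

def add_batch_sketch(batch_num, delta):
    batch_deltas[batch_num] = delta
    for c, d in delta.items():
        live_doc_counts[c] = live_doc_counts.get(c, 0) + d
    expired = batch_num - MAX_BATCH_WINDOW
    if expired in batch_deltas:
        # forget the contribution of the batch that just left the window
        for c, d in batch_deltas.pop(expired).items():
            live_doc_counts[c] -= d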
import numpy as np
from groupTxt_ByClass import groupItemsBySingleKeyIndex
from word_vec_extractor import extractAllWordVecs
from read_pred_true_text import ReadPredTrueText
from sent_vecgenerator import generate_sent_vecs_toktextdata
from general_util import print_by_group
from sklearn.cluster import SpectralClustering
from txt_process_util import RemoveHighClusterEntropyWordsIndex
from sklearn.feature_extraction.text import TfidfVectorizer
from cluster_file_connected_component import clusterByConnectedComponentIndex
from cluster_file_leadNonOverlapWords import clusterByLeadingOnOverlappingWords
# from general_util import Print_list_pred_true_text

gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"

listtuple_pred_true_text = ReadPredTrueText("result/batchId_PredTrueText1")

newList = []
i = -1
for pred_true_text in listtuple_pred_true_text:
    i = i + 1
    newList.append(pred_true_text + [i, i])
listtuple_pred_true_text = newList

listtuple_pred_true_text = RemoveHighClusterEntropyWordsIndex(listtuple_pred_true_text)
dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)
# wordVectorsDic = extractAllWordVecs(gloveFile, 300)

for label, cluster_pred_true_txt_inds in dic_tupple_class.items():
            h_count = max_hit + 100
            break
            # if h_count > max_hit:
            #     break
        # if h_count > max_hit:
        #     break
        if cluscount > 1000:
            break
    if not found:
        print('not\t' + str(h_count) + '\t' + str(test_oCPost.soPostId) + '\t' + str(test_oCPost.tagWords) + '\t' + str(test_oCPost.trueLabel))

listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)  # before 0
# print(dic_tupple_class)
dic_term_clusterIds, dic_cluster_ftrs, dic_cluster_size = createTermToClsuetrId(dic_tupple_class)

# ############ test
test_list_CPost = readStackOverflowDataSetTagTitleBody(testFile)
# print(test_list_CPost)

for oCPost in test_list_CPost:
    terms = oCPost.tagWords
    test_term_dict = Counter(terms)
    test_term_size = len(terms)