# Assumed module-level imports; helper functions (construct_biterms,
# findCloseCluster, populateClusterFeature, Evaluate,
# generate_sent_vecs_toktextdata, ...) and globals (embedDim, ignoreMinusOne,
# isSemantic, max_cposts, DeleteInterval, microDivide, max_hitindex,
# outfileName) are defined elsewhere in this file/package.
from collections import Counter
from datetime import datetime
import math
import statistics


def cluster_biterm(f,
                   list_pred_true_words_index,
                   c_bitermsFreqs={},
                   c_totalBiterms={},
                   c_wordsFreqs={},
                   c_totalWords={},
                   c_txtIds={},
                   c_clusterVecs={},
                   txtId_txt={},
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic={},
                   dic_clus__id={},
                   dic_biterm__clusterId_Freq={},
                   dic_biterm__allClusterFreq={}):
    # NOTE: the mutable default arguments ({}) are shared across calls;
    # callers are expected to pass in and reuse the state this function
    # returns.
    print("cluster_biterm")
    current_txt_id = last_txtId
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in list_pred_true_words_index:
        words = item[2]
        bi_terms = construct_biterms(words)
        current_txt_id += 1
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        # Semantic vectors are disabled in this version; a zero vector is used
        # instead of generate_sent_vecs_toktextdata([words], wordVectorsDic,
        # embedDim).
        text_Vec = [0] * embedDim

        clusterId = findCloseCluster(c_bitermsFreqs, c_totalBiterms, c_txtIds,
                                     c_wordsFreqs, c_totalWords, c_clusterVecs,
                                     txtBitermsFreqs, bi_terms_len,
                                     txtWordsFreqs, words_len, max_c_id,
                                     text_Vec)

        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterId_Freq,
         dic_biterm__allClusterFreq) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)

        # Disabled: periodic removeHighEntropyFtrs(...) pruning every 1000
        # texts.

        eval_pred_true_txt.append([clusterId, item[1], item[2]])

        if ignoreMinusOne == True:
            if str(item[1]) != '-1':
                f.write(str(clusterId) + " " + str(item[1]) + " " +
                        str(item[2]) + "\n")
        else:
            f.write(str(clusterId) + " " + str(item[1]) + " " + str(item[2]) +
                    "\n")

        if line_count % 500 == 0:
            print(len(dic_clus__id))
            # Delete old and small clusters; remove multi-cluster words from
            # clusters.
            list_c_sizes = []
            list_c_ids = []
            for c_id, txtIds in c_txtIds.items():
                list_c_sizes.append(len(txtIds))
                list_c_ids.append(dic_clus__id[c_id])

            mean_c_size = statistics.mean(list_c_sizes)
            std_c_size = statistics.stdev(list_c_sizes)
            mean_c_id = statistics.mean(list_c_ids)
            std_c_id = statistics.stdev(list_c_ids)

            print('process', line_count, 'texts', 'mean_c_size', mean_c_size,
                  'std_c_size', std_c_size)
            print('process', line_count, 'texts', 'mean_c_id', mean_c_id,
                  'std_c_id', std_c_id)

            list_del_cids = []
            # (Earlier size-only deletion heuristics are disabled; the active
            # rule below combines cluster age, via the id order, with cluster
            # size.)
            for c_id, orderId in dic_clus__id.items():
                if c_id not in c_txtIds:
                    continue
                c_size = len(c_txtIds[c_id])
                if (float(c_id) <= float(abs(mean_c_id - std_c_id))
                        or float(orderId) <= float(abs(mean_c_id - std_c_id))
                    ) and (c_size <= 1 or float(c_size) <= float(
                        abs(mean_c_size - std_c_size))
                           or float(c_size) >= mean_c_size + std_c_size):
                    list_del_cids.append(c_id)

            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)',
                  len(c_bitermsFreqs))

            for c_id in list_del_cids:
                del c_bitermsFreqs[c_id]
                del c_totalBiterms[c_id]
                del c_txtIds[c_id]
                del c_wordsFreqs[c_id]
                del c_totalWords[c_id]
                del dic_clus__id[c_id]
                # Disabled: del c_clusterVecs[c_id] and the per-biterm cleanup
                # of dic_biterm__clusterId_Freq / dic_biterm__allClusterFreq.

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)
            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq
    ]
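# The biterm construction used above is defined elsewhere in this file. For
# reference, a minimal sketch of what it is assumed to do: emit every
# unordered pair of distinct token positions in the text, which is the
# standard biterm definition. The name and the space-joined string format are
# illustrative assumptions, not the actual helper.
def construct_biterms_sketch(words):
    """Return 'w1 w2' biterm strings for every unordered token pair."""
    biterms = []
    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            w1, w2 = sorted((words[i], words[j]))  # order-insensitive pair
            biterms.append(w1 + ' ' + w2)
    return biterms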
def cluster_biterm_framework(f, list_CPost, c_CFVector, max_c_id,
                             dic_txtId__CPost, wordVectorsDic, dic_clus__id,
                             dic_bitermTag__clusterIds,
                             dic_bitermTitle__clusterIds,
                             dic_bitermBody__clusterIds, dic_ngram__txtIds,
                             min_gram, max_gram, oCSimilarityFlgas,
                             c_itemsCount):
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for oCPost in list_CPost:
        trueLabel = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        id = oCPost.id
        soPostId = oCPost.soPostId
        createtime = oCPost.createtime
        print('id', id, 'tagWords', tagWords, 'titleWords', titleWords,
              'bodyWords', bodyWords)

        txtBitermsFreqs_Tag = None
        bi_terms_len_Tag = 0
        txtBitermsFreqs_Title = None
        bi_terms_len_Title = 0
        txtBitermsFreqs_Body = None
        bi_terms_len_Body = 0
        text_VecTag = None
        text_VecTitle = None
        text_VecBody = None
        targetClusterIds = []

        dic_txtId__CPost[id] = oCPost

        if oCSimilarityFlgas.isTagSim:
            bi_termsTag = construct_biterms(tagWords)
            grams_Tag = generateGramsConsucetive(tagWords, min_gram, max_gram)
            for gram in grams_Tag:
                # Skip grams whose posting lists are already saturated.
                if gram in dic_ngram__txtIds and len(
                        set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Tag = Counter(bi_termsTag)
            bi_terms_len_Tag = len(bi_termsTag)
            tCIds = findTargetClusters(txtBitermsFreqs_Tag,
                                       dic_bitermTag__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                X = generate_sent_vecs_toktextdata([tagWords], wordVectorsDic,
                                                   embedDim)
                text_VecTag = X[0]

        if oCSimilarityFlgas.isTitleSim:
            bi_termsTitle = construct_biterms(titleWords)
            grams_Title = generateGramsConsucetive(titleWords, min_gram,
                                                   max_gram)
            for gram in grams_Title:
                if gram in dic_ngram__txtIds and len(
                        set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Title = Counter(bi_termsTitle)
            bi_terms_len_Title = len(bi_termsTitle)
            tCIds = findTargetClusters(txtBitermsFreqs_Title,
                                       dic_bitermTitle__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                X = generate_sent_vecs_toktextdata([titleWords],
                                                   wordVectorsDic, embedDim)
                text_VecTitle = X[0]

        if oCSimilarityFlgas.isBodySim:
            bi_termsBody = construct_biterms(bodyWords)
            grams_Body = generateGramsConsucetive(bodyWords, min_gram,
                                                  max_gram)
            for gram in grams_Body:
                if gram in dic_ngram__txtIds and len(
                        set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Body = Counter(bi_termsBody)
            bi_terms_len_Body = len(bi_termsBody)
            tCIds = findTargetClusters(txtBitermsFreqs_Body,
                                       dic_bitermBody__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                X = generate_sent_vecs_toktextdata([bodyWords],
                                                   wordVectorsDic, embedDim)
                text_VecBody = X[0]

        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag,
                                         txtBitermsFreqs_Title,
                                         bi_terms_len_Title,
                                         txtBitermsFreqs_Body,
                                         bi_terms_len_Body, text_VecTag,
                                         text_VecTitle, text_VecBody)
        targetClusterIds = set(targetClusterIds)
        clusterId = findCloseClusterByTargetClusters_framework(
            c_CFVector, oCPostProcessed, targetClusterIds, max_c_id,
            oCSimilarityFlgas)

        if ignoreMinusOne:
            if str(trueLabel) != '-1':
                f.write(str(clusterId) + " " + str(trueLabel) + " " +
                        ' '.join(tagWords) + " " + str(soPostId) + "\n")
        else:
            f.write(str(clusterId) + " " + str(trueLabel) + " " +
                    ' '.join(tagWords) + " " + str(soPostId) + "\n")

        eval_pred_true_txt.append([clusterId, trueLabel, tagWords])

        if clusterId not in c_itemsCount:
            c_itemsCount[clusterId] = 0
        c_itemsCount[clusterId] += 1

        max_c_id = max([max_c_id, clusterId, len(c_CFVector)])
        dic_clus__id[clusterId] = max_c_id

        (c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
         dic_bitermBody__clusterIds) = populateClusterFeature_framework(
             c_CFVector, oCPostProcessed, dic_bitermTag__clusterIds,
             dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds,
             clusterId, id, oCSimilarityFlgas)

        del oCPostProcessed
        del oCPost

        line_count += 1
        if line_count % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(
                c_CFVector, c_itemsCount, dic_clus__id)
        if line_count % 1000 == 0:
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    return [
        c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount
    ]
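# generateGramsConsucetive is defined elsewhere; the sketch below shows the
# assumed behavior: all consecutive n-grams with n from min_gram to max_gram,
# joined into strings. The name and joining convention are assumptions.
def generate_consecutive_grams_sketch(words, min_gram, max_gram):
    """Return space-joined consecutive n-grams, n in [min_gram, max_gram]."""
    grams = []
    for n in range(min_gram, max_gram + 1):
        for i in range(len(words) - n + 1):
            grams.append(' '.join(words[i:i + n]))
    return grams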
def cluster_biterm(f,
                   list_pred_true_words_index_postid_createtime,
                   c_bitermsFreqs={},
                   c_totalBiterms={},
                   c_wordsFreqs={},
                   c_totalWords={},
                   c_txtIds={},
                   c_clusterVecs={},
                   txtId_txt={},
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic={},
                   dic_clus__id={},
                   dic_biterm__clusterId_Freq={},
                   dic_biterm__allClusterFreq={},
                   dic_biterm__clusterIds={},
                   c_textItems={},
                   dic_ngram__textItems={},
                   min_gram=1,
                   max_gram=2,
                   isTagSim=True,
                   isTitleSim=False,
                   isBodySim=False):
    # NOTE: this redefinition shadows the earlier cluster_biterm when the
    # module is loaded; only this version is callable by name.
    print("cluster_biterm")
    current_txt_id = last_txtId  # fallback in case the input list is empty
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in list_pred_true_words_index_postid_createtime:
        words = item[2]
        current_txt_id = int(item[3])
        postId = item[4]
        bi_terms = construct_biterms(words)
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        for gram in grams:
            dic_ngram__textItems.setdefault(gram, []).append(item)
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        # Restrict the search to clusters that share at least one biterm with
        # the text instead of scanning all clusters (findCloseCluster).
        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        c_textItems.setdefault(clusterId, []).append(item)
        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
         dic_biterm__clusterIds) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
             dic_biterm__clusterIds)

        # Disabled: removeHighEntropyFtrs(...) pruning after each update.

        eval_pred_true_txt.append([clusterId, item[1], item[2]])

        if ignoreMinusOne == True:
            if str(item[1]) != '-1':
                f.write(str(clusterId) + " " + str(item[1]) + " " +
                        str(' '.join(item[2])) + " " + postId + "\n")
        else:
            f.write(str(clusterId) + " " + str(item[1]) + " " +
                    str(' '.join(item[2])) + " " + postId + "\n")

        if line_count % 500 == 0:
            print(len(dic_clus__id))
            # Delete old and small clusters.
            list_c_sizes = []
            list_c_ids = []
            for c_id, txtIds in c_txtIds.items():
                list_c_sizes.append(len(txtIds))
                list_c_ids.append(dic_clus__id[c_id])

            mean_c_size = 0
            std_c_size = 0
            if len(list_c_sizes) > 2:
                mean_c_size = statistics.mean(list_c_sizes)
                std_c_size = statistics.stdev(list_c_sizes)

            mean_c_id = 0
            std_c_id = 0
            if len(list_c_ids) > 2:
                mean_c_id = statistics.mean(list_c_ids)
                std_c_id = statistics.stdev(list_c_ids)

            print('process', line_count, 'texts', 'mean_c_size', mean_c_size,
                  'std_c_size', std_c_size)
            print('process', line_count, 'texts', 'mean_c_id', mean_c_id,
                  'std_c_id', std_c_id)

            list_del_cids = []
            for c_id, txtIds in c_txtIds.items():
                c_size = len(txtIds)
                if ((c_size <= 1 or float(c_size) <= float(
                        abs(mean_c_size - std_c_size))) or
                    (float(c_size) >= mean_c_size + std_c_size)) or (
                        (float(c_id) <= float(abs(mean_c_id - std_c_id))) or
                        (float(c_id) >= float(abs(mean_c_id + std_c_id)))):
                    list_del_cids.append(c_id)
            list_del_cids = set(list_del_cids)

            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)',
                  len(c_bitermsFreqs))

            for c_id in list_del_cids:
                if c_id in c_bitermsFreqs:
                    del c_bitermsFreqs[c_id]
                if c_id in c_totalBiterms:
                    del c_totalBiterms[c_id]
                if c_id in c_txtIds:
                    del c_txtIds[c_id]
                if c_id in c_wordsFreqs:
                    del c_wordsFreqs[c_id]
                if c_id in c_totalWords:
                    del c_totalWords[c_id]
                if c_id in dic_clus__id:
                    del dic_clus__id[c_id]
                # Guard added: the vector may already be absent.
                if isSemantic == True and c_id in c_clusterVecs:
                    del c_clusterVecs[c_id]

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)
            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems
    ]
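# findTargetClusters is assumed to be an inverted-index lookup: collect the
# ids of every cluster that already contains at least one of the text's
# biterms. A minimal sketch under that assumption (not the actual helper):
def find_target_clusters_sketch(txtBitermsFreqs, dic_biterm__clusterIds):
    """Union of cluster ids indexed under any biterm of the text."""
    target = set()
    for biterm in txtBitermsFreqs:
        target.update(dic_biterm__clusterIds.get(biterm, []))
    return list(target)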
def test_cluster_biterm(testList_pred_true_words_index_postid,
                        c_bitermsFreqs={},
                        c_totalBiterms={},
                        c_wordsFreqs={},
                        c_totalWords={},
                        c_txtIds={},
                        c_clusterVecs={},
                        txtId_txt={},
                        last_txtId=0,
                        max_c_id=0,
                        wordVectorsDic={},
                        dic_clus__id={},
                        dic_biterm__clusterIds={},
                        dicTrain_pred__trues={}):
    print("test cluster_biterm")
    current_txt_id = last_txtId
    line_count = 0

    for item in testList_pred_true_words_index_postid:
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)
        current_txt_id += 1
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        print(targetClusterIds)
        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        if (clusterId in dicTrain_pred__trues
                and testTrue in dicTrain_pred__trues[clusterId]):
            print('found found', 'clusterId', clusterId, 'testTrue', testTrue,
                  words, postId, 'len', len(dicTrain_pred__trues[clusterId]))
        else:
            print('not found', 'clusterId', clusterId, 'testTrue', testTrue,
                  words, postId)

        # At test time the clusters are read-only: the populateClusterFeature
        # update, output writing, periodic cluster deletion (including the
        # per-biterm cleanup of dic_biterm__clusterIds), and Evaluate calls
        # used during training are all disabled here.

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterIds
    ]
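# The buffer-based test below looks up candidate training items in two ways:
# by candidate cluster and by n-gram index. Minimal sketches of the assumed
# helpers (the actual definitions live elsewhere in this file):
def find_text_items_sketch(targetClusterIds, c_textItems):
    """Concatenate the stored items of every candidate cluster."""
    items = []
    for c_id in targetClusterIds:
        items.extend(c_textItems.get(c_id, []))
    return items


def aggregate_text_items_sketch(sortedGrams, dic_ngram__textItems):
    """Concatenate items indexed under each n-gram, longest grams first."""
    items = []
    for gram in sortedGrams:
        items.extend(dic_ngram__textItems.get(gram, []))
    return items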
def test_cluster_bitermMapping_buffer(
        testList_pred_true_words_index_postid_createtime,
        c_bitermsFreqs={},
        c_totalBiterms={},
        c_wordsFreqs={},
        c_totalWords={},
        c_txtIds={},
        c_clusterVecs={},
        txtId_txt={},
        last_txtId=0,
        max_c_id=0,
        wordVectorsDic={},
        dic_clus__id={},
        dic_biterm__clusterIds={},
        c_textItems={},
        dic_ngram__textItems={},
        min_gram=1,
        max_gram=2,
        max_hitindex=10000):
    line_count = 0
    print("testpostId" + "\t" + "trainPostId" + "\t" + "simtype" + "\t" +
          "hitranktype" + "\t" + "Proposed_hit_duration_micro" + "\t" +
          "Proposed_TestTrueLabel" + "\t" + "testText" + "\t" + "trainText" +
          "\t" + "testCreateTime" + "\t" + "TrainCreateTime" + "\t" +
          "DaysDiff")

    for item in testList_pred_true_words_index_postid_createtime:
        t11 = datetime.now()
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        testpostId = item[4]
        # Parse the date part of a timestamp string such as
        # '2015-03-04t12:34:56'.
        testDateTime = datetime.strptime(item[5].split("t")[0], "%Y-%m-%d")
        bi_terms = construct_biterms(words)
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        # Lookup chain: text -> biterms -> target cluster ids -> stored train
        # items; plus an n-gram index lookup as a second candidate source.
        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        trainItems = findTextItems(targetClusterIds, c_textItems)

        grams = generateGramsConsucetive(words, min_gram, max_gram)
        sortedGrams = list(sorted(grams, key=len, reverse=True))
        train_Items = aggregateTextItems(sortedGrams, dic_ngram__textItems)
        trainItems.extend(train_Items)

        pathCount = 0
        flag = False
        for trainItem in trainItems:
            trainTrue = int(trainItem[1])
            train_words = trainItem[2]
            trainPostId = trainItem[4]
            pathCount += 1
            if str(testTrue) == str(trainTrue):
                ProposedHitRank_val = int(
                    max(1, math.floor(pathCount / len(sortedGrams))))
                t12 = datetime.now()
                t_diff = t12 - t11
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(words), Counter(train_words), len(words),
                    len(train_words))
                trainDateTime = datetime.strptime(trainItem[5].split("t")[0],
                                                  "%Y-%m-%d")
                date_diff = trainDateTime - testDateTime
                date_diff = date_diff.days
                print(
                    str(testpostId) + "\t" + str(trainPostId) + "\t" +
                    str(text_sim) + "\t" + str(ProposedHitRank_val) + "\t" +
                    str(t_diff.microseconds / float(microDivide)) + "\t" +
                    str(testTrue) + "\t" + ' '.join(words) + "\t" +
                    ' '.join(train_words) + "\t" + str(trainDateTime) + "\t" +
                    str(testDateTime) + "\t" + str(date_diff))
                flag = True
                break
            if pathCount > max_hitindex:
                break

        # Disabled: a second scan over the n-gram candidates alone
        # (aggregateTextItems) before declaring a miss.
        if flag == False:
            t12 = datetime.now()
            t_diff = t12 - t11
            print(
                str(testpostId) + "\t" + "-100" + "\t0\t" + str(-100) + "\t" +
                str(t_diff.microseconds / float(microDivide)) + "\t" +
                str(testTrue) + "\t" + ' '.join(words) + "\t" + "" + "\t" +
                "" + "\t" + "" + "\t" + "")
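# computeTextSimCommonWord_WordDic is defined elsewhere; the reported
# similarity is assumed to be a common-word overlap score between the two
# texts. A minimal sketch under that assumption (the exact normalization is a
# guess):
def compute_common_word_sim_sketch(freqs1, freqs2, len1, len2):
    """Return (similarity, common word count) for two word-frequency Counters."""
    common = sum((freqs1 & freqs2).values())  # multiset intersection size
    denom = max(len1, len2)
    sim = common / float(denom) if denom > 0 else 0.0
    return sim, common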
def test_cluster_bitermMapping(testList_pred_true_words_index_postid,
                               c_bitermsFreqs={},
                               c_totalBiterms={},
                               c_wordsFreqs={},
                               c_totalWords={},
                               c_txtIds={},
                               c_clusterVecs={},
                               txtId_txt={},
                               last_txtId=0,
                               max_c_id=0,
                               wordVectorsDic={},
                               dic_clus__id={},
                               dic_biterm__clusterIds={},
                               dic_word__clusterIds={},
                               dicTrain_pred__trues={}):
    line_count = 0
    print("testpostId" + "\t" + "trainPostId" +
          "\tTitleSim\tBodySim\tTagSim\tLuceneHitRank\t" + "ProposedHitRank" +
          "\tlucene_hit_duration\t" + "Proposed_hit_duration_micro" + "\t" +
          "Proposed_TestTrueLabel")

    for item in testList_pred_true_words_index_postid:
        t11 = datetime.now()
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)
        print(words, bi_terms, pred)
        current_txt_id = int(postId)
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        # Lookup chain: text -> biterms -> target cluster ids -> txtIds via
        # c_txtIds -> train items via txtId_txt.
        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        print('len(targetClusterIds)', len(targetClusterIds))
        textIds = findTextIds(targetClusterIds, c_txtIds)
        print('len(textIds)', len(textIds))

        pathCount = 0
        flag = False
        for textId in textIds:
            trainItem = txtId_txt[textId]
            trainTrue = int(trainItem[1])
            trainPostId = trainItem[3]
            pathCount += 1
            if str(testTrue) == str(trainTrue):
                t12 = datetime.now()
                t_diff = t12 - t11
                print(
                    str(postId) + "\t" + str(trainPostId) + "\t0\t0\t0\t0\t" +
                    str(len(targetClusterIds)) + "\t0\t" +
                    str(t_diff.microseconds) + "\t" + str(testTrue))
                flag = True
                break
            if pathCount > max_hitindex:
                break

        # Disabled: a word-level fallback via dic_word__clusterIds before
        # declaring a miss.
        if flag == False:
            t12 = datetime.now()
            t_diff = t12 - t11
            print(
                str(postId) + "\t" + "-100" + "\t0\t0\t0\t0\t-100" + "\t0\t" +
                str(t_diff.microseconds) + "\t" + str(testTrue))
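# findTextIds is assumed to flatten the per-cluster text-id lists of the
# candidate clusters; a minimal sketch (the actual helper is defined
# elsewhere):
def find_text_ids_sketch(targetClusterIds, c_txtIds):
    """Collect the txtIds stored for every candidate cluster id."""
    textIds = []
    for c_id in targetClusterIds:
        textIds.extend(c_txtIds.get(c_id, []))
    return textIds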
def trainLoad_cluster_biterm(trainList_pred_true_text_postid,
                             c_bitermsFreqs={},
                             c_totalBiterms={},
                             c_wordsFreqs={},
                             c_totalWords={},
                             c_txtIds={},
                             c_clusterVecs={},
                             txtId_txt={},
                             wordVectorsDic={},
                             dic_clus__id={},
                             dic_biterm__clusterIds={},
                             dic_word__clusterIds={}):
    print("train cluster_biterm")
    dicTrain_pred__trues = {}
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in trainList_pred_true_text_postid:
        pred = item[0]  # predicted cluster id from a previous run
        true = item[1]
        words = item[2].split(' ')
        postId = item[3]
        bi_terms = construct_biterms(words)
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        clusterId = int(pred)
        dicTrain_pred__trues.setdefault(clusterId, []).append(int(true))
        dic_clus__id[clusterId] = clusterId
        current_txt_id = int(postId)
        txtId_txt[current_txt_id] = item

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterIds,
         dic_word__clusterIds) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterIds, dic_word__clusterIds)

        eval_pred_true_txt.append([clusterId, item[1], item[2]])

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)
            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)

    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, dic_clus__id, dic_biterm__clusterIds,
        dic_word__clusterIds, dicTrain_pred__trues
    ]
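# Illustrative load-then-test flow for the two paths above. The variable
# names are hypothetical; the row layouts match the indexing used in this
# file (train rows: [pred, true, text, postId]; test rows:
# [pred, true, words, txtId, postId]).
def run_train_test_sketch(trainRows, testRows):
    state = trainLoad_cluster_biterm(trainRows)
    (c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
     c_clusterVecs, txtId_txt, dic_clus__id, dic_biterm__clusterIds,
     dic_word__clusterIds, dicTrain_pred__trues) = state
    test_cluster_biterm(testRows,
                        c_bitermsFreqs=c_bitermsFreqs,
                        c_totalBiterms=c_totalBiterms,
                        c_wordsFreqs=c_wordsFreqs,
                        c_totalWords=c_totalWords,
                        c_txtIds=c_txtIds,
                        c_clusterVecs=c_clusterVecs,
                        txtId_txt=txtId_txt,
                        dic_clus__id=dic_clus__id,
                        dic_biterm__clusterIds=dic_biterm__clusterIds,
                        dicTrain_pred__trues=dicTrain_pred__trues)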
def test_cluster_bitermMapping_buffer_framework(
        list_CPost_test, c_CFVector, dic_txtId__CPost,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        max_hitindex, oCSimilarityFlgas, wordVectorsDic):
    line_count = 0
    header = ("testpostId" + "\t" + "trainPostId" + "\t" + "similarity" +
              "\t" + "Proposed_hitrank" + "\t" +
              "Proposed_hit_duration_micro" + "\t" + "Proposed_TestTrueLabel" +
              "\t" + "testText" + "\t" + "trainText" + "\t" +
              "testCreateTime" + "\t" + "TrainCreateTime" + "\t" + "DaysDiff" +
              "\t" + "OriginalRank")
    fileWrite = open(outfileName, 'w')
    fileWrite.write(header + "\n")
    print(header)

    for oCPost in list_CPost_test:
        t11 = datetime.now()
        testTrue = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        # oCPost.id is not needed at test time.
        testpostId = oCPost.soPostId
        testCreatetime = oCPost.createtime
        testWords = tagWords  # the field used for reporting; can be changed

        txtBitermsFreqs_Tag = None
        bi_terms_len_Tag = 0
        txtBitermsFreqs_Title = None
        bi_terms_len_Title = 0
        txtBitermsFreqs_Body = None
        bi_terms_len_Body = 0
        text_VecTag = None
        text_VecTitle = None
        text_VecBody = None
        targetClusterIds = []
        grams = []
        line_count += 1

        # Lookup chain: text -> biterms -> target cluster ids -> stored train
        # posts; plus an n-gram index lookup as a second candidate source.
        if oCSimilarityFlgas.isTagSim:
            bi_termsTag = construct_biterms(tagWords)
            grams.extend(generateGramsConsucetive(tagWords, min_gram,
                                                  max_gram))
            txtBitermsFreqs_Tag = Counter(bi_termsTag)
            bi_terms_len_Tag = len(bi_termsTag)
            targetClusterIds.extend(
                findTargetClusters(txtBitermsFreqs_Tag,
                                   dic_bitermTag__clusterIds))
            if isSemantic:
                X = generate_sent_vecs_toktextdata([tagWords], wordVectorsDic,
                                                   embedDim)
                text_VecTag = X[0]

        if oCSimilarityFlgas.isTitleSim:
            bi_termsTitle = construct_biterms(titleWords)
            grams.extend(
                generateGramsConsucetive(titleWords, min_gram, max_gram))
            txtBitermsFreqs_Title = Counter(bi_termsTitle)
            bi_terms_len_Title = len(bi_termsTitle)
            targetClusterIds.extend(
                findTargetClusters(txtBitermsFreqs_Title,
                                   dic_bitermTitle__clusterIds))
            if isSemantic:
                X = generate_sent_vecs_toktextdata([titleWords],
                                                   wordVectorsDic, embedDim)
                text_VecTitle = X[0]

        if oCSimilarityFlgas.isBodySim:
            bi_termsBody = construct_biterms(bodyWords)
            grams.extend(
                generateGramsConsucetive(bodyWords, min_gram, max_gram))
            txtBitermsFreqs_Body = Counter(bi_termsBody)
            bi_terms_len_Body = len(bi_termsBody)
            targetClusterIds.extend(
                findTargetClusters(txtBitermsFreqs_Body,
                                   dic_bitermBody__clusterIds))
            if isSemantic:
                X = generate_sent_vecs_toktextdata([bodyWords],
                                                   wordVectorsDic, embedDim)
                text_VecBody = X[0]

        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag,
                                         txtBitermsFreqs_Title,
                                         bi_terms_len_Title,
                                         txtBitermsFreqs_Body,
                                         bi_terms_len_Body, text_VecTag,
                                         text_VecTitle, text_VecBody)
        targetClusterIds = set(targetClusterIds)
        closeClusterIds = findCloseClustersIds_framework(
            oCPostProcessed, targetClusterIds, c_CFVector, oCSimilarityFlgas)
        train_cluster_CPosts = findTextItems_framework(closeClusterIds,
                                                       c_CFVector,
                                                       dic_txtId__CPost)
        # Disabled: train_cluster_CPosts = filterTextItems_framework(
        #     train_cluster_CPosts, oCSimilarityFlgas, oCPostProcessed)

        sortedGrams = list(sorted(grams, key=len, reverse=True))
        train_gram_CPosts = aggregateTextItems_framework(
            sortedGrams, dic_ngram__txtIds, dic_txtId__CPost)
        train_gram_CPosts.extend(train_cluster_CPosts)

        pathCount = 0
        flag = False
        for trainCPost in train_gram_CPosts:
            trainTrue = int(str(trainCPost.trueLabel))
            train_words = trainCPost.tagWords  # can be changed
            trainPostId = trainCPost.soPostId
            trainCreateTime = trainCPost.createtime
            pathCount += 1
            if str(testTrue) == str(trainTrue):
                ProposedHitRank_val = int(
                    max(1, math.floor(pathCount / len(sortedGrams))))
                t12 = datetime.now()
                t_diff = t12 - t11
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(testWords), Counter(train_words), len(testWords),
                    len(train_words))
                date_diff = trainCreateTime - testCreatetime
                date_diff = date_diff.days
                # Columns match the header written above.
                outLine = (str(testpostId) + "\t" + str(trainPostId) + "\t" +
                           str(text_sim) + "\t" + str(ProposedHitRank_val) +
                           "\t" +
                           str(t_diff.microseconds / float(microDivide)) +
                           "\t" + str(testTrue) + "\t" + ' '.join(testWords) +
                           "\t" + ' '.join(train_words) + "\t" +
                           str(testCreatetime) + "\t" + str(trainCreateTime) +
                           "\t" + str(date_diff) + "\t" + str(pathCount))
                print(outLine)
                fileWrite.write(outLine + "\n")
                flag = True
                break
            if pathCount > max_hitindex:
                break

        if not flag:
            t12 = datetime.now()
            t_diff = t12 - t11
            outLine = (str(testpostId) + "\t" + "-100" + "\t0\t" + str(-100) +
                       "\t" + str(t_diff.microseconds / float(microDivide)) +
                       "\t" + str(testTrue) + "\t" + ' '.join(testWords) +
                       "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" +
                       "")
            print(outLine)
            fileWrite.write(outLine + "\n")

    fileWrite.close()
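# The CPost / CPostProcessed containers used throughout are defined elsewhere.
# Minimal sketches of the assumed field layouts, inferred from the attribute
# accesses and the 9-argument construction order above; these are
# illustrations, not the actual classes:
class CPostSketch:
    def __init__(self, id, soPostId, trueLabel, tagWords, titleWords,
                 bodyWords, createtime):
        self.id = id                  # internal text id
        self.soPostId = soPostId      # Stack Overflow post id
        self.trueLabel = trueLabel    # gold cluster/duplicate label
        self.tagWords = tagWords
        self.titleWords = titleWords
        self.bodyWords = bodyWords
        self.createtime = createtime  # datetime used for DaysDiff


class CPostProcessedSketch:
    def __init__(self, txtBitermsFreqs_Tag, bi_terms_len_Tag,
                 txtBitermsFreqs_Title, bi_terms_len_Title,
                 txtBitermsFreqs_Body, bi_terms_len_Body, text_VecTag,
                 text_VecTitle, text_VecBody):
        # Per-field biterm Counters, biterm counts, and optional sentence
        # vectors, matching the construction order used above.
        self.txtBitermsFreqs_Tag = txtBitermsFreqs_Tag
        self.bi_terms_len_Tag = bi_terms_len_Tag
        self.txtBitermsFreqs_Title = txtBitermsFreqs_Title
        self.bi_terms_len_Title = bi_terms_len_Title
        self.txtBitermsFreqs_Body = txtBitermsFreqs_Body
        self.bi_terms_len_Body = bi_terms_len_Body
        self.text_VecTag = text_VecTag
        self.text_VecTitle = text_VecTitle
        self.text_VecBody = text_VecBody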