def findCloseCluster_GramKey_Semantic(keys_list, word_arr, minMatch, wordVectorsDic, euclidean=True):
    # Note: the 'euclidean' flag is currently unused; cosine similarity is always applied.
    closeKey_Semantic = None
    sent_vec = generate_sent_vecs_toktextdata([word_arr], wordVectorsDic, 300)[0]
    max_sim = 0
    for key in keys_list:
        key_words = key.split(' ')
        common = set(key_words).intersection(set(word_arr))
        key_vec = generate_sent_vecs_toktextdata([key_words], wordVectorsDic, 300)[0]
        eu_dist = cosine(sent_vec, key_vec)  # cosine() returns a distance
        sim = 1 - eu_dist
        if len(common) >= minMatch and max_sim < sim:
            max_sim = sim
            closeKey_Semantic = key
    return [closeKey_Semantic, max_sim]
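# Minimal usage sketch (hypothetical data; assumes wordVectorsDic maps tokens to
# 300-d vectors and generate_sent_vecs_toktextdata averages them, as elsewhere
# in this module):
#
#   keys = ['python pandas', 'java spring']
#   key, sim = findCloseCluster_GramKey_Semantic(
#       keys, ['pandas', 'dataframe', 'merge'], minMatch=1,
#       wordVectorsDic=wordVectorsDic, euclidean=False)
#   # key -> 'python pandas' when at least one token overlaps and the averaged
#   # embeddings are the most cosine-similar pair; otherwise key is None.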
def buildNGramIndex(list_pred_true_words_index_postid_createtime):
    for item in list_pred_true_words_index_postid_createtime:
        words = item[2]
        txtId = item[3]
        text_Vec = None
        if isSemantic == True:
            if txtId in dic_txtId__vec:
                text_Vec = dic_txtId__vec[txtId]
            else:
                X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
                text_Vec = X[0]
                dic_txtId__vec[txtId] = text_Vec
        dic_txtId__text[txtId] = item
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        for gram in grams:
            dic_ngram__txtIds.setdefault(gram, []).append(txtId)
            if isSemantic == True:
                if gram in dic_ngram__center:
                    # element-wise running sum; requires 'from operator import add'
                    dic_ngram__center[gram] = list(map(add, dic_ngram__center[gram], text_Vec))
                else:
                    dic_ngram__center[gram] = text_Vec
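# Illustrative sketch of the consecutive-n-gram indexing idea (this is not the
# module's generateGramsConsucetive itself, whose exact output format is assumed
# here to be space-joined token windows):
from collections import defaultdict

def consecutive_grams(words, min_gram, max_gram):
    grams = []
    for n in range(min_gram, max_gram + 1):
        for i in range(len(words) - n + 1):
            grams.append(' '.join(words[i:i + n]))
    return grams

index = defaultdict(list)
for txt_id, words in [(1, ['python', 'pandas']), (2, ['pandas', 'merge'])]:
    for gram in consecutive_grams(words, 1, 2):
        index[gram].append(txt_id)
# index['pandas'] -> [1, 2]; index['python pandas'] -> [1]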
def populateClusterReps(all_global, wordVectorsDic, embedDim):
    dic_cluster_rep_words = {}
    dic_cluster_rep_vec = {}
    dic_tupple_class = groupItemsBySingleKeyIndex(all_global, 0)
    for predKey, items in dic_tupple_class.items():
        clus_words = []
        # Could also filter words by entropy across cluster distributions.
        for item in items:
            clus_words.extend(item[2])
        dic_word_counts = Counter(clus_words)
        wordCounts = dic_word_counts.values()
        mean = 0
        if len(wordCounts) >= 1:
            mean = statistics.mean(wordCounts)
        std = mean
        if len(wordCounts) >= 2:
            std = statistics.stdev(wordCounts)
        # Keep words whose count exceeds mean + std; fall back to count > 1
        # when that leaves too few representative words.
        dic_word_counts_filtered = {
            key: counts for key, counts in dic_word_counts.items() if counts > mean + std
        }
        if len(dic_word_counts_filtered) <= 2:
            dic_word_counts_filtered = {
                key: counts for key, counts in dic_word_counts.items() if counts > 1
            }
        clus_words = list(dic_word_counts_filtered.keys())
        clus_word_counts = list(dic_word_counts_filtered.values())
        cent_Vec_words = generate_sent_vecs_toktextdata([clus_words], wordVectorsDic, embedDim)[0]
        dic_cluster_rep_words[predKey] = [dic_word_counts_filtered, sum(clus_word_counts)]
        dic_cluster_rep_vec[predKey] = cent_Vec_words
    return [dic_cluster_rep_words, dic_cluster_rep_vec]
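# Standalone sketch of the mean + stdev word filter used above (stdlib only,
# toy counts):
import statistics
from collections import Counter

counts = Counter(['nlp', 'nlp', 'nlp', 'cluster', 'cluster', 'the'])
mean = statistics.mean(counts.values())
std = statistics.stdev(counts.values())
kept = {w: c for w, c in counts.items() if c > mean + std}
if len(kept) <= 2:  # fallback, as in populateClusterReps
    kept = {w: c for w, c in counts.items() if c > 1}
print(kept)  # representative words with above-threshold frequency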
def populateClusterVecs(dic_nonCommon__txtIds_Clust, dic_txtId__text):
    dic_clusteVecs = {}
    for gramKey, txtIds in dic_nonCommon__txtIds_Clust.items():
        data = []
        for txtId in txtIds:
            words = dic_txtId__text[txtId][2]
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
            data.append(X[0])
        # column-wise mean of the member text vectors = cluster centroid
        avg = [sum(col) / float(len(col)) for col in zip(*data)]
        dic_clusteVecs[gramKey] = avg
    return dic_clusteVecs
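# The centroid trick above in isolation: zip(*data) transposes the list of
# vectors so each 'col' holds one dimension across all texts.
data = [[1.0, 2.0], [3.0, 4.0]]
centroid = [sum(col) / float(len(col)) for col in zip(*data)]
print(centroid)  # [2.0, 3.0]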
def clusterByWordEmbeddingIntelligent(list_pred_true_text_ind_prevind, wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text_ind_prevind)
    dic_itemGroups = groupItemsBySingleKeyIndex(list_pred_true_text_ind_prevind, 0)
    pred_clusters = int(len(dic_itemGroups) / 1.0)  # needs to be determined carefully
    dic_group_sizes = [len(dic_itemGroups[x]) for x in dic_itemGroups
                       if isinstance(dic_itemGroups[x], list)]
    print(dic_group_sizes)
    print("#clusters=" + str(pred_clusters))
    nparr = np.array(list_pred_true_text_ind_prevind)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    word_arr = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    X = generate_sent_vecs_toktextdata(word_arr, wordVectorsDic, 300)
    # LSA projection: truncated SVD followed by L2 normalization.
    svd = TruncatedSVD(50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(ward.labels_, trues, word_arr)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)
    clustering = SpectralClustering(n_clusters=pred_clusters, assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(clustering.labels_, trues, word_arr)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)
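# Self-contained sketch of the LSA + clustering pipeline used above, on random
# data (scikit-learn only; the sizes and cluster count are illustrative):
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering

X_demo = np.random.rand(100, 300)
lsa_demo = make_pipeline(TruncatedSVD(50), Normalizer(copy=False))
X_lsa = lsa_demo.fit_transform(X_demo)
labels = AgglomerativeClustering(n_clusters=3, linkage='ward').fit(X_lsa).labels_
print(labels[:10])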
def assignToClusterSimDistribution(not_clustered_inds_batch, dic_bitri_keys_selectedClusters_seenBatch,
                                   seen_list_pred_true_words_index, wordVectorsDic):
    new_not_clustered_inds_batch = []
    # Build per-cluster summaries (MStream-style): member items, word counts,
    # raw texts, and a summed embedding vector.
    dic_ClusterGroupsDetail = {}
    dic_ClusterWords = {}
    dic_ClusterTextWords = {}
    dic_ClusterVecs = {}
    for key, txtInds in dic_bitri_keys_selectedClusters_seenBatch.items():
        list_pred_true_words_index = []
        cluster_words = []
        txtWords = []
        vec = np.zeros(shape=[300])
        for txtInd in txtInds:
            pred = seen_list_pred_true_words_index[txtInd][0]
            true = seen_list_pred_true_words_index[txtInd][1]
            words = seen_list_pred_true_words_index[txtInd][2]
            index = seen_list_pred_true_words_index[txtInd][3]
            list_pred_true_words_index.append([pred, true, words, index])
            cluster_words.extend(words)
            txtWords.append(words)
            sent_vec = np.asarray(generate_sent_vecs_toktextdata([words], wordVectorsDic, 300)[0])
            vec = np.add(vec, sent_vec)
        dic_ClusterGroupsDetail[key] = list_pred_true_words_index
        dic_ClusterWords[key] = [Counter(cluster_words), len(cluster_words)]
        dic_ClusterTextWords[key] = txtWords
        dic_ClusterVecs[key] = vec  # summed; could be averaged with np.true_divide(vec, len(txtInds) + 1)

    # Assign each unclustered item to the cluster that wins on both lexical
    # overlap and semantic similarity; otherwise leave it unassigned.
    keys_list = list(dic_bitri_keys_selectedClusters_seenBatch.keys())
    for item in not_clustered_inds_batch:
        word_arr = item[2]
        global_index = item[3]
        true = item[1]
        dic_lex_Sim_CommonWords, maxPredLabel_lex, maxSim_lex, maxCommon_lex, minSim_lex = \
            commonWordSims_clusterGroup(word_arr, dic_ClusterWords)
        text_Vec = generate_sent_vecs_toktextdata([word_arr], wordVectorsDic, 300)[0]
        dic_semanticSims, maxPredLabel_Semantic, maxSim_Semantic, minSim_semantic = \
            semanticSims(text_Vec, dic_ClusterVecs)
        if maxCommon_lex > 0 and str(maxPredLabel_lex) == str(maxPredLabel_Semantic):
            new_pred = str(maxPredLabel_lex)
            new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])
        # Earlier variants (spawning new clusters, gram-key matching via
        # findCloseCluster_GramKey_lexical/_Semantic, similarity thresholds)
        # were tried here and left commented out in the original.
    return new_not_clustered_inds_batch
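# The acceptance rule above in miniature: an item is assigned only when the
# lexically closest and the semantically closest cluster agree (toy values):
lex_best, lex_common = 'c1', 3   # cluster sharing the most words
sem_best = 'c1'                  # cluster with the highest cosine similarity
assigned = lex_best if (lex_common > 0 and lex_best == sem_best) else None
print(assigned)  # 'c1'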
def findDuplicateBySemantic(test_item):
    t11 = datetime.now()
    testTruelabel = test_item[1]
    test_words = test_item[2]
    testpostId = test_item[4]
    testCreateTime = test_item[5]
    testDateTime = datetime.strptime(test_item[5].split("T")[0], "%Y-%m-%d")
    test_text_Vec = generate_sent_vecs_toktextdata([test_words], wordVectorsDic, embedDim)[0]

    # Cosine similarity between the test vector and every n-gram center.
    dic_gram__sim = {}
    for gram, center_Vec in dic_ngram__center.items():
        dic_gram__sim[gram] = 1 - spatial.distance.cosine(center_Vec, test_text_Vec)
    list_sim = list(dic_gram__sim.values())
    sim_stdev = statistics.stdev(list_sim)
    sim_mean = statistics.mean(list_sim)

    # Candidate texts come from grams whose similarity is one stdev above the mean.
    all_textIds = []
    for gram in dic_ngram__center:
        if dic_gram__sim[gram] >= sim_mean + sim_stdev:
            all_textIds.extend(dic_ngram__txtIds[gram])
    all_textIds = set(all_textIds)

    # 'sortedGrams' was referenced but never defined in the original; it is
    # rebuilt here the same way the sibling functions build it (assumption).
    sortedGrams = list(sorted(generateGramsConsucetive(test_words, min_gram, max_gram),
                              key=len, reverse=True))

    ProposedHitRank = 0
    print('sem-all_textIds', len(all_textIds), 'test_words', test_words)
    for txtId in all_textIds:
        ProposedHitRank += 1
        if ProposedHitRank > max_hitindex:
            break
        train_item = dic_txtId__text[txtId]
        trainTruelabel = train_item[1]
        train_words = train_item[2]
        trainPostId = train_item[4]
        trainCreateTime = train_item[5]
        if str(trainTruelabel) == str(testTruelabel):
            t12 = datetime.now()
            t_diff = t12 - t11
            text_sim, commonCount = computeTextSimCommonWord_WordDic(
                Counter(test_words), Counter(train_words), len(test_words), len(train_words))
            ProposedHitRank_val = int(max(1, math.floor(ProposedHitRank / len(sortedGrams))))
            trainDateTime = datetime.strptime(train_item[5].split("T")[0], "%Y-%m-%d")
            date_diff = (trainDateTime - testDateTime).days
            print(str(testpostId) + "\t" + str(trainPostId) + "\t" + str(text_sim) + "\t" +
                  str(ProposedHitRank_val) + "\t" + str(t_diff.microseconds) + "\t" +
                  str(testTruelabel) + "\t" + ' '.join(test_words) + "\t" + ' '.join(train_words) +
                  "\t" + testCreateTime + "\t" + trainCreateTime + "\t" + str(date_diff))
            return True
    return False
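# The gram-center screening step in isolation (scipy + statistics), with toy
# 2-d vectors standing in for embedding centers:
import statistics
from scipy import spatial

centers = {'python pandas': [1.0, 0.0], 'java spring': [0.0, 1.0],
           'go http': [0.1, 1.0], 'rust cargo': [0.05, 1.0]}
query = [1.0, 0.1]
sims = {g: 1 - spatial.distance.cosine(v, query) for g, v in centers.items()}
mu, sd = statistics.mean(sims.values()), statistics.stdev(sims.values())
candidates = [g for g, s in sims.items() if s >= mu + sd]
print(candidates)  # ['python pandas']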
def cluster_biterm(f, list_pred_true_words_index_postid_createtime, c_bitermsFreqs={}, c_totalBiterms={},
                   c_wordsFreqs={}, c_totalWords={}, c_txtIds={}, c_clusterVecs={}, txtId_txt={},
                   last_txtId=0, max_c_id=0, wordVectorsDic={}, dic_clus__id={},
                   dic_biterm__clusterId_Freq={}, dic_biterm__allClusterFreq={}, dic_biterm__clusterIds={},
                   c_textItems={}, dic_ngram__textItems={}, min_gram=1, max_gram=2,
                   isTagSim=True, isTitleSim=False, isBodySim=False):
    print("cluster_bigram")
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()
    for item in list_pred_true_words_index_postid_createtime:
        words = item[2]
        current_txt_id = int(item[3])
        postId = item[4]
        bi_terms = construct_biterms(words)
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        for gram in grams:
            dic_ngram__textItems.setdefault(gram, []).append(item)
        line_count += 1
        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
            text_Vec = X[0]
        targetClusterIds = findTargetClusters(txtBitermsFreqs, dic_biterm__clusterIds)
        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs,
            txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)
        c_textItems.setdefault(clusterId, []).append(item)
        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words
        c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, \
            dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds = \
            populateClusterFeature(
                c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs,
                txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, clusterId, current_txt_id,
                text_Vec, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds)
        eval_pred_true_txt.append([clusterId, item[1], item[2]])
        if ignoreMinusOne == True:
            if str(item[1]) != '-1':
                f.write(str(clusterId) + " " + str(item[1]) + " " + str(' '.join(item[2])) + " " +
                        postId + "\n")
        else:
            f.write(str(clusterId) + " " + str(item[1]) + " " + str(' '.join(item[2])) + " " +
                    postId + "\n")
        if line_count % 500 == 0:
            print(len(dic_clus__id))
            # Delete old, undersized, and oversized clusters based on the size
            # and cluster-id distributions.
            list_c_sizes = []
            list_c_ids = []
            for c_id, txtIds in c_txtIds.items():
                list_c_sizes.append(len(txtIds))
                list_c_ids.append(dic_clus__id[c_id])
            mean_c_size = 0
            std_c_size = 0
            if len(list_c_sizes) > 2:
                mean_c_size = statistics.mean(list_c_sizes)
                std_c_size = statistics.stdev(list_c_sizes)
            mean_c_id = 0
            std_c_id = 0
            if len(list_c_ids) > 2:
                mean_c_id = statistics.mean(list_c_ids)
                std_c_id = statistics.stdev(list_c_ids)
            print('process', line_count, 'texts', 'mean_c_size', mean_c_size, 'std_c_size', std_c_size)
            print('process', line_count, 'texts', 'mean_c_id', mean_c_id, 'std_c_id', std_c_id)
            list_del_cids = []
            for c_id, txtIds in c_txtIds.items():
                c_size = len(txtIds)
                if ((c_size <= 1 or float(c_size) <= float(abs(mean_c_size - std_c_size))) or
                        (float(c_size) >= mean_c_size + std_c_size)) or \
                        ((float(c_id) <= float(abs(mean_c_id - std_c_id))) or
                         (float(c_id) >= float(abs(mean_c_id + std_c_id)))):
                    list_del_cids.append(c_id)
            list_del_cids = set(list_del_cids)
            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)', len(c_bitermsFreqs))
            for c_id in list_del_cids:
                if c_id in c_bitermsFreqs:
                    del c_bitermsFreqs[c_id]
                if c_id in c_totalBiterms:
                    del c_totalBiterms[c_id]
                if c_id in c_txtIds:
                    del c_txtIds[c_id]
                if c_id in c_wordsFreqs:
                    del c_wordsFreqs[c_id]
                if c_id in c_totalWords:
                    del c_totalWords[c_id]
                if c_id in dic_clus__id:
                    del dic_clus__id[c_id]
                if isSemantic == True:
                    del c_clusterVecs[c_id]
        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt', len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)
            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)
        last_txtId = current_txt_id
    return [c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs,
            txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
            dic_biterm__clusterIds, c_textItems, dic_ngram__textItems]
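# The periodic pruning criterion in isolation: clusters whose size falls
# outside one stdev of the size distribution are dropped (toy numbers):
import statistics

c_sizes = {1: 40, 2: 3, 3: 38, 4: 1, 5: 41}
mean_s = statistics.mean(c_sizes.values())
std_s = statistics.stdev(c_sizes.values())
to_delete = [cid for cid, size in c_sizes.items()
             if size <= 1 or size <= abs(mean_s - std_s) or size >= mean_s + std_s]
print(to_delete)  # [2, 4]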
def cluster_biterm_framework(f, list_CPost, c_CFVector, max_c_id, dic_txtId__CPost, wordVectorsDic,
                             dic_clus__id, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
                             dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
                             oCSimilarityFlgas, c_itemsCount):
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()
    for oCPost in list_CPost:
        trueLabel = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        id = oCPost.id
        soPostId = oCPost.soPostId
        createtime = oCPost.createtime
        print('id', id, 'tagWords', tagWords, 'titleWords', titleWords, 'bodyWords', bodyWords)
        txtBitermsFreqs_Tag = None
        bi_terms_len_Tag = 0
        txtBitermsFreqs_Title = None
        bi_terms_len_Title = 0
        txtBitermsFreqs_Body = None
        bi_terms_len_Body = 0
        text_VecTag = None
        text_VecTitle = None
        text_VecBody = None
        targetClusterIds = []
        dic_txtId__CPost[id] = oCPost
        if oCSimilarityFlgas.isTagSim:
            bi_termsTag = construct_biterms(tagWords)
            grams_Tag = generateGramsConsucetive(tagWords, min_gram, max_gram)
            for gram in grams_Tag:
                # Skip grams whose posting list is already saturated.
                if gram in dic_ngram__txtIds and len(set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Tag = Counter(bi_termsTag)
            bi_terms_len_Tag = len(bi_termsTag)
            tCIds = findTargetClusters(txtBitermsFreqs_Tag, dic_bitermTag__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                text_VecTag = generate_sent_vecs_toktextdata([tagWords], wordVectorsDic, embedDim)[0]
        if oCSimilarityFlgas.isTitleSim:
            bi_termsTitle = construct_biterms(titleWords)
            grams_Title = generateGramsConsucetive(titleWords, min_gram, max_gram)
            for gram in grams_Title:
                if gram in dic_ngram__txtIds and len(set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Title = Counter(bi_termsTitle)
            bi_terms_len_Title = len(bi_termsTitle)
            tCIds = findTargetClusters(txtBitermsFreqs_Title, dic_bitermTitle__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                text_VecTitle = generate_sent_vecs_toktextdata([titleWords], wordVectorsDic, embedDim)[0]
        if oCSimilarityFlgas.isBodySim:
            bi_termsBody = construct_biterms(bodyWords)
            grams_Body = generateGramsConsucetive(bodyWords, min_gram, max_gram)
            for gram in grams_Body:
                if gram in dic_ngram__txtIds and len(set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Body = Counter(bi_termsBody)
            bi_terms_len_Body = len(bi_termsBody)
            tCIds = findTargetClusters(txtBitermsFreqs_Body, dic_bitermBody__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                text_VecBody = generate_sent_vecs_toktextdata([bodyWords], wordVectorsDic, embedDim)[0]
        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag, txtBitermsFreqs_Title,
                                         bi_terms_len_Title, txtBitermsFreqs_Body, bi_terms_len_Body,
                                         text_VecTag, text_VecTitle, text_VecBody)
        targetClusterIds = set(targetClusterIds)
        clusterId = findCloseClusterByTargetClusters_framework(c_CFVector, oCPostProcessed,
                                                               targetClusterIds, max_c_id,
                                                               oCSimilarityFlgas)
        if ignoreMinusOne:
            if str(trueLabel) != '-1':
                f.write(str(clusterId) + " " + str(trueLabel) + " " + ' '.join(tagWords) + " " +
                        str(soPostId) + "\n")
        else:
            f.write(str(clusterId) + " " + str(trueLabel) + " " + ' '.join(tagWords) + " " +
                    str(soPostId) + "\n")
        eval_pred_true_txt.append([clusterId, trueLabel, tagWords])
        if clusterId not in c_itemsCount:
            c_itemsCount[clusterId] = 0
        c_itemsCount[clusterId] += 1
        max_c_id = max([max_c_id, clusterId, len(c_CFVector)])
        dic_clus__id[clusterId] = max_c_id
        c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds = \
            populateClusterFeature_framework(
                c_CFVector, oCPostProcessed, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
                dic_bitermBody__clusterIds, clusterId, id, oCSimilarityFlgas)
        del oCPostProcessed
        del oCPost
        line_count += 1
        if line_count % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(c_CFVector, c_itemsCount, dic_clus__id)
        if line_count % 1000 == 0:
            Evaluate(eval_pred_true_txt, ignoreMinusOne)
    return [c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id, dic_bitermTag__clusterIds,
            dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount]
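# Hypothetical minimal stand-in for the similarity-flags object this framework
# expects (field names inferred from usage above; the real class lives
# elsewhere in the repo):
from dataclasses import dataclass

@dataclass
class CSimilarityFlags:
    isTagSim: bool = True
    isTitleSim: bool = False
    isBodySim: bool = False

oCSimilarityFlgas = CSimilarityFlags(isTagSim=True)  # spelling kept to match call sites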
def trainLoad_cluster_biterm(trainList_pred_true_text_postid, c_bitermsFreqs={}, c_totalBiterms={},
                             c_wordsFreqs={}, c_totalWords={}, c_txtIds={}, c_clusterVecs={},
                             txtId_txt={}, wordVectorsDic={}, dic_clus__id={},
                             dic_biterm__clusterIds={}, dic_word__clusterIds={}):
    print("train cluster_bigram")
    dicTrain_pred__trues = {}
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()
    for item in trainList_pred_true_text_postid:
        pred = item[0]  # predicted cluster id
        true = item[1]
        words = item[2].split(' ')
        postId = item[3]
        bi_terms = construct_biterms(words)
        line_count += 1
        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
            text_Vec = X[0]
        clusterId = int(pred)
        dicTrain_pred__trues.setdefault(clusterId, []).append(int(true))
        dic_clus__id[clusterId] = clusterId
        current_txt_id = int(postId)
        txtId_txt[current_txt_id] = item
        c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, \
            dic_biterm__clusterIds, dic_word__clusterIds = populateClusterFeature(
                c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs,
                txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, clusterId, current_txt_id,
                text_Vec, dic_biterm__clusterIds, dic_word__clusterIds)
        eval_pred_true_txt.append([clusterId, item[1], item[2]])
        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt', len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)
            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)
    return [c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs,
            txtId_txt, dic_clus__id, dic_biterm__clusterIds, dic_word__clusterIds, dicTrain_pred__trues]
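# Sketch of what biterm construction means (the repo's construct_biterms is
# assumed to pair up co-occurring tokens roughly like this):
from itertools import combinations

def toy_biterms(words):
    return [' '.join(sorted(pair)) for pair in combinations(set(words), 2)]

print(toy_biterms(['python', 'pandas', 'merge']))
# e.g. ['merge pandas', 'merge python', 'pandas python']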
def clusterByWordEmbeddingFeature(list_pred_true_text, wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text)
    dic_tupple_class = groupTxtByClass(list_pred_true_text, False)
    pred_clusters = len(dic_tupple_class)
    print("#clusters=" + str(pred_clusters))
    preds, trues, texts = split_pred_true_txt_from_list(list_pred_true_text)
    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)
    dicDocFreq = getDocFreq(texts)
    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)
    # A doc-frequency-weighted variant was tried and found not to help.
    svd = TruncatedSVD(100)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    km = KMeans(n_clusters=pred_clusters, init='k-means++', max_iter=100, random_state=0)
    km.fit(X)
    list_km_pred_true_text = combine_pred_true_txt_from_list(km.labels_, trues, texts)
    print("k-means")
    printClusterEvaluation_list(list_km_pred_true_text)
    ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(ward.labels_, trues, texts)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)
    clustering = SpectralClustering(n_clusters=pred_clusters, assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(clustering.labels_, trues, texts)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)
    brc = Birch(branching_factor=50, n_clusters=pred_clusters, threshold=0.5, compute_labels=True)
    brc.fit_predict(X)
    list_brc_pred_true_text = combine_pred_true_txt_from_list(brc.labels_, trues, texts)
    print("brc")
    printClusterEvaluation_list(list_brc_pred_true_text)
    gmm = GaussianMixture(n_components=pred_clusters, covariance_type='full')
    gmm_labels = gmm.fit_predict(X)
    list_gmm_pred_true_text = combine_pred_true_txt_from_list(gmm_labels, trues, texts)
    print("gmm")
    printClusterEvaluation_list(list_gmm_pred_true_text)
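# The same battery of scikit-learn clusterers on toy data, showing the shared
# fit/labels pattern used above (illustrative sizes only):
import numpy as np
from sklearn.cluster import KMeans, Birch
from sklearn.mixture import GaussianMixture

X_toy = np.random.rand(60, 20)
km_labels = KMeans(n_clusters=3, init='k-means++', max_iter=100, random_state=0).fit(X_toy).labels_
brc_labels = Birch(branching_factor=50, n_clusters=3, threshold=0.5).fit_predict(X_toy)
gmm_labels = GaussianMixture(n_components=3, covariance_type='full').fit_predict(X_toy)
print(km_labels[:10], brc_labels[:10], gmm_labels[:10])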
# Fragment: builds sentence vectors from GloVe and flags outliers with
# IsolationForest. 'lines', 'train_data', 'train_labels', and
# 'train_trueLabels' are assumed to be defined earlier in the script.
train_textdata = []
for line in lines:
    line = line.lower().strip()
    arr = re.split("\t", line)
    train_data.append(arr[2])
    train_textdata.append(word_tokenize(arr[2]))
    train_labels.append(arr[0])
    train_trueLabels.append(arr[1])

# A TF-IDF representation was tried here and left commented out in the original.
gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"
termsVectorsDic = extract_word_vecs(train_textdata, gloveFile, 300)
x_train = generate_sent_vecs_toktextdata(train_textdata, termsVectorsDic, 300)

contratio = 0.1  # expected outlier fraction; data-size-dependent variants were tried and removed
isf = IsolationForest(n_estimators=100, max_samples='auto', contamination=contratio,
                      max_features=1.0, bootstrap=True, verbose=0, random_state=0)
outlierPreds = isf.fit(x_train).predict(x_train)
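# IsolationForest in isolation: predict() returns 1 for inliers, -1 for
# outliers (synthetic data with 5 planted outliers):
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X_if = np.vstack([rng.normal(0, 1, (95, 2)), rng.normal(8, 1, (5, 2))])
preds = IsolationForest(n_estimators=100, contamination=0.05, random_state=0).fit(X_if).predict(X_if)
print((preds == -1).sum(), "points flagged as outliers")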
def cluster_biterm(f, list_pred_true_words_index, c_bitermsFreqs={}, c_totalBiterms={}, c_wordsFreqs={},
                   c_totalWords={}, c_txtIds={}, c_clusterVecs={}, txtId_txt={}, last_txtId=0, max_c_id=0,
                   wordVectorsDic={}, dic_clus__id={}, dic_biterm__clusterId_Freq={},
                   dic_biterm__allClusterFreq={}, dic_biterm__clusterIds={}):
    print("cluster_bigram")
    current_txt_id = last_txtId
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()
    for item in list_pred_true_words_index:
        words = item[2]
        bi_terms = construct_biterms(words)
        current_txt_id += 1
        line_count += 1
        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
            text_Vec = X[0]
        targetClusterIds = findTargetClusters(txtBitermsFreqs, dic_biterm__clusterIds)
        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs,
            txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)
        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words
        c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, \
            dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds = \
            populateClusterFeature(
                c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs,
                txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, clusterId, current_txt_id,
                text_Vec, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds)
        eval_pred_true_txt.append([clusterId, item[1], item[2]])
        if ignoreMinusOne == True:
            if str(item[1]) != '-1':
                f.write(str(clusterId) + " " + str(item[1]) + " " + str(item[2]) + "\n")
        else:
            f.write(str(clusterId) + " " + str(item[1]) + " " + str(item[2]) + "\n")
        if line_count % 500 == 0:
            print(len(dic_clus__id))
            # Delete old and small clusters and detach their biterms from the
            # biterm -> clusterIds index. Several alternative size-based
            # deletion heuristics were tried here and left commented out.
            list_c_sizes = []
            list_c_ids = []
            for c_id, txtIds in c_txtIds.items():
                list_c_sizes.append(len(txtIds))
                list_c_ids.append(dic_clus__id[c_id])
            mean_c_size = statistics.mean(list_c_sizes)
            std_c_size = statistics.stdev(list_c_sizes)
            mean_c_id = statistics.mean(list_c_ids)
            std_c_id = statistics.stdev(list_c_ids)
            print('process', line_count, 'texts', 'mean_c_size', mean_c_size, 'std_c_size', std_c_size)
            print('process', line_count, 'texts', 'mean_c_id', mean_c_id, 'std_c_id', std_c_id)
            list_del_cids = []
            for c_id, orderId in dic_clus__id.items():
                if c_id not in c_txtIds:
                    continue
                c_size = len(c_txtIds[c_id])
                if (float(c_id) <= float(abs(mean_c_id - std_c_id)) or
                        float(orderId) <= float(abs(mean_c_id - std_c_id))) and \
                        (c_size <= 1 or float(c_size) <= float(abs(mean_c_size - std_c_size))):
                    list_del_cids.append(c_id)
            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)', len(c_bitermsFreqs))
            for c_id in list_del_cids:
                BitermsFreqs = c_bitermsFreqs[c_id]
                for biterm, freq in BitermsFreqs.items():
                    if biterm not in dic_biterm__clusterIds:
                        continue
                    clusterIds = set(dic_biterm__clusterIds[biterm])
                    if c_id not in clusterIds:
                        continue
                    clusterIds.remove(c_id)
                    dic_biterm__clusterIds[biterm] = list(clusterIds)
                    if len(dic_biterm__clusterIds[biterm]) == 0:
                        del dic_biterm__clusterIds[biterm]
                del c_bitermsFreqs[c_id]
                del c_totalBiterms[c_id]
                del c_txtIds[c_id]
                del c_wordsFreqs[c_id]
                del c_totalWords[c_id]
                del dic_clus__id[c_id]
                if isSemantic == True:
                    del c_clusterVecs[c_id]
        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt', len(eval_pred_true_txt))
            Evaluate_old(eval_pred_true_txt, ignoreMinusOne)
            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)
    last_txtId = current_txt_id
    return [c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs,
            txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
            dic_biterm__clusterIds]
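# Detaching a deleted cluster from an inverted biterm index, in miniature:
inv = {'a b': [1, 2], 'b c': [2]}
dead = 2
for term in list(inv):
    ids = set(inv[term])
    ids.discard(dead)
    if ids:
        inv[term] = list(ids)
    else:
        del inv[term]
print(inv)  # {'a b': [1]}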
def test_cluster_biterm(testList_pred_true_words_index_postid, c_bitermsFreqs={}, c_totalBiterms={},
                        c_wordsFreqs={}, c_totalWords={}, c_txtIds={}, c_clusterVecs={}, txtId_txt={},
                        last_txtId=0, max_c_id=0, wordVectorsDic={}, dic_clus__id={},
                        dic_biterm__clusterIds={}, dicTrain_pred__trues={}):
    print("test cluster_bigram")
    current_txt_id = last_txtId
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()
    for item in testList_pred_true_words_index_postid:
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)
        current_txt_id += 1
        line_count += 1
        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
            text_Vec = X[0]
        targetClusterIds = findTargetClusters(txtBitermsFreqs, dic_biterm__clusterIds)
        print(targetClusterIds)
        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs,
            txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)
        if clusterId in dicTrain_pred__trues and testTrue in dicTrain_pred__trues[clusterId]:
            print('found found', 'clusterId', clusterId, 'testTrue', testTrue, words, postId,
                  'len', len(dicTrain_pred__trues[clusterId]))
        else:
            print('not found', 'clusterId', clusterId, 'testTrue', testTrue, words, postId)
        # Test-time is read-only: the cluster-update, file-output, and
        # periodic-deletion code from cluster_biterm was left commented out in
        # the original.
    last_txtId = current_txt_id
    return [c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs,
            txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterIds]
def test_cluster_bitermMapping_buffer(testList_pred_true_words_index_postid_createtime, c_bitermsFreqs={},
                                      c_totalBiterms={}, c_wordsFreqs={}, c_totalWords={}, c_txtIds={},
                                      c_clusterVecs={}, txtId_txt={}, last_txtId=0, max_c_id=0,
                                      wordVectorsDic={}, dic_clus__id={}, dic_biterm__clusterIds={},
                                      c_textItems={}, dic_ngram__textItems={}, min_gram=1, max_gram=2,
                                      max_hitindex=10000):
    eval_pred_true_txt = []
    line_count = 0
    print("testpostId" + "\t" + "trainPostId" + "\t" + "simtype" + "\t" + "hitranktype" + "\t" +
          "Proposed_hit_duration_micro" + "\t" + "Proposed_TestTrueLabel" + "\t" + "testText" + "\t" +
          "trainText" + "\t" + "testCreateTime" + "\t" + "TrainCreateTime" + "\t" + "DaysDiff")
    for item in testList_pred_true_words_index_postid_createtime:
        t11 = datetime.now()
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        testpostId = item[4]
        # Timestamps are assumed lowercased upstream, hence the split on 't'.
        testDateTime = datetime.strptime(item[5].split("t")[0], "%Y-%m-%d")
        bi_terms = construct_biterms(words)
        line_count += 1
        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
            text_Vec = X[0]
        # text -> biterms -> target clusters -> candidate train items,
        # merged with items sharing the longest n-grams.
        targetClusterIds = findTargetClusters(txtBitermsFreqs, dic_biterm__clusterIds)
        trainItems = findTextItems(targetClusterIds, c_textItems)
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        sortedGrams = list(sorted(grams, key=len, reverse=True))
        train_Items = aggregateTextItems(sortedGrams, dic_ngram__textItems)
        trainItems.extend(train_Items)
        pathCount = 0
        flag = False
        for trainItem in trainItems:
            trainTrue = int(trainItem[1])
            train_words = trainItem[2]
            trainPostId = trainItem[4]
            pathCount += 1
            if str(testTrue) == str(trainTrue):
                ProposedHitRank_val = int(max(1, math.floor(pathCount / len(sortedGrams))))
                t12 = datetime.now()
                t_diff = t12 - t11
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(words), Counter(train_words), len(words), len(train_words))
                trainDateTime = datetime.strptime(trainItem[5].split("t")[0], "%Y-%m-%d")
                date_diff = (trainDateTime - testDateTime).days
                print(str(testpostId) + "\t" + str(trainPostId) + "\t" + str(text_sim) + "\t" +
                      str(ProposedHitRank_val) + "\t" + str(t_diff.microseconds / float(microDivide)) +
                      "\t" + str(testTrue) + "\t" + ' '.join(words) + "\t" + ' '.join(train_words) +
                      "\t" + str(trainDateTime) + "\t" + str(testDateTime) + "\t" + str(date_diff))
                flag = True
                break
            if pathCount > max_hitindex:
                break
        # An n-gram-only fallback search was tried here and left commented out
        # in the original.
        if flag == False:
            t12 = datetime.now()
            t_diff = t12 - t11
            print(str(testpostId) + "\t" + "-100" + "\t0\t" + str(-100) + "\t" +
                  str(t_diff.microseconds / float(microDivide)) + "\t" + str(testTrue) + "\t" +
                  ' '.join(words) + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "")
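# The hit-rank normalization used above: the raw scan position divided by the
# number of query n-grams, floored at 1.
import math
pathCount, n_grams = 7, 3
rank = int(max(1, math.floor(pathCount / n_grams)))
print(rank)  # 2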
def test_cluster_bitermMapping(testList_pred_true_words_index_postid, c_bitermsFreqs={}, c_totalBiterms={},
                               c_wordsFreqs={}, c_totalWords={}, c_txtIds={}, c_clusterVecs={},
                               txtId_txt={}, last_txtId=0, max_c_id=0, wordVectorsDic={}, dic_clus__id={},
                               dic_biterm__clusterIds={}, dic_word__clusterIds={}, dicTrain_pred__trues={}):
    eval_pred_true_txt = []
    line_count = 0
    print("testpostId" + "\t" + "trainPostId" + "\tTitleSim\tBodySim\tTagSim\tLuceneHitRank\t" +
          "ProposedHitRank" + "\tlucene_hit_duration\t" + "Proposed_hit_duration_micro" + "\t" +
          "Proposed_TestTrueLabel")
    for item in testList_pred_true_words_index_postid:
        t11 = datetime.now()
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)
        print(words, bi_terms, pred)
        current_txt_id = int(postId)
        line_count += 1
        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
            text_Vec = X[0]
        # text -> biterms -> target clusters -> candidate text ids -> text items.
        targetClusterIds = findTargetClusters(txtBitermsFreqs, dic_biterm__clusterIds)
        print('len(targetClusterIds)', len(targetClusterIds))
        textIds = findTextIds(targetClusterIds, c_txtIds)
        print('len(textIds)', len(textIds))
        pathCount = 0
        flag = False
        for textId in textIds:
            trainItem = txtId_txt[textId]
            trainTrue = int(trainItem[1])
            trainPostId = trainItem[3]
            pathCount += 1
            if str(testTrue) == str(trainTrue):
                t12 = datetime.now()
                t_diff = t12 - t11
                print(str(postId) + "\t" + str(trainPostId) + "\t0\t0\t0\t0\t" +
                      str(len(targetClusterIds)) + "\t0\t" + str(t_diff.microseconds) + "\t" +
                      str(testTrue))
                flag = True
                break
            if pathCount > max_hitindex:
                break
        # A word-based fallback lookup (dic_word__clusterIds) was tried here
        # and left commented out in the original.
        if flag == False:
            t12 = datetime.now()
            t_diff = t12 - t11
            print(str(postId) + "\t" + "-100" + "\t0\t0\t0\t0\t-100" + "\t0\t" +
                  str(t_diff.microseconds) + "\t" + str(testTrue))
def clusteringDCT(pred_true_txt_ind_prevPreds, wordVectorsDic, batchDocs, maxPredLabel):
    print("#m-stream-cleaned")
    Evaluate(pred_true_txt_ind_prevPreds)
    pred_true_text_ind_prevPreds_to_cluster, pred_true_text_ind_prevPreds_to_not_cluster = \
        extrcatLargeClusterItems(pred_true_txt_ind_prevPreds)
    print("3rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4th=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    all_pred_clusters = len(groupTxtByClass(pred_true_txt_ind_prevPreds, False))
    pred_clusters = len(groupTxtByClass(pred_true_text_ind_prevPreds_to_cluster, False))
    non_pred_clusters = len(groupTxtByClass(pred_true_text_ind_prevPreds_to_not_cluster, False))
    print("#clusters=" + str(pred_clusters))
    print("#not clusters=" + str(non_pred_clusters))
    print("this clustering with embedding DCT")
    pred_clusters = non_pred_clusters - pred_clusters
    print("#update clusters=" + str(pred_clusters))
    nparr = np.array(pred_true_text_ind_prevPreds_to_cluster)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    texts = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    prevPreds = list(nparr[:, 4])
    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)
    # DCT-based sentence vectors, TF-IDF, k-means, an LSA projection, and a
    # DCT ward run were all tried here and left commented out; averaged word
    # vectors are used instead.
    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)
    ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text_ind_prevPred = np.column_stack(
        (ward.labels_, trues, texts, inds, prevPreds)).tolist()
    print("#hr-ward-AVG")
    pred_true_text_ind_prevPreds_to_not_cluster_hr = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_hr_pred_true_text_ind_prevPred)
    Evaluate(list_hr_pred_true_text_ind_prevPred + pred_true_text_ind_prevPreds_to_not_cluster_hr)
    print("#spectral-avg")
    clustering = SpectralClustering(n_clusters=pred_clusters, assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text_ind_prevPred = np.column_stack(
        (clustering.labels_, trues, texts, inds, prevPreds)).tolist()
    pred_true_text_ind_prevPreds_to_not_cluster_spec = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_sp_pred_true_text_ind_prevPred)
    Evaluate(list_sp_pred_true_text_ind_prevPred + pred_true_text_ind_prevPreds_to_not_cluster_spec)
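# np.column_stack as used above merges parallel per-item lists back into rows
# (numpy stringifies mixed types):
import numpy as np
labels = [0, 1]
trues = ['a', 'b']
texts = ['t1', 't2']
rows = np.column_stack((labels, trues, texts)).tolist()
print(rows)  # [['0', 'a', 't1'], ['1', 'b', 't2']]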
def test_cluster_bitermMapping_buffer_framework(list_CPost_test, c_CFVector, dic_txtId__CPost,
                                                dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
                                                dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram,
                                                max_gram, max_hitindex, oCSimilarityFlgas, wordVectorsDic):
    eval_pred_true_txt = []
    line_count = 0
    fileWrite = open(outfileName, 'w')
    header = ("testpostId" + "\t" + "trainPostId" + "\t" + "similarity" + "\t" + "Proposed_hitrank" +
              "\t" + "Proposed_hit_duration_micro" + "\t" + "Proposed_TestTrueLabel" + "\t" +
              "testText" + "\t" + "trainText" + "\t" + "testCreateTime" + "\t" + "TrainCreateTime" +
              "\t" + "DaysDiff" + "\t" + "OriginalRank")
    fileWrite.write(header + "\n")
    print(header)
    for oCPost in list_CPost_test:
        t11 = datetime.now()
        testTrue = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        # oCPost.id is not needed at test time
        testpostId = oCPost.soPostId
        testCreatetime = oCPost.createtime
        testWords = tagWords  # this can be changed
        txtBitermsFreqs_Tag = None
        bi_terms_len_Tag = 0
        txtBitermsFreqs_Title = None
        bi_terms_len_Title = 0
        txtBitermsFreqs_Body = None
        bi_terms_len_Body = 0
        text_VecTag = None
        text_VecTitle = None
        text_VecBody = None
        targetClusterIds = []
        grams = []
        line_count += 1
        # text -> biterms -> target clusters -> candidate train posts,
        # merged with posts sharing the longest n-grams.
        if oCSimilarityFlgas.isTagSim:
            bi_termsTag = construct_biterms(tagWords)
            grams.extend(generateGramsConsucetive(tagWords, min_gram, max_gram))
            txtBitermsFreqs_Tag = Counter(bi_termsTag)
            bi_terms_len_Tag = len(bi_termsTag)
            targetClusterIds.extend(findTargetClusters(txtBitermsFreqs_Tag, dic_bitermTag__clusterIds))
            if isSemantic:
                text_VecTag = generate_sent_vecs_toktextdata([tagWords], wordVectorsDic, embedDim)[0]
        if oCSimilarityFlgas.isTitleSim:
            bi_termsTitle = construct_biterms(titleWords)
            grams.extend(generateGramsConsucetive(titleWords, min_gram, max_gram))
            txtBitermsFreqs_Title = Counter(bi_termsTitle)
            bi_terms_len_Title = len(bi_termsTitle)
            targetClusterIds.extend(findTargetClusters(txtBitermsFreqs_Title, dic_bitermTitle__clusterIds))
            if isSemantic:
                text_VecTitle = generate_sent_vecs_toktextdata([titleWords], wordVectorsDic, embedDim)[0]
        if oCSimilarityFlgas.isBodySim:
            bi_termsBody = construct_biterms(bodyWords)
            grams.extend(generateGramsConsucetive(bodyWords, min_gram, max_gram))
            txtBitermsFreqs_Body = Counter(bi_termsBody)
            bi_terms_len_Body = len(bi_termsBody)
            targetClusterIds.extend(findTargetClusters(txtBitermsFreqs_Body, dic_bitermBody__clusterIds))
            if isSemantic:
                text_VecBody = generate_sent_vecs_toktextdata([bodyWords], wordVectorsDic, embedDim)[0]
        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag, txtBitermsFreqs_Title,
                                         bi_terms_len_Title, txtBitermsFreqs_Body, bi_terms_len_Body,
                                         text_VecTag, text_VecTitle, text_VecBody)
        targetClusterIds = set(targetClusterIds)
        closeClusterIds = findCloseClustersIds_framework(oCPostProcessed, targetClusterIds, c_CFVector,
                                                         oCSimilarityFlgas)
        train_cluster_CPosts = findTextItems_framework(closeClusterIds, c_CFVector, dic_txtId__CPost)
        # filterTextItems_framework was tried here and left commented out.
        sortedGrams = list(sorted(grams, key=len, reverse=True))
        train_gram_CPosts = aggregateTextItems_framework(sortedGrams, dic_ngram__txtIds, dic_txtId__CPost)
        train_gram_CPosts.extend(train_cluster_CPosts)
        pathCount = 0
        flag = False
        for trainCPost in train_gram_CPosts:
            trainTrue = int(str(trainCPost.trueLabel))
            train_words = trainCPost.tagWords  # this can be changed
            trainPostId = trainCPost.soPostId
            trainCreateTime = trainCPost.createtime
            pathCount += 1
            if str(testTrue) == str(trainTrue):
                ProposedHitRank_val = int(max(1, math.floor(pathCount / len(sortedGrams))))
                t12 = datetime.now()
                t_diff = t12 - t11
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(testWords), Counter(train_words), len(testWords), len(train_words))
                date_diff = (trainCreateTime - testCreatetime).days
                outLine = (str(testpostId) + "\t" + str(trainPostId) + "\t" + str(text_sim) + "\t" +
                           str(ProposedHitRank_val) + "\t" +
                           str(t_diff.microseconds / float(microDivide)) + "\t" + str(testTrue) +
                           "\t" + ' '.join(testWords) + "\t" + ' '.join(train_words) + "\t" +
                           str(testCreatetime) + "\t" + str(trainCreateTime) + "\t" + str(date_diff) +
                           "\t" + str(pathCount))
                print(outLine)
                fileWrite.write(outLine + "\n")
                flag = True
                break
            if pathCount > max_hitindex:
                break
        if not flag:
            t12 = datetime.now()
            t_diff = t12 - t11
            missLine = (str(testpostId) + "\t" + "-100" + "\t0\t" + str(-100) + "\t" +
                        str(t_diff.microseconds / float(microDivide)) + "\t" + str(testTrue) + "\t" +
                        ' '.join(testWords) + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "")
            print(missLine)
            fileWrite.write(missLine + "\n")
    fileWrite.close()
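# The day-difference computation used in the report lines above, in isolation:
from datetime import datetime
train = datetime.strptime("2019-03-10", "%Y-%m-%d")
test = datetime.strptime("2019-03-01", "%Y-%m-%d")
print((train - test).days)  # 9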