def assignToClusterSimDistribution(not_clustered_inds_batch, dic_bitri_keys_selectedClusters_seenBatch, seen_list_pred_true_words_index, wordVectorsDic):
    """Assign not-yet-clustered texts to existing clusters when the lexical
    and the semantic nearest cluster agree.

    First builds, per cluster key (MStream-style), a lexical profile
    (``Counter`` of all member words plus total word count) and a semantic
    profile (element-wise sum of the members' 300-d sentence vectors).
    Each item of ``not_clustered_inds_batch`` is then scored against every
    cluster; it receives the cluster's label only when the best lexical
    match shares at least one word with it AND names the same cluster as
    the best semantic match.  Items failing that test are dropped from the
    result (they remain unclustered).

    Args:
        not_clustered_inds_batch: iterable of ``[pred, true, words, global_index]``
            entries; ``words`` is a token list.
        dic_bitri_keys_selectedClusters_seenBatch: cluster key -> list of
            indices into ``seen_list_pred_true_words_index``.
        seen_list_pred_true_words_index: list of ``[pred, true, words, index]``
            for already-seen texts.
        wordVectorsDic: word -> embedding mapping consumed by
            ``generate_sent_vecs_toktextdata``.

    Returns:
        list of ``[new_pred, true, words, global_index]`` for the items
        that could be assigned; ``new_pred`` is the cluster key as ``str``.
    """
    new_not_clustered_inds_batch = []

    # --- build per-cluster lexical and semantic profiles (follows MStream) ---
    dic_ClusterWords = {}  # key -> [Counter of member words, total word count]
    dic_ClusterVecs = {}   # key -> summed 300-d sentence vector (deliberately NOT averaged)
    for key, txtInds in dic_bitri_keys_selectedClusters_seenBatch.items():
        cluster_words = []
        vec = np.zeros(shape=[300])
        for txtInd in txtInds:
            words = seen_list_pred_true_words_index[txtInd][2]
            cluster_words.extend(words)
            sent_vec = generate_sent_vecs_toktextdata([words], wordVectorsDic, 300)[0]
            vec = np.add(vec, np.asarray(sent_vec))
        dic_ClusterWords[key] = [Counter(cluster_words), len(cluster_words)]
        dic_ClusterVecs[key] = vec

    # --- assign each unclustered item when both similarity signals agree ---
    for item in not_clustered_inds_batch:
        true = item[1]
        word_arr = item[2]
        global_index = item[3]
        (dic_lex_Sim_CommonWords, maxPredLabel_lex, maxSim_lex,
         maxCommon_lex, minSim_lex) = commonWordSims_clusterGroup(word_arr, dic_ClusterWords)
        text_Vec = generate_sent_vecs_toktextdata([word_arr], wordVectorsDic, 300)[0]
        (dic_semanticSims, maxPredLabel_Semantic, maxSim_Semantic,
         minSim_semantic) = semanticSims(text_Vec, dic_ClusterVecs)
        # Require at least one shared word AND that the lexical winner and the
        # semantic winner are the same cluster; otherwise the item stays out.
        if maxCommon_lex > 0 and str(maxPredLabel_lex) == str(maxPredLabel_Semantic):
            new_pred = str(maxPredLabel_lex)
            new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])

    return new_not_clustered_inds_batch
def findCloseClusterByTargetClusters(c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec, dic_biterm__clusterIds, targetClusterIds):
    """Pick the closest cluster id for one text, restricted to targetClusterIds.

    Scans the candidate clusters with a lexical (biterm-overlap) similarity,
    then — when the module-level flag ``isSemantic`` is true — lets a semantic
    (vector) similarity override the choice.  A brand-new cluster id
    (``max_c_id + 1``) is returned when no candidate is convincing.

    NOTE(review): indentation of this body was reconstructed from a
    whitespace-mangled source; statement nesting below reflects the most
    plausible reading — confirm against the original repository.

    Args:
        c_bitermsFreqs: cluster id -> biterm frequency dict.
        c_totalBiterms: cluster id -> total biterm count in that cluster.
        c_txtIds: per-cluster text ids; passed through to ``semanticSims``.
        c_wordsFreqs, c_totalWords, txtWordsFreqs, words_len,
        dic_biterm__clusterIds: unused in this function body (kept for
            interface compatibility with sibling finders, presumably).
        c_clusterVecs: cluster id -> cluster vector, used by ``semanticSims``.
        txtBitermsFreqs: the text's biterm frequency dict.
        bi_terms_len: total biterm count of the text.
        max_c_id: current maximum cluster id; ``max_c_id + 1`` denotes a new cluster.
        text_Vec: the text's sentence vector, used by ``semanticSims``.
        targetClusterIds: iterable of candidate cluster ids to consider.

    Returns:
        The chosen cluster id, or ``max_c_id + 1`` for "open a new cluster".
    """
    clusterId_lex = -1   # best lexical candidate (-1 = none found)
    clusterId_sem = -1   # best semantic candidate (set only if isSemantic)
    clusterId = -1       # final answer
    max_sim = 0
    max_sim_lex = 0
    dic_lexicalSims = {}  # cluster id -> lexical similarity, for mean/stdev below
    for clusId in targetClusterIds:
        # Skip candidates that no longer exist in the cluster tables.
        if clusId not in c_bitermsFreqs:
            continue
        clusBitermsFreqs = c_bitermsFreqs[clusId]
        txt_j_len = c_totalBiterms[clusId]
        text_sim, commonCount = computeTextSimCommonWord_WordDic(
            txtBitermsFreqs, clusBitermsFreqs, bi_terms_len, txt_j_len)
        # NOTE(review): max_sim and max_sim_lex track the same quantity in
        # parallel; only clusterId_lex/max_sim_lex are used after the loop.
        if text_sim > max_sim:
            max_sim = text_sim
            clusterId = clusId
        if text_sim > max_sim_lex:
            max_sim_lex = text_sim
            clusterId_lex = clusId
        dic_lexicalSims[clusId] = text_sim
    lex_sim_values = list(dic_lexicalSims.values())
    mean_lex_sim = 0
    std_lex_sim = 0
    # stdev needs >= 2 samples; the > 2 guard keeps tiny candidate sets at
    # threshold 0 so a single weak match still clears the comparison below.
    if len(lex_sim_values) > 2:
        mean_lex_sim = statistics.mean(lex_sim_values)
        std_lex_sim = statistics.stdev(lex_sim_values)
    if clusterId_lex == -1:  # or clusterId_sem==-1:
        # No lexical candidate at all: provisionally open a new cluster.
        # clusterId=len(c_bitermsFreqs)+1
        clusterId = max_c_id + 1
    # ``isSemantic`` is a module-level flag not visible in this chunk —
    # presumably toggles the semantic override; verify at module scope.
    if isSemantic == True:
        dic_semanticSims, clusterId_sem, maxSim_Semantic, minSim_semantic = semanticSims(
            text_Vec, c_clusterVecs, c_txtIds)
        sem_sim_values = list(dic_semanticSims.values())
        mean_sem_sim = 0
        std_sem_sim = 0
        if len(sem_sim_values) > 2:
            mean_sem_sim = statistics.mean(sem_sim_values)
            std_sem_sim = statistics.stdev(sem_sim_values)
        # Accept the semantic winner only when it stands out by >= 1 stdev
        # above the mean similarity across candidates.
        if maxSim_Semantic >= mean_sem_sim + std_sem_sim:  # and randint(0,1)==1: work
            clusterId = clusterId_sem
        # elif clusterId_lex!=clusterId_sem:
        #     clusterId=max_c_id+1
        # elif clusterId_lex==clusterId_sem:
        #     clusterId=clusterId_lex
    elif max_sim_lex >= mean_lex_sim + std_lex_sim:  # and randint(0,1)==1: work
        # Lexical-only mode: same "one stdev above the mean" outlier test.
        clusterId = clusterId_lex
    else:
        clusterId = max_c_id + 1  # clusterId_lex
    # print(text_Vec, clusterId_lex, clusterId_sem, clusterId)
    return clusterId