def cluster_gram_freq(list_docs, minGSize, maxGSize):
    """Cluster one batch of documents by the n-grams they share.

    Every document's ``text`` is expanded into n-grams of size
    ``minGSize``..``maxGSize`` and an inverted index gram -> documents is
    built.  The index is then filtered by the mean posting-list size,
    passed through ``removeCommonDocs`` and finally handed to
    ``evaluateByGramUsingDic``, whose result is returned unchanged.

    Args:
        list_docs: iterable of objects exposing a ``text`` attribute.
        minGSize: smallest n-gram size passed to the gram generator.
        maxGSize: largest n-gram size passed to the gram generator.

    Returns:
        Whatever ``evaluateByGramUsingDic`` returns (named
        ``dic_docId__cluster`` by the original author -- presumably a
        mapping document id -> cluster; confirm against that helper).
    """
    docs_by_gram = {}
    for doc in list_docs:
        for gram in generateGramsConsucetive(doc.text, minGSize, maxGSize):
            docs_by_gram.setdefault(gram, []).append(doc)

    gram_std, gram_mean, gram_max, gram_min = populateNgramStatistics(
        docs_by_gram, 1)
    print('gram_std, gram_mean, gram_max, gram_min', gram_std, gram_mean,
          gram_max, gram_min, 'before len(dic_ngram__docs)',
          len(docs_by_gram))

    # The std multiplier is currently 0, i.e. the threshold is the mean;
    # the term is kept so the factor can be tuned in one place.
    size_threshold = gram_mean + 0 * gram_std

    kept_grams = filterGrams(docs_by_gram, size_threshold)
    print('after len(dic_filtered_ngram__docs)', len(kept_grams))

    deduplicated = removeCommonDocs(kept_grams)
    print('after dic_removed_common__docs', len(deduplicated))
    print('###total docs in batch=', len(list_docs))

    return evaluateByGramUsingDic(deduplicated)
def buildNGramIndex(list_pred_true_words_index_postid_createtime):
    """Add a batch of text items to the module-level n-gram indexes.

    For each item (``item[2]`` = token list, ``item[3]`` = text id) this
    records the item in ``dic_txtId__text`` and appends the text id to the
    posting list of every generated n-gram in ``dic_ngram__txtIds``.

    When the module flag ``isSemantic`` is ``True`` it additionally caches
    one sentence vector per text id in ``dic_txtId__vec`` and keeps, per
    n-gram, the element-wise running sum of the vectors of all texts that
    contain it in ``dic_ngram__center``.

    Returns:
        None; all results are written to the module-level dictionaries.
    """
    for record in list_pred_true_words_index_postid_createtime:
        tokens = record[2]
        text_id = record[3]

        sentence_vec = None
        if isSemantic == True:
            if txt_vec_missing(text_id) if False else text_id not in dic_txtId__vec:
                # First time this text id is seen: embed once and cache.
                embedded = generate_sent_vecs_toktextdata(
                    [tokens], wordVectorsDic, embedDim)
                dic_txtId__vec[text_id] = embedded[0]
            sentence_vec = dic_txtId__vec[text_id]

        dic_txtId__text[text_id] = record

        for gram in generateGramsConsucetive(tokens, min_gram, max_gram):
            dic_ngram__txtIds.setdefault(gram, []).append(text_id)
            if isSemantic == True:
                if gram not in dic_ngram__center:
                    dic_ngram__center[gram] = sentence_vec
                else:
                    # A new list is built, so the cached sentence vector
                    # stored above is never mutated.
                    dic_ngram__center[gram] = list(
                        map(add, dic_ngram__center[gram], sentence_vec))
def buildNGramIndex(list_pred_true_words_index_postid):
    """Add a batch of text items to the module-level n-gram indexes.

    Each item is indexable with ``item[2]`` holding the token list and
    ``item[3]`` the text id.  The item is stored in ``dic_txtId__text``
    under its id, and the id is appended to the posting list of every
    n-gram (sizes ``min_gram``..``max_gram``) in ``dic_ngram__txtIds``.

    Returns:
        None; the module-level dictionaries are updated in place.
    """
    for record in list_pred_true_words_index_postid:
        tokens, text_id = record[2], record[3]
        dic_txtId__text[text_id] = record
        for gram in generateGramsConsucetive(tokens, min_gram, max_gram):
            dic_ngram__txtIds.setdefault(gram, []).append(text_id)
def buildNGramIndex(list_pred_true_words_index_postid_createtime):
    """Build fresh n-gram and text-id indexes for a batch of text items.

    Args:
        list_pred_true_words_index_postid_createtime: iterable of indexable
            items where ``item[2]`` is the token list and ``item[3]`` the
            text id.

    Returns:
        A two-element list ``[gram_index, item_index]`` where
        ``gram_index`` maps each n-gram to the list of text ids containing
        it (one entry per occurrence) and ``item_index`` maps each text id
        to its item.
    """
    gram_index = {}
    item_index = {}
    for record in list_pred_true_words_index_postid_createtime:
        tokens = record[2]
        text_id = record[3]
        item_index[text_id] = record
        for gram in generateGramsConsucetive(tokens, min_gram, max_gram):
            postings = gram_index.get(gram)
            if postings is None:
                postings = gram_index[gram] = []
            postings.append(text_id)
    return [gram_index, item_index]
def gramClusterToFeatures(dic_gram__txtIds, dic_txtId__text):
    """Turn gram-keyed clusters into bag-of-n-gram feature tables.

    Each key of ``dic_gram__txtIds`` is treated as a cluster id whose
    members are the listed text ids.  The n-grams of all member texts are
    pooled into one feature bag per cluster.

    Args:
        dic_gram__txtIds: mapping cluster id (a gram) -> iterable of text ids.
        dic_txtId__text: mapping text id -> item, with the token list at
            ``item[2]``.

    Returns:
        A three-element list:
        ``[term -> list of cluster ids, cluster id -> Counter of features,
        cluster id -> total number of pooled features]``.  A cluster id is
        appended to a term's list once per occurrence, so the lists may
        contain duplicates.
    """
    clusters_by_term = {}
    features_by_cluster = {}
    size_by_cluster = {}

    for cluster_id, member_ids in dic_gram__txtIds.items():
        pooled = []
        for text_id in member_ids:
            tokens = dic_txtId__text[text_id][2]
            text_grams = generateGramsConsucetive(tokens, min_gram, max_gram)
            pooled.extend(text_grams)
            for term in text_grams:
                clusters_by_term.setdefault(term, []).append(cluster_id)
        features_by_cluster[cluster_id] = Counter(pooled)
        size_by_cluster[cluster_id] = len(pooled)

    return [clusters_by_term, features_by_cluster, size_by_cluster]
# Score every test post against the gram clusters that share at least one
# n-gram with it.
testfile = 'test_stackoverflow_' + lang + '_true_id_title_tags_body_createtime'
testList_pred_true_words_index_postid_createtime = readStackOverflowDataSetBody(
    testfile, isStopWord, 6, textType, tagIgnore)

count = 0
for count, item in enumerate(testList_pred_true_words_index_postid_createtime,
                             start=1):
    testTruelabel = item[1]
    words = item[2]
    testpostId = item[4]
    testCreateTime = item[5]
    # Only the date part is parsed; the timestamp is cut at the first
    # lowercase "t" (presumably the input has been lower-cased -- confirm).
    testDateTime = datetime.strptime(
        str(testCreateTime).partition("t")[0], "%Y-%m-%d")

    t11 = datetime.now()

    # Bag-of-n-grams representation of the test post.
    test_grams = generateGramsConsucetive(words, min_gram, max_gram)
    test_term_dict = Counter(test_grams)
    test_term_size = len(test_grams)

    tagetGramClusterIds = getTagetGramClusterIds(test_grams,
                                                 dic_term_clusterGramIds)
    print('count', count, 'len(tagetGramClusterIds)',
          len(tagetGramClusterIds))

    dict_cluster_sims = {}
    for gramClusterId in tagetGramClusterIds:
        sim, commCount = computeTextSimCommonWord_WordDic(
            test_term_dict, dic_cluster_ftrs[gramClusterId], test_term_size,
            dic_cluster_size[gramClusterId])
def cluster_biterm(f,
                   list_pred_true_words_index_postid_createtime,
                   c_bitermsFreqs=None,
                   c_totalBiterms=None,
                   c_wordsFreqs=None,
                   c_totalWords=None,
                   c_txtIds=None,
                   c_clusterVecs=None,
                   txtId_txt=None,
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic=None,
                   dic_clus__id=None,
                   dic_biterm__clusterId_Freq=None,
                   dic_biterm__allClusterFreq=None,
                   dic_biterm__clusterIds=None,
                   c_textItems=None,
                   dic_ngram__textItems=None,
                   min_gram=1,
                   max_gram=2,
                   isTagSim=True,
                   isTitleSim=False,
                   isBodySim=False):
    """Assign a batch of texts to biterm clusters and update the cluster state.

    For every item (``item[1]`` true label, ``item[2]`` tokens, ``item[3]``
    text id, ``item[4]`` post id) the function indexes the item under each
    of its n-grams, picks a cluster via ``findTargetClusters`` /
    ``findCloseClusterByTargetClusters``, folds the text into the cluster
    statistics with ``populateClusterFeature`` and writes one line
    ``"<clusterId> <trueLabel> <tokens> <postId>"`` to ``f``.  Every 500
    items clusters whose size or id falls outside one standard deviation
    of the mean are dropped; every 1000 items ``Evaluate`` is called on
    the predictions so far.

    Args:
        f: writable text stream receiving one line per clustered item.
        list_pred_true_words_index_postid_createtime: the batch to cluster.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs: per-cluster state keyed by cluster id,
            threaded through ``populateClusterFeature``.
        txtId_txt: text id -> tokens, updated for every item.
        last_txtId: text id to report when the batch is empty.
        max_c_id: highest cluster id seen so far.
        wordVectorsDic: word vectors, used only when ``isSemantic`` is set.
        dic_clus__id: cluster id -> value of ``max_c_id`` at the cluster's
            most recent assignment (used as a recency measure when pruning).
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds: biterm-level indexes threaded through
            ``populateClusterFeature``.
        c_textItems: cluster id -> items assigned to it.
        dic_ngram__textItems: n-gram -> items containing it.
        min_gram, max_gram: n-gram size range.
        isTagSim, isTitleSim, isBodySim: accepted for interface
            compatibility; not read by this function.

    Returns:
        The 14-element list ``[c_bitermsFreqs, c_totalBiterms,
        c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs, txtId_txt,
        last_txtId, dic_clus__id, dic_biterm__clusterId_Freq,
        dic_biterm__allClusterFreq, dic_biterm__clusterIds, c_textItems,
        dic_ngram__textItems]`` to be passed into the next call.
    """
    # Containers default to None and are created per call: a literal ``{}``
    # default would be shared by every call that omits the argument, so
    # state would silently leak from one run into the next.
    c_bitermsFreqs = {} if c_bitermsFreqs is None else c_bitermsFreqs
    c_totalBiterms = {} if c_totalBiterms is None else c_totalBiterms
    c_wordsFreqs = {} if c_wordsFreqs is None else c_wordsFreqs
    c_totalWords = {} if c_totalWords is None else c_totalWords
    c_txtIds = {} if c_txtIds is None else c_txtIds
    c_clusterVecs = {} if c_clusterVecs is None else c_clusterVecs
    txtId_txt = {} if txtId_txt is None else txtId_txt
    wordVectorsDic = {} if wordVectorsDic is None else wordVectorsDic
    dic_clus__id = {} if dic_clus__id is None else dic_clus__id
    dic_biterm__clusterId_Freq = ({} if dic_biterm__clusterId_Freq is None
                                  else dic_biterm__clusterId_Freq)
    dic_biterm__allClusterFreq = ({} if dic_biterm__allClusterFreq is None
                                  else dic_biterm__allClusterFreq)
    dic_biterm__clusterIds = ({} if dic_biterm__clusterIds is None
                              else dic_biterm__clusterIds)
    c_textItems = {} if c_textItems is None else c_textItems
    dic_ngram__textItems = ({} if dic_ngram__textItems is None
                            else dic_ngram__textItems)

    print("cluster_bigram")

    # Seeded from last_txtId so an empty batch returns it unchanged instead
    # of failing on an unbound local after the loop.
    current_txt_id = last_txtId
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in list_pred_true_words_index_postid_createtime:
        words = item[2]
        current_txt_id = int(item[3])
        postId = item[4]

        bi_terms = construct_biterms(words)
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        for gram in grams:
            dic_ngram__textItems.setdefault(gram, []).append(item)
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        # Zero vector unless semantic similarity is switched on.
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        c_textItems.setdefault(clusterId, []).append(item)
        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
         c_totalWords, c_clusterVecs, dic_biterm__clusterId_Freq,
         dic_biterm__allClusterFreq,
         dic_biterm__clusterIds) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
             dic_biterm__clusterIds)

        eval_pred_true_txt.append([clusterId, item[1], item[2]])

        # With ignoreMinusOne set, items labelled '-1' are clustered but
        # not written to the output file.
        suppressed = ignoreMinusOne == True and str(item[1]) == '-1'
        if not suppressed:
            # str(postId): the id is not guaranteed to be a string.
            f.write(" ".join([str(clusterId), str(item[1]),
                              ' '.join(item[2]), str(postId)]) + "\n")

        if line_count % 500 == 0:
            print(len(dic_clus__id))

            # Delete old and small clusters.
            list_c_sizes = []
            list_c_ids = []
            for c_id, txtIds in c_txtIds.items():
                list_c_sizes.append(len(txtIds))
                list_c_ids.append(dic_clus__id[c_id])

            mean_c_size = 0
            std_c_size = 0
            if len(list_c_sizes) > 2:
                mean_c_size = statistics.mean(list_c_sizes)
                std_c_size = statistics.stdev(list_c_sizes)

            mean_c_id = 0
            std_c_id = 0
            if len(list_c_ids) > 2:
                mean_c_id = statistics.mean(list_c_ids)
                std_c_id = statistics.stdev(list_c_ids)

            print('preocess', line_count, 'texts', 'mean_c_size',
                  mean_c_size, 'std_c_size', std_c_size)
            print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id,
                  'std_c_id', std_c_id)

            size_low = float(abs(mean_c_size - std_c_size))
            size_high = mean_c_size + std_c_size
            id_low = float(abs(mean_c_id - std_c_id))
            id_high = float(abs(mean_c_id + std_c_id))

            # Collect first, delete afterwards: the dict must not change
            # size while it is being iterated.
            list_del_cids = set()
            for c_id, txtIds in c_txtIds.items():
                c_size = len(txtIds)
                if (c_size <= 1
                        or float(c_size) <= size_low
                        or float(c_size) >= size_high
                        or float(c_id) <= id_low
                        or float(c_id) >= id_high):
                    list_del_cids.add(c_id)

            print('#list_del_cids', len(list_del_cids),
                  'len(c_bitermsFreqs)', len(c_bitermsFreqs))

            for c_id in list_del_cids:
                c_bitermsFreqs.pop(c_id, None)
                c_totalBiterms.pop(c_id, None)
                c_txtIds.pop(c_id, None)
                c_wordsFreqs.pop(c_id, None)
                c_totalWords.pop(c_id, None)
                dic_clus__id.pop(c_id, None)
                if isSemantic == True:
                    # Tolerate a missing vector like every other table
                    # above; a bare ``del`` raised KeyError here.
                    c_clusterVecs.pop(c_id, None)

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    t12 = datetime.now()
    t_diff = t12 - t11
    print("total time diff secs=", t_diff.seconds)

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems
    ]
def cluster_biterm_framework(
        f, list_CPost, c_CFVector, max_c_id, dic_txtId__CPost, wordVectorsDic,
        dic_clus__id, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        oCSimilarityFlgas, c_itemsCount):
    """Cluster a batch of posts on their tag / title / body biterms.

    For every post the enabled fields (``oCSimilarityFlgas.isTagSim`` /
    ``isTitleSim`` / ``isBodySim``) are converted to biterm frequencies,
    their n-grams are added to ``dic_ngram__txtIds`` and the candidate
    clusters sharing a biterm are collected.  The post is then assigned a
    cluster by ``findCloseClusterByTargetClusters_framework``, one line
    ``"<clusterId> <trueLabel> <tag words> <soPostId>"`` is written to
    ``f`` and the cluster features are updated through
    ``populateClusterFeature_framework``.  Every ``DeleteInterval`` posts
    ``deleteOldClusters_framework`` is run; every 1000 posts ``Evaluate``
    is called on all predictions so far.

    Returns:
        ``[c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount]`` --
        the state to pass into the next batch.
    """

    def _ingest_field(field_words, biterm_index, post_key, candidate_ids):
        """Index one text field of a post; return (freqs, n_biterms, vec)."""
        biterms = construct_biterms(field_words)
        for gram in generateGramsConsucetive(field_words, min_gram, max_gram):
            # Grams already shared by more than max_cposts distinct posts
            # are not extended any further.
            if gram in dic_ngram__txtIds and len(
                    set(dic_ngram__txtIds[gram])) > max_cposts:
                continue
            dic_ngram__txtIds.setdefault(gram, []).append(post_key)
        freqs = Counter(biterms)
        n_biterms = len(biterms)
        candidate_ids.extend(findTargetClusters(freqs, biterm_index))
        vec = None
        if isSemantic:
            vec = generate_sent_vecs_toktextdata([field_words],
                                                 wordVectorsDic, embedDim)[0]
        return freqs, n_biterms, vec

    eval_pred_true_txt = []
    line_count = 0

    for oCPost in list_CPost:
        true_label = oCPost.trueLabel
        tag_words = oCPost.tagWords
        title_words = oCPost.titleWords
        body_words = oCPost.bodyWords
        post_key = oCPost.id
        so_post_id = oCPost.soPostId
        createtime = oCPost.createtime
        print('id', post_key, 'tagWords', tag_words, 'titleWords',
              title_words, 'bodyWords', body_words)

        # Defaults describe a field whose similarity flag is switched off.
        tag_freqs = title_freqs = body_freqs = None
        tag_len = title_len = body_len = 0
        tag_vec = title_vec = body_vec = None
        candidate_ids = []

        dic_txtId__CPost[post_key] = oCPost

        if oCSimilarityFlgas.isTagSim:
            tag_freqs, tag_len, tag_vec = _ingest_field(
                tag_words, dic_bitermTag__clusterIds, post_key,
                candidate_ids)
        if oCSimilarityFlgas.isTitleSim:
            title_freqs, title_len, title_vec = _ingest_field(
                title_words, dic_bitermTitle__clusterIds, post_key,
                candidate_ids)
        if oCSimilarityFlgas.isBodySim:
            body_freqs, body_len, body_vec = _ingest_field(
                body_words, dic_bitermBody__clusterIds, post_key,
                candidate_ids)

        processed = CPostProcessed(tag_freqs, tag_len, title_freqs,
                                   title_len, body_freqs, body_len, tag_vec,
                                   title_vec, body_vec)

        clusterId = findCloseClusterByTargetClusters_framework(
            c_CFVector, processed, set(candidate_ids), max_c_id,
            oCSimilarityFlgas)

        # With ignoreMinusOne set, posts labelled '-1' are clustered but
        # not written to the output file.
        if not ignoreMinusOne or str(true_label) != '-1':
            f.write(" ".join([str(clusterId), str(true_label),
                              ' '.join(tag_words), str(so_post_id)]) + "\n")

        eval_pred_true_txt.append([clusterId, true_label, tag_words])
        c_itemsCount[clusterId] = c_itemsCount.get(clusterId, 0) + 1

        max_c_id = max(max_c_id, clusterId, len(c_CFVector))
        dic_clus__id[clusterId] = max_c_id

        (c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
         dic_bitermBody__clusterIds) = populateClusterFeature_framework(
             c_CFVector, processed, dic_bitermTag__clusterIds,
             dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds,
             clusterId, post_key, oCSimilarityFlgas)

        line_count += 1
        if line_count % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(
                c_CFVector, c_itemsCount, dic_clus__id)
        if line_count % 1000 == 0:
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    return [
        c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount
    ]
def test_cluster_bitermMapping_buffer(
        testList_pred_true_words_index_postid_createtime,
        c_bitermsFreqs=None,
        c_totalBiterms=None,
        c_wordsFreqs=None,
        c_totalWords=None,
        c_txtIds=None,
        c_clusterVecs=None,
        txtId_txt=None,
        last_txtId=0,
        max_c_id=0,
        wordVectorsDic=None,
        dic_clus__id=None,
        dic_biterm__clusterIds=None,
        c_textItems=None,
        dic_ngram__textItems=None,
        min_gram=1,
        max_gram=2,
        max_hitindex=10000):
    """Look up, for every test item, the first training item with the same label.

    Candidate training items come from two sources, searched in this
    order: items of clusters sharing a biterm with the test text
    (``findTargetClusters`` + ``findTextItems``), then items sharing an
    n-gram with it (``aggregateTextItems`` over the grams sorted longest
    first).  One tab-separated row per test item is printed to stdout
    under the header printed first; a test item without a hit within
    ``max_hitindex`` candidates gets a row with ``-100`` placeholders.

    Args:
        testList_pred_true_words_index_postid_createtime: test items with
            ``item[1]`` true label, ``item[2]`` tokens, ``item[4]`` post id
            and ``item[5]`` creation time string.
        wordVectorsDic: word vectors, used only when ``isSemantic`` is set.
        dic_biterm__clusterIds: biterm -> cluster ids index.
        c_textItems: cluster id -> training items.
        dic_ngram__textItems: n-gram -> training items.
        min_gram, max_gram: n-gram size range.
        max_hitindex: stop scanning candidates after this many.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, txtId_txt, last_txtId, max_c_id,
        dic_clus__id: accepted for interface compatibility; not read here.

    Returns:
        None; results are printed.
    """
    # None instead of shared ``{}`` defaults; only the containers that are
    # actually read below need a concrete value.
    wordVectorsDic = {} if wordVectorsDic is None else wordVectorsDic
    dic_biterm__clusterIds = ({} if dic_biterm__clusterIds is None
                              else dic_biterm__clusterIds)
    c_textItems = {} if c_textItems is None else c_textItems
    dic_ngram__textItems = ({} if dic_ngram__textItems is None
                            else dic_ngram__textItems)

    def _elapsed_units(started):
        """Return the time since *started* in microseconds / microDivide."""
        delta = datetime.now() - started
        # timedelta.microseconds is only the sub-second component (0..999999)
        # and wraps every second, so the full duration is rebuilt from all
        # three fields.
        total_us = ((delta.days * 86400 + delta.seconds) * 1000000 +
                    delta.microseconds)
        return total_us / float(microDivide)

    print("\t".join([
        "testpostId", "trainPostId", "simtype", "hitranktype",
        "Proposed_hit_duration_micro", "Proposed_TestTrueLabel", "testText",
        "trainText", "testCreateTime", "TrainCreateTime", "DaysDiff"
    ]))

    for item in testList_pred_true_words_index_postid_createtime:
        t11 = datetime.now()
        testTrue = int(item[1])
        words = item[2]
        testpostId = item[4]
        # Only the date part is parsed; the timestamp is cut at the first
        # lowercase "t" (presumably the input is lower-cased -- confirm).
        testDateTime = datetime.strptime(item[5].split("t")[0], "%Y-%m-%d")

        bi_terms = construct_biterms(words)
        txtBitermsFreqs = Counter(bi_terms)

        # NOTE(review): the sentence vector is not used further down.  The
        # call is kept because it sits inside the timed region and the
        # helper's side effects are not visible from this function.
        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        # text -> biterms -> candidate clusters -> their training items
        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        trainItems = findTextItems(targetClusterIds, c_textItems)

        # ... followed by training items sharing an n-gram, longest first.
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        sortedGrams = sorted(grams, key=len, reverse=True)
        trainItems.extend(aggregateTextItems(sortedGrams,
                                             dic_ngram__textItems))

        # Guard the rank normalisation against a text without any n-gram.
        gram_count = max(1, len(sortedGrams))

        found = False
        for pathCount, trainItem in enumerate(trainItems, start=1):
            trainTrue = int(trainItem[1])
            train_words = trainItem[2]
            trainPostId = trainItem[4]

            if str(testTrue) == str(trainTrue):
                ProposedHitRank_val = int(
                    max(1, math.floor(pathCount / gram_count)))
                duration = _elapsed_units(t11)
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(words), Counter(train_words), len(words),
                    len(train_words))
                trainDateTime = datetime.strptime(
                    trainItem[5].split("t")[0], "%Y-%m-%d")
                date_diff = (trainDateTime - testDateTime).days
                # Test time before train time, as announced by the header.
                print("\t".join([
                    str(testpostId), str(trainPostId), str(text_sim),
                    str(ProposedHitRank_val), str(duration), str(testTrue),
                    ' '.join(words), ' '.join(train_words),
                    str(testDateTime), str(trainDateTime), str(date_diff)
                ]))
                found = True
                break

            if pathCount > max_hitindex:
                break

        if not found:
            duration = _elapsed_units(t11)
            print("\t".join([
                str(testpostId), "-100", "0", str(-100), str(duration),
                str(testTrue), ' '.join(words), "", "", "", ""
            ]))
def test_cluster_bitermMapping_buffer_framework(
        list_CPost_test, c_CFVector, dic_txtId__CPost,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        max_hitindex, oCSimilarityFlgas, wordVectorsDic):
    """Look up, for every test post, the first training post with the same label.

    The enabled fields of each test post (``oCSimilarityFlgas.isTagSim`` /
    ``isTitleSim`` / ``isBodySim``) are converted to biterm frequencies and
    n-grams.  Candidate training posts are those sharing an n-gram
    (``aggregateTextItems_framework``, grams sorted longest first)
    followed by the members of the closest clusters
    (``findCloseClustersIds_framework`` + ``findTextItems_framework``).
    One tab-separated row per test post is printed and written to the
    file named by the module-level ``outfileName``; a post without a hit
    within ``max_hitindex`` candidates gets a row with ``-100``
    placeholders.

    Args:
        list_CPost_test: test posts exposing ``trueLabel``, ``tagWords``,
            ``titleWords``, ``bodyWords``, ``soPostId`` and ``createtime``.
        c_CFVector: cluster feature state produced during training.
        dic_txtId__CPost: text id -> training post.
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds: per-field biterm -> cluster ids indexes.
        dic_ngram__txtIds: n-gram -> training text ids.
        min_gram, max_gram: n-gram size range.
        max_hitindex: stop scanning candidates after this many.
        oCSimilarityFlgas: object carrying the three field flags.
        wordVectorsDic: word vectors, used only when ``isSemantic`` is set.

    Returns:
        None; results are printed and written to ``outfileName``.
    """
    columns = [
        "testpostId", "trainPostId", "similarity", "Proposed_hitrank",
        "Proposed_hit_duration_micro", "Proposed_TestTrueLabel", "testText",
        "trainText", "testCreateTime", "TrainCreateTime", "DaysDiff",
        "OriginalRank"
    ]

    def _elapsed_units(started):
        """Return the time since *started* in microseconds / microDivide."""
        delta = datetime.now() - started
        # timedelta.microseconds is only the sub-second component (0..999999)
        # and wraps every second, so the full duration is rebuilt from all
        # three fields.
        total_us = ((delta.days * 86400 + delta.seconds) * 1000000 +
                    delta.microseconds)
        return total_us / float(microDivide)

    def _field_features(field_words, biterm_index, all_grams, candidate_ids):
        """Featurise one text field; return (freqs, n_biterms, vec)."""
        biterms = construct_biterms(field_words)
        all_grams.extend(
            generateGramsConsucetive(field_words, min_gram, max_gram))
        freqs = Counter(biterms)
        n_biterms = len(biterms)
        candidate_ids.extend(findTargetClusters(freqs, biterm_index))
        vec = None
        if isSemantic:
            vec = generate_sent_vecs_toktextdata([field_words],
                                                 wordVectorsDic, embedDim)[0]
        return freqs, n_biterms, vec

    # The context manager closes the report even when a lookup raises.
    with open(outfileName, 'w') as fileWrite:

        def _emit(fields):
            """Send one tab-separated row to stdout and to the report."""
            row = "\t".join(fields)
            print(row)
            fileWrite.write(row + "\n")

        _emit(columns)

        for oCPost in list_CPost_test:
            t11 = datetime.now()
            testTrue = oCPost.trueLabel
            tagWords = oCPost.tagWords
            titleWords = oCPost.titleWords
            bodyWords = oCPost.bodyWords
            testpostId = oCPost.soPostId
            testCreatetime = oCPost.createtime
            testWords = tagWords  # text used for the reported similarity

            # Defaults describe a field whose similarity flag is off.
            tag_freqs = title_freqs = body_freqs = None
            tag_len = title_len = body_len = 0
            tag_vec = title_vec = body_vec = None
            targetClusterIds = []
            grams = []

            if oCSimilarityFlgas.isTagSim:
                tag_freqs, tag_len, tag_vec = _field_features(
                    tagWords, dic_bitermTag__clusterIds, grams,
                    targetClusterIds)
            if oCSimilarityFlgas.isTitleSim:
                title_freqs, title_len, title_vec = _field_features(
                    titleWords, dic_bitermTitle__clusterIds, grams,
                    targetClusterIds)
            if oCSimilarityFlgas.isBodySim:
                body_freqs, body_len, body_vec = _field_features(
                    bodyWords, dic_bitermBody__clusterIds, grams,
                    targetClusterIds)

            oCPostProcessed = CPostProcessed(tag_freqs, tag_len, title_freqs,
                                             title_len, body_freqs, body_len,
                                             tag_vec, title_vec, body_vec)

            closeClusterIds = findCloseClustersIds_framework(
                oCPostProcessed, set(targetClusterIds), c_CFVector,
                oCSimilarityFlgas)
            train_cluster_CPosts = findTextItems_framework(
                closeClusterIds, c_CFVector, dic_txtId__CPost)

            # Gram-based candidates are scanned first, cluster-based after.
            sortedGrams = sorted(grams, key=len, reverse=True)
            train_gram_CPosts = aggregateTextItems_framework(
                sortedGrams, dic_ngram__txtIds, dic_txtId__CPost)
            train_gram_CPosts.extend(train_cluster_CPosts)

            # Guard the rank normalisation against a post without n-grams
            # (e.g. all similarity flags off).
            gram_count = max(1, len(sortedGrams))

            found = False
            for pathCount, trainCPost in enumerate(train_gram_CPosts,
                                                   start=1):
                trainTrue = int(str(trainCPost.trueLabel))
                train_words = trainCPost.tagWords
                trainPostId = trainCPost.soPostId
                trainCreateTime = trainCPost.createtime

                if str(testTrue) == str(trainTrue):
                    ProposedHitRank_val = int(
                        max(1, math.floor(pathCount / gram_count)))
                    duration = _elapsed_units(t11)
                    text_sim, commonCount = computeTextSimCommonWord_WordDic(
                        Counter(testWords), Counter(train_words),
                        len(testWords), len(train_words))
                    date_diff = (trainCreateTime - testCreatetime).days
                    _emit([
                        str(testpostId), str(trainPostId), str(text_sim),
                        str(ProposedHitRank_val), str(duration),
                        str(testTrue), ' '.join(testWords),
                        ' '.join(train_words), str(testCreatetime),
                        str(trainCreateTime), str(date_diff), str(pathCount)
                    ])
                    found = True
                    break

                if pathCount > max_hitindex:
                    break

            if not found:
                duration = _elapsed_units(t11)
                _emit([
                    str(testpostId), "-100", "0", str(-100), str(duration),
                    str(testTrue), ' '.join(testWords), "", "", "", "", ""
                ])