def _find_stale_clusters(c_txtIds, dic_clus__id, line_count):
    """Return the ids of clusters the periodic clean-up should delete.

    A cluster is selected when it is "old" AND its size is atypical:

    * old: its id, or the order value stored for it in ``dic_clus__id``, is
      at most ``abs(mean - stdev)`` of the order values of all live clusters;
    * atypical size: at most one text, at most ``abs(mean - stdev)`` of the
      cluster sizes, or at least ``mean + stdev`` of the cluster sizes.

    With fewer than two live clusters the standard deviation is undefined
    (``statistics.stdev`` raises ``StatisticsError``), so nothing is selected.

    Args:
        c_txtIds: mapping cluster id -> collection of text ids in the cluster.
        dic_clus__id: mapping cluster id -> order value recorded at the last
            assignment to that cluster.
        line_count: number of texts processed so far (only used for logging).

    Returns:
        List of cluster ids to delete (possibly empty).

    Raises:
        KeyError: if a cluster in ``c_txtIds`` has no entry in
            ``dic_clus__id``.
    """
    list_c_sizes = [len(txtIds) for txtIds in c_txtIds.values()]
    list_c_ids = [dic_clus__id[c_id] for c_id in c_txtIds]
    if len(list_c_sizes) < 2:
        return []

    mean_c_size = statistics.mean(list_c_sizes)
    std_c_size = statistics.stdev(list_c_sizes)
    mean_c_id = statistics.mean(list_c_ids)
    std_c_id = statistics.stdev(list_c_ids)

    print('preocess', line_count, 'texts', 'mean_c_size', mean_c_size,
          'std_c_size', std_c_size)
    print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id,
          'std_c_id', std_c_id)

    age_cutoff = float(abs(mean_c_id - std_c_id))
    small_cutoff = float(abs(mean_c_size - std_c_size))
    large_cutoff = mean_c_size + std_c_size

    list_del_cids = []
    for c_id, orderId in dic_clus__id.items():
        if c_id not in c_txtIds:
            continue
        c_size = len(c_txtIds[c_id])
        is_old = float(c_id) <= age_cutoff or float(orderId) <= age_cutoff
        is_atypical = (c_size <= 1 or float(c_size) <= small_cutoff
                       or float(c_size) >= large_cutoff)
        if is_old and is_atypical:
            list_del_cids.append(c_id)
    return list_del_cids


def cluster_biterm(f,
                   list_pred_true_words_index,
                   c_bitermsFreqs=None,
                   c_totalBiterms=None,
                   c_wordsFreqs=None,
                   c_totalWords=None,
                   c_txtIds=None,
                   c_clusterVecs=None,
                   txtId_txt=None,
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic=None,
                   dic_clus__id=None,
                   dic_biterm__clusterId_Freq=None,
                   dic_biterm__allClusterFreq=None):
    """Assign each text of a batch to a cluster and update the cluster state.

    For every item the text is turned into biterms, ``findCloseCluster``
    picks a cluster id, ``populateClusterFeature`` folds the text into the
    cluster statistics, and one tab-separated line
    ``<clusterId>\\t<item[1]>\\t<item[2]>`` is written to ``f`` (skipped for
    label ``'-1'`` when the module-level ``ignoreMinusOne`` is set).
    Every 500 texts old, atypically sized clusters are deleted; every 1000
    texts ``Evaluate`` is called on everything seen so far in this call.

    NOTE(review): a second function with the same name is defined later in
    this file and replaces this one at import time.

    Args:
        f: writable text file-like object receiving the assignments.
        list_pred_true_words_index: iterable of items where ``item[1]`` is
            the true label and ``item[2]`` the list of words of the text.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs: per-cluster state dictionaries keyed by cluster id;
            their exact contents are maintained by ``populateClusterFeature``.
            ``None`` (the default) starts from an empty dictionary.
        txtId_txt: mapping text id -> words, extended with the new texts.
        last_txtId: last text id used so far; new texts get the next ids.
        max_c_id: largest cluster order value seen so far.
        wordVectorsDic: unused here; kept for interface compatibility.
        dic_clus__id: mapping cluster id -> ``max_c_id`` at the time the
            cluster last received a text.
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq: biterm
            statistics maintained by ``populateClusterFeature``.

    Returns:
        ``[c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq]``.
    """
    # A ``{}`` default is created once and shared by every call, so state
    # would leak between unrelated runs; build fresh containers instead.
    if c_bitermsFreqs is None:
        c_bitermsFreqs = {}
    if c_totalBiterms is None:
        c_totalBiterms = {}
    if c_wordsFreqs is None:
        c_wordsFreqs = {}
    if c_totalWords is None:
        c_totalWords = {}
    if c_txtIds is None:
        c_txtIds = {}
    if c_clusterVecs is None:
        c_clusterVecs = {}
    if txtId_txt is None:
        txtId_txt = {}
    if wordVectorsDic is None:
        wordVectorsDic = {}
    if dic_clus__id is None:
        dic_clus__id = {}
    if dic_biterm__clusterId_Freq is None:
        dic_biterm__clusterId_Freq = {}
    if dic_biterm__allClusterFreq is None:
        dic_biterm__allClusterFreq = {}

    print("cluster_bigram")

    current_txt_id = last_txtId
    eval_pred_treu_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in list_pred_true_words_index:
        words = item[2]
        bi_terms = construct_biterms(words)

        current_txt_id += 1
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        # Semantic vectors are not used by this variant: pass a zero vector.
        text_Vec = [0] * embedDim

        clusterId = findCloseCluster(c_bitermsFreqs, c_totalBiterms, c_txtIds,
                                     c_wordsFreqs, c_totalWords, c_clusterVecs,
                                     txtBitermsFreqs, bi_terms_len,
                                     txtWordsFreqs, words_len, max_c_id,
                                     text_Vec)

        max_c_id = max(max_c_id, clusterId, len(c_bitermsFreqs))
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterId_Freq,
         dic_biterm__allClusterFreq) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)

        eval_pred_treu_txt.append([clusterId, item[1], item[2]])
        if not ignoreMinusOne or str(item[1]) != '-1':
            f.write("\t".join(
                [str(clusterId), str(item[1]), str(item[2])]) + "\n")

        if line_count % 500 == 0:
            print(len(dic_clus__id))
            # Delete old clusters whose size is atypical.
            list_del_cids = _find_stale_clusters(c_txtIds, dic_clus__id,
                                                 line_count)

            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)',
                  len(c_bitermsFreqs))

            for c_id in list_del_cids:
                # pop() instead of del: tolerate maps that are out of sync.
                c_bitermsFreqs.pop(c_id, None)
                c_totalBiterms.pop(c_id, None)
                c_txtIds.pop(c_id, None)
                c_wordsFreqs.pop(c_id, None)
                c_totalWords.pop(c_id, None)
                dic_clus__id.pop(c_id, None)

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_treu_txt',
                  len(eval_pred_treu_txt))
            Evaluate(eval_pred_treu_txt, ignoreMinusOne)

            # total_seconds(): ``.seconds`` alone drops whole days.
            t_diff = datetime.now() - t11
            print("total time diff secs=", int(t_diff.total_seconds()))

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq
    ]
def _extract_field_features(field_words, dic_biterm__clusterIds, post_id,
                            dic_ngram__txtIds, min_gram, max_gram,
                            wordVectorsDic):
    """Compute the features of one post field (tag, title or body words).

    Registers ``post_id`` under every consecutive n-gram of the field in
    ``dic_ngram__txtIds`` (unless that n-gram already lists more than
    ``max_cposts`` distinct posts), counts the field's biterms and looks up
    the candidate clusters for them.

    Returns:
        ``(biterm_freqs, biterm_count, candidate_cluster_ids, field_vector)``
        where ``field_vector`` is ``None`` unless the module-level
        ``isSemantic`` flag is set.
    """
    biterms = construct_biterms(field_words)

    for gram in generateGramsConsucetive(field_words, min_gram, max_gram):
        saturated = (gram in dic_ngram__txtIds
                     and len(set(dic_ngram__txtIds[gram])) > max_cposts)
        if not saturated:
            dic_ngram__txtIds.setdefault(gram, []).append(post_id)

    biterm_freqs = Counter(biterms)
    biterm_count = len(biterms)
    candidate_ids = findTargetClusters(biterm_freqs, dic_biterm__clusterIds)

    field_vector = None
    if isSemantic:
        field_vector = generate_sent_vecs_toktextdata(
            [field_words], wordVectorsDic, embedDim)[0]

    return biterm_freqs, biterm_count, candidate_ids, field_vector


def cluster_biterm_framework(
        f, list_CPost, c_CFVector, max_c_id, dic_txtId__CPost, wordVectorsDic,
        dic_clus__id, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        oCSimilarityFlgas, c_itemsCount):
    """Cluster a batch of posts using tag, title and/or body similarity.

    Each post is stored in ``dic_txtId__CPost``, its enabled fields (per
    ``oCSimilarityFlgas.isTagSim`` / ``isTitleSim`` / ``isBodySim``) are
    turned into biterm features, and
    ``findCloseClusterByTargetClusters_framework`` selects the cluster id.
    A line ``<clusterId>\\t<trueLabel>\\t<tag words>\\t<soPostId>`` is written
    to ``f`` (skipped for label ``'-1'`` when ``ignoreMinusOne`` is set), the
    cluster features are updated through ``populateClusterFeature_framework``
    and, every ``DeleteInterval`` posts, ``deleteOldClusters_framework`` is
    invoked. ``Evaluate`` runs every 1000 posts.

    Returns:
        ``[c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount]``.
    """
    eval_pred_true_txt = []

    for processed, oCPost in enumerate(list_CPost, start=1):
        true_label = oCPost.trueLabel
        tag_words = oCPost.tagWords
        title_words = oCPost.titleWords
        body_words = oCPost.bodyWords
        post_id = oCPost.id
        so_post_id = oCPost.soPostId

        print('id', post_id, 'tagWords', tag_words, 'titleWords', title_words,
              'bodyWords', body_words)

        dic_txtId__CPost[post_id] = oCPost

        # Features of disabled fields keep these neutral defaults.
        tag_freqs, tag_len, tag_vec = None, 0, None
        title_freqs, title_len, title_vec = None, 0, None
        body_freqs, body_len, body_vec = None, 0, None
        candidate_ids = []

        if oCSimilarityFlgas.isTagSim:
            tag_freqs, tag_len, found, tag_vec = _extract_field_features(
                tag_words, dic_bitermTag__clusterIds, post_id,
                dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic)
            candidate_ids.extend(found)

        if oCSimilarityFlgas.isTitleSim:
            title_freqs, title_len, found, title_vec = _extract_field_features(
                title_words, dic_bitermTitle__clusterIds, post_id,
                dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic)
            candidate_ids.extend(found)

        if oCSimilarityFlgas.isBodySim:
            body_freqs, body_len, found, body_vec = _extract_field_features(
                body_words, dic_bitermBody__clusterIds, post_id,
                dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic)
            candidate_ids.extend(found)

        processed_post = CPostProcessed(tag_freqs, tag_len, title_freqs,
                                        title_len, body_freqs, body_len,
                                        tag_vec, title_vec, body_vec)

        clusterId = findCloseClusterByTargetClusters_framework(
            c_CFVector, processed_post, set(candidate_ids), max_c_id,
            oCSimilarityFlgas)

        if not ignoreMinusOne or str(true_label) != '-1':
            f.write("\t".join((str(clusterId), str(true_label),
                               ' '.join(tag_words), str(so_post_id))) + "\n")

        eval_pred_true_txt.append([clusterId, true_label, tag_words])

        c_itemsCount[clusterId] = c_itemsCount.get(clusterId, 0) + 1

        max_c_id = max(max_c_id, clusterId, len(c_CFVector))
        dic_clus__id[clusterId] = max_c_id

        (c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
         dic_bitermBody__clusterIds) = populateClusterFeature_framework(
             c_CFVector, processed_post, dic_bitermTag__clusterIds,
             dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds,
             clusterId, post_id, oCSimilarityFlgas)

        del processed_post

        if processed % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(
                c_CFVector, c_itemsCount, dic_clus__id)

        if processed % 1000 == 0:
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    return [
        c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount
    ]
def cluster_biterm(f,
                   list_pred_true_words_index_postid_createtime,
                   c_bitermsFreqs=None,
                   c_totalBiterms=None,
                   c_wordsFreqs=None,
                   c_totalWords=None,
                   c_txtIds=None,
                   c_clusterVecs=None,
                   txtId_txt=None,
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic=None,
                   dic_clus__id=None,
                   dic_biterm__clusterId_Freq=None,
                   dic_biterm__allClusterFreq=None,
                   dic_biterm__clusterIds=None,
                   c_textItems=None,
                   dic_ngram__textItems=None,
                   min_gram=1,
                   max_gram=2,
                   isTagSim=True,
                   isTitleSim=False,
                   isBodySim=False):
    """Assign each text of a batch to a cluster and update the cluster state.

    For every item the words are turned into biterms and consecutive
    n-grams; the item is indexed under each n-gram in
    ``dic_ngram__textItems``; candidate clusters are looked up through
    ``findTargetClusters`` and the final cluster id is chosen by
    ``findCloseClusterByTargetClusters``. The item is appended to
    ``c_textItems[clusterId]``, the cluster statistics are updated by
    ``populateClusterFeature`` and a line
    ``<clusterId>\\t<item[1]>\\t<words joined by space>\\t<postId>`` is written
    to ``f`` (skipped for label ``'-1'`` when ``ignoreMinusOne`` is set).
    Every 500 texts clusters with atypical size or id are deleted; every
    1000 texts ``Evaluate`` is called on everything seen in this call.

    NOTE(review): the clean-up removes clusters from the ``c_*`` maps and
    ``dic_clus__id`` only; ``c_textItems`` and ``dic_biterm__clusterIds``
    keep referring to deleted ids. When three or fewer clusters exist the
    statistics fall back to 0, which makes ``size >= mean + stdev`` true for
    every cluster, so all of them are deleted — confirm this is intended.

    Args:
        f: writable text file-like object receiving the assignments.
        list_pred_true_words_index_postid_createtime: iterable of items with
            ``item[1]`` the true label, ``item[2]`` the list of words,
            ``item[3]`` the text id (converted with ``int``) and ``item[4]``
            the post id.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs: per-cluster state keyed by cluster id, maintained by
            ``populateClusterFeature``. ``None`` starts from an empty dict.
        txtId_txt: mapping text id -> words.
        last_txtId: value returned as ``last_txtId`` when the batch is empty.
        max_c_id: largest cluster order value seen so far.
        wordVectorsDic: word vectors, used only when ``isSemantic`` is set.
        dic_clus__id: mapping cluster id -> ``max_c_id`` at its last update.
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds: biterm indexes maintained by
            ``populateClusterFeature``.
        c_textItems: mapping cluster id -> list of items assigned to it.
        dic_ngram__textItems: mapping n-gram -> list of items containing it.
        min_gram, max_gram: n-gram size bounds passed to
            ``generateGramsConsucetive``.
        isTagSim, isTitleSim, isBodySim: unused here; kept for interface
            compatibility.

    Returns:
        ``[c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems]`` where
        ``last_txtId`` is the id of the last processed text.
    """
    # A ``{}`` default is created once and shared by every call, so state
    # would leak between unrelated runs; build fresh containers instead.
    if c_bitermsFreqs is None:
        c_bitermsFreqs = {}
    if c_totalBiterms is None:
        c_totalBiterms = {}
    if c_wordsFreqs is None:
        c_wordsFreqs = {}
    if c_totalWords is None:
        c_totalWords = {}
    if c_txtIds is None:
        c_txtIds = {}
    if c_clusterVecs is None:
        c_clusterVecs = {}
    if txtId_txt is None:
        txtId_txt = {}
    if wordVectorsDic is None:
        wordVectorsDic = {}
    if dic_clus__id is None:
        dic_clus__id = {}
    if dic_biterm__clusterId_Freq is None:
        dic_biterm__clusterId_Freq = {}
    if dic_biterm__allClusterFreq is None:
        dic_biterm__allClusterFreq = {}
    if dic_biterm__clusterIds is None:
        dic_biterm__clusterIds = {}
    if c_textItems is None:
        c_textItems = {}
    if dic_ngram__textItems is None:
        dic_ngram__textItems = {}

    print("cluster_bigram")

    # Bound up front so that an empty batch returns the caller's last id
    # instead of failing with UnboundLocalError at the end.
    current_txt_id = last_txtId

    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in list_pred_true_words_index_postid_createtime:
        words = item[2]
        current_txt_id = int(item[3])
        postId = item[4]

        bi_terms = construct_biterms(words)
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        for gram in grams:
            dic_ngram__textItems.setdefault(gram, []).append(item)

        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic:
            text_Vec = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                                      embedDim)[0]

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)

        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        c_textItems.setdefault(clusterId, []).append(item)

        max_c_id = max(max_c_id, clusterId, len(c_bitermsFreqs))
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterId_Freq,
         dic_biterm__allClusterFreq,
         dic_biterm__clusterIds) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
             dic_biterm__clusterIds)

        eval_pred_true_txt.append([clusterId, item[1], item[2]])
        if not ignoreMinusOne or str(item[1]) != '-1':
            # str(postId): tolerate non-string post ids.
            f.write("\t".join([str(clusterId), str(item[1]),
                               ' '.join(item[2]), str(postId)]) + "\n")

        if line_count % 500 == 0:
            print(len(dic_clus__id))

            # Gather size and order statistics of the live clusters.
            list_c_sizes = [len(txtIds) for txtIds in c_txtIds.values()]
            list_c_ids = [dic_clus__id[c_id] for c_id in c_txtIds]

            mean_c_size = 0
            std_c_size = 0
            if len(list_c_sizes) > 2:
                mean_c_size = statistics.mean(list_c_sizes)
                std_c_size = statistics.stdev(list_c_sizes)

            mean_c_id = 0
            std_c_id = 0
            if len(list_c_ids) > 2:
                mean_c_id = statistics.mean(list_c_ids)
                std_c_id = statistics.stdev(list_c_ids)

            print('preocess', line_count, 'texts', 'mean_c_size', mean_c_size,
                  'std_c_size', std_c_size)
            print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id,
                  'std_c_id', std_c_id)

            small_cutoff = float(abs(mean_c_size - std_c_size))
            large_cutoff = mean_c_size + std_c_size
            low_id_cutoff = float(abs(mean_c_id - std_c_id))
            high_id_cutoff = float(abs(mean_c_id + std_c_id))

            # A cluster goes when its size OR its id lies outside one
            # standard deviation around the mean.
            list_del_cids = set()
            for c_id, txtIds in c_txtIds.items():
                c_size = len(txtIds)
                if (c_size <= 1 or float(c_size) <= small_cutoff
                        or float(c_size) >= large_cutoff
                        or float(c_id) <= low_id_cutoff
                        or float(c_id) >= high_id_cutoff):
                    list_del_cids.add(c_id)

            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)',
                  len(c_bitermsFreqs))

            for c_id in list_del_cids:
                c_bitermsFreqs.pop(c_id, None)
                c_totalBiterms.pop(c_id, None)
                c_txtIds.pop(c_id, None)
                c_wordsFreqs.pop(c_id, None)
                c_totalWords.pop(c_id, None)
                dic_clus__id.pop(c_id, None)
                if isSemantic:
                    # Guarded like the other maps: a missing vector must not
                    # abort the clean-up half-way through.
                    c_clusterVecs.pop(c_id, None)

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

            # total_seconds(): ``.seconds`` alone drops whole days.
            t_diff = datetime.now() - t11
            print("total time diff secs=", int(t_diff.total_seconds()))

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems
    ]
# Example no. 4 (separator left over from the code-example listing; originally "Esempio n. 4")
# 0
def test_cluster_biterm(testList_pred_true_words_index_postid,
                        c_bitermsFreqs=None,
                        c_totalBiterms=None,
                        c_wordsFreqs=None,
                        c_totalWords=None,
                        c_txtIds=None,
                        c_clusterVecs=None,
                        txtId_txt=None,
                        last_txtId=0,
                        max_c_id=0,
                        wordVectorsDic=None,
                        dic_clus__id=None,
                        dic_biterm__clusterIds=None,
                        dicTrain_pred__trues=None):
    """Look up the closest trained cluster for each test text and report it.

    For every test item the biterms of its words are used to find candidate
    clusters (``findTargetClusters``) and the closest one
    (``findCloseClusterByTargetClusters``). The function prints the candidate
    ids and then either ``'found found'`` — when the item's true label is
    among the training labels recorded for that cluster in
    ``dicTrain_pred__trues`` — or ``'not found'``. The cluster state itself
    is not updated by this function.

    Args:
        testList_pred_true_words_index_postid: iterable of items with
            ``item[1]`` the true label (converted with ``int``), ``item[2]``
            the list of words and ``item[4]`` the post id.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs: per-cluster state passed through to
            ``findCloseClusterByTargetClusters``. ``None`` means empty.
        txtId_txt, dic_clus__id: returned unchanged.
        last_txtId: last text id used so far; incremented once per item.
        max_c_id: passed through to ``findCloseClusterByTargetClusters``.
        wordVectorsDic: word vectors, used only when ``isSemantic`` is set.
        dic_biterm__clusterIds: biterm -> cluster ids index.
        dicTrain_pred__trues: mapping cluster id -> container of true labels
            seen for that cluster during training.

    Returns:
        ``[c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterIds]``.
    """
    # The dictionaries are handed back to the caller, so a shared ``{}``
    # default would alias across calls; build fresh containers instead.
    if c_bitermsFreqs is None:
        c_bitermsFreqs = {}
    if c_totalBiterms is None:
        c_totalBiterms = {}
    if c_wordsFreqs is None:
        c_wordsFreqs = {}
    if c_totalWords is None:
        c_totalWords = {}
    if c_txtIds is None:
        c_txtIds = {}
    if c_clusterVecs is None:
        c_clusterVecs = {}
    if txtId_txt is None:
        txtId_txt = {}
    if wordVectorsDic is None:
        wordVectorsDic = {}
    if dic_clus__id is None:
        dic_clus__id = {}
    if dic_biterm__clusterIds is None:
        dic_biterm__clusterIds = {}
    if dicTrain_pred__trues is None:
        dicTrain_pred__trues = {}

    print("test cluster_bigram")

    current_txt_id = last_txtId

    for item in testList_pred_true_words_index_postid:
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)

        current_txt_id += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic:
            text_Vec = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                                      embedDim)[0]

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        print(targetClusterIds)

        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        if (clusterId in dicTrain_pred__trues
                and testTrue in dicTrain_pred__trues[clusterId]):
            print('found found', 'clusterId', clusterId, 'testTrue', testTrue,
                  words, postId, 'len', len(dicTrain_pred__trues[clusterId]))
        else:
            print('not found', 'clusterId', clusterId, 'testTrue', testTrue,
                  words, postId)

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterIds
    ]
# Example no. 5 (separator left over from the code-example listing; originally "Esempio n. 5")
# 0
def _elapsed_microseconds(started_at):
    """Return the full time elapsed since *started_at*, in microseconds."""
    delta = datetime.now() - started_at
    # ``delta.microseconds`` alone is only the sub-second component.
    return (delta.days * 86400 + delta.seconds) * 1000000 + delta.microseconds


def test_cluster_bitermMapping_buffer(
        testList_pred_true_words_index_postid_createtime,
        c_bitermsFreqs=None,
        c_totalBiterms=None,
        c_wordsFreqs=None,
        c_totalWords=None,
        c_txtIds=None,
        c_clusterVecs=None,
        txtId_txt=None,
        last_txtId=0,
        max_c_id=0,
        wordVectorsDic=None,
        dic_clus__id=None,
        dic_biterm__clusterIds=None,
        c_textItems=None,
        dic_ngram__textItems=None,
        min_gram=1,
        max_gram=2,
        max_hitindex=10000):
    """Print, for each test text, the first training item with the same label.

    Candidate training items are those stored in ``c_textItems`` for the
    clusters returned by ``findTargetClusters``, followed by the items that
    ``aggregateTextItems`` returns for the text's n-grams (longest n-gram
    first). Candidates are scanned in order; the scan stops at the first one
    whose true label equals the test label, or once more than
    ``max_hitindex`` candidates have been inspected.

    One tab-separated row per test item is printed to stdout after a header
    row. A hit row contains: test post id, train post id, text similarity,
    hit rank, elapsed time (microseconds divided by the module-level
    ``microDivide``), test label, test words, train words, train creation
    date, test creation date, and the day difference (train - test). A miss
    row uses ``-100`` for the train post id and rank and leaves the last four
    columns empty.

    NOTE(review): the list returned by ``findTextItems`` is extended in
    place; if that helper returns a list owned by ``c_textItems`` this
    mutates the caller's state — confirm against its implementation.

    Args:
        testList_pred_true_words_index_postid_createtime: iterable of items
            with ``item[1]`` the true label, ``item[2]`` the list of words,
            ``item[4]`` the post id and ``item[5]`` a timestamp string whose
            part before the first ``"t"`` is a ``YYYY-MM-DD`` date
            (presumably a lower-cased ISO timestamp — confirm with caller).
        dic_biterm__clusterIds: biterm -> cluster ids index.
        c_textItems: mapping cluster id -> list of training items.
        dic_ngram__textItems: mapping n-gram -> list of training items.
        min_gram, max_gram: n-gram size bounds.
        max_hitindex: maximum number of candidates inspected per test item.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, max_c_id, wordVectorsDic,
        dic_clus__id: unused; kept for interface compatibility.

    Returns:
        None. All results are printed.
    """
    if dic_biterm__clusterIds is None:
        dic_biterm__clusterIds = {}
    if c_textItems is None:
        c_textItems = {}
    if dic_ngram__textItems is None:
        dic_ngram__textItems = {}

    # Header order matches the rows below: the train date is printed before
    # the test date.
    print("\t".join(("testpostId", "trainPostId", "simtype", "hitranktype",
                     "Proposed_hit_duration_micro", "Proposed_TestTrueLabel",
                     "testText", "trainText", "TrainCreateTime",
                     "testCreateTime", "DaysDiff")))

    for item in testList_pred_true_words_index_postid_createtime:
        t11 = datetime.now()
        testTrue = int(item[1])
        words = item[2]
        testpostId = item[4]
        testDateTime = datetime.strptime(item[5].split("t")[0], "%Y-%m-%d")

        # text -> biterms -> candidate clusters -> training items
        txtBitermsFreqs = Counter(construct_biterms(words))
        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        trainItems = findTextItems(targetClusterIds, c_textItems)

        # text -> n-grams (longest first) -> training items
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        sortedGrams = sorted(grams, key=len, reverse=True)
        trainItems.extend(aggregateTextItems(sortedGrams,
                                             dic_ngram__textItems))

        # Guard the rank normalisation against a text without n-grams.
        gram_count = max(1, len(sortedGrams))

        found = False
        for pathCount, trainItem in enumerate(trainItems, start=1):
            trainTrue = int(trainItem[1])
            train_words = trainItem[2]
            trainPostId = trainItem[4]

            if testTrue == trainTrue:
                ProposedHitRank_val = int(
                    max(1, math.floor(pathCount / gram_count)))

                # Stop the clock before the similarity computation.
                elapsed_us = _elapsed_microseconds(t11)

                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(words), Counter(train_words), len(words),
                    len(train_words))

                trainDateTime = datetime.strptime(
                    trainItem[5].split("t")[0], "%Y-%m-%d")
                date_diff = (trainDateTime - testDateTime).days

                print("\t".join((
                    str(testpostId), str(trainPostId), str(text_sim),
                    str(ProposedHitRank_val),
                    str(elapsed_us / float(microDivide)), str(testTrue),
                    ' '.join(words), ' '.join(train_words),
                    str(trainDateTime), str(testDateTime), str(date_diff))))
                found = True
                break

            if pathCount > max_hitindex:
                break

        if not found:
            elapsed_us = _elapsed_microseconds(t11)
            print("\t".join((
                str(testpostId), "-100", "0", str(-100),
                str(elapsed_us / float(microDivide)), str(testTrue),
                ' '.join(words), "", "", "", "")))
# Example no. 6 (separator left over from the code-example listing; originally "Esempio n. 6")
# 0
def test_cluster_bitermMapping(testList_pred_true_words_index_postid,
                               c_bitermsFreqs=None,
                               c_totalBiterms=None,
                               c_wordsFreqs=None,
                               c_totalWords=None,
                               c_txtIds=None,
                               c_clusterVecs=None,
                               txtId_txt=None,
                               last_txtId=0,
                               max_c_id=0,
                               wordVectorsDic=None,
                               dic_clus__id=None,
                               dic_biterm__clusterIds=None,
                               dic_word__clusterIds=None,
                               dicTrain_pred__trues=None):
    """Print one TSV row per test item describing its first label match.

    For every test item the words are turned into biterms, the biterms are
    mapped to candidate cluster ids (``findTargetClusters``), and the
    clusters are mapped to training text ids (``findTextIds``).  The
    candidates are then walked in the order returned until one carries the
    same true label as the test item, or until more than the module-level
    ``max_hitindex`` candidates have been inspected.  A "found" row or a
    "not found" row (train post id and hit rank set to ``-100``) is printed
    to stdout; nothing is returned.

    Args:
        testList_pred_true_words_index_postid: iterable of items read as
            ``item[0]`` predicted label (only echoed in a debug print),
            ``item[1]`` true label (converted with ``int``), ``item[2]``
            words and ``item[4]`` post id.
        c_txtIds: passed to ``findTextIds`` together with the candidate
            cluster ids.
        txtId_txt: maps a text id to a training item, read as
            ``trainItem[1]`` true label and ``trainItem[3]`` post id.
        wordVectorsDic: forwarded to ``generate_sent_vecs_toktextdata`` when
            the module-level ``isSemantic`` flag is set.
        dic_biterm__clusterIds: passed to ``findTargetClusters``.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_clusterVecs, last_txtId, max_c_id, dic_clus__id,
        dic_word__clusterIds, dicTrain_pred__trues: accepted for signature
            compatibility; not read by this function.

    Raises:
        KeyError: if a text id returned by ``findTextIds`` is missing from
            ``txtId_txt``.
    """
    # Use fresh containers per call instead of shared mutable defaults.
    c_txtIds = {} if c_txtIds is None else c_txtIds
    txtId_txt = {} if txtId_txt is None else txtId_txt
    wordVectorsDic = {} if wordVectorsDic is None else wordVectorsDic
    dic_biterm__clusterIds = ({} if dic_biterm__clusterIds is None else
                              dic_biterm__clusterIds)

    def elapsed_microseconds(started):
        """Return the whole duration since *started* in microseconds.

        ``timedelta.microseconds`` alone is only the sub-second component,
        so days and seconds are folded in explicitly.
        """
        delta = datetime.now() - started
        return ((delta.days * 86400 + delta.seconds) * 1000000 +
                delta.microseconds)

    print("\t".join([
        "testpostId", "trainPostId", "TitleSim", "BodySim", "TagSim",
        "LuceneHitRank", "ProposedHitRank", "lucene_hit_duration",
        "Proposed_hit_duration_micro", "Proposed_TestTrueLabel"
    ]))

    for item in testList_pred_true_words_index_postid:
        t11 = datetime.now()
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)
        print(words, bi_terms, pred)

        txtBitermsFreqs = Counter(bi_terms)

        # NOTE(review): text_Vec is never read later in this function.  The
        # computation is kept because generate_sent_vecs_toktextdata is
        # defined elsewhere and may have effects not visible here; confirm
        # and drop it if it is pure.
        text_Vec = [0] * embedDim
        if isSemantic:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        # text -> biterms -> target cluster ids -> text ids -> train items
        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        print('len(targetClusterIds)', len(targetClusterIds))
        textIds = findTextIds(targetClusterIds, c_txtIds)
        print('len(textIds)', len(textIds))

        pathCount = 0
        found = False
        for textId in textIds:
            trainItem = txtId_txt[textId]
            trainTrue = int(trainItem[1])
            trainPostId = trainItem[3]
            pathCount += 1

            if testTrue == trainTrue:
                print("\t".join([
                    str(postId), str(trainPostId), "0", "0", "0", "0",
                    str(len(targetClusterIds)), "0",
                    str(elapsed_microseconds(t11)), str(testTrue)
                ]))
                found = True
                break

            # Give up once more than max_hitindex candidates were inspected.
            if pathCount > max_hitindex:
                break

        if not found:
            print("\t".join([
                str(postId), "-100", "0", "0", "0", "0", "-100", "0",
                str(elapsed_microseconds(t11)), str(testTrue)
            ]))
# Example no. 7
# 0
def trainLoad_cluster_biterm(trainList_pred_true_text_postid,
                             c_bitermsFreqs=None,
                             c_totalBiterms=None,
                             c_wordsFreqs=None,
                             c_totalWords=None,
                             c_txtIds=None,
                             c_clusterVecs=None,
                             txtId_txt=None,
                             wordVectorsDic=None,
                             dic_clus__id=None,
                             dic_biterm__clusterIds=None,
                             dic_word__clusterIds=None):
    """Load already-clustered training items into the cluster structures.

    Each item's predicted label is used directly as its cluster id; the
    item's word and biterm counts are folded into the cluster features via
    ``populateClusterFeature``.  Every 1000 items the running
    ``[clusterId, true, text]`` list is passed to ``Evaluate`` and the
    elapsed time is printed.

    Args:
        trainList_pred_true_text_postid: iterable of items read as
            ``item[0]`` predicted cluster id, ``item[1]`` true label,
            ``item[2]`` space-separated text and ``item[3]`` post id (both
            ids are converted with ``int``).
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, dic_biterm__clusterIds,
        dic_word__clusterIds: cluster state handed to, and replaced by the
            result of, ``populateClusterFeature``.
        txtId_txt: updated in place with ``int(postId) -> item``.
        wordVectorsDic: forwarded to ``generate_sent_vecs_toktextdata`` when
            the module-level ``isSemantic`` flag is set.
        dic_clus__id: updated in place with ``clusterId -> clusterId``.

        Any dict argument left as ``None`` starts as a new empty dict, so
        separate calls no longer share state through the defaults.

    Returns:
        A list ``[c_bitermsFreqs, c_totalBiterms, c_wordsFreqs,
        c_totalWords, c_txtIds, c_clusterVecs, txtId_txt, dic_clus__id,
        dic_biterm__clusterIds, dic_word__clusterIds,
        dicTrain_pred__trues]`` where ``dicTrain_pred__trues`` maps each
        cluster id to the list of true labels seen for it.
    """
    # Replace the former shared ``{}`` defaults with per-call containers.
    c_bitermsFreqs = {} if c_bitermsFreqs is None else c_bitermsFreqs
    c_totalBiterms = {} if c_totalBiterms is None else c_totalBiterms
    c_wordsFreqs = {} if c_wordsFreqs is None else c_wordsFreqs
    c_totalWords = {} if c_totalWords is None else c_totalWords
    c_txtIds = {} if c_txtIds is None else c_txtIds
    c_clusterVecs = {} if c_clusterVecs is None else c_clusterVecs
    txtId_txt = {} if txtId_txt is None else txtId_txt
    wordVectorsDic = {} if wordVectorsDic is None else wordVectorsDic
    dic_clus__id = {} if dic_clus__id is None else dic_clus__id
    dic_biterm__clusterIds = ({} if dic_biterm__clusterIds is None else
                              dic_biterm__clusterIds)
    dic_word__clusterIds = ({} if dic_word__clusterIds is None else
                            dic_word__clusterIds)

    print("train cluster_bigram")

    dicTrain_pred__trues = {}
    eval_pred_true_txt = []

    t11 = datetime.now()

    for line_count, item in enumerate(trainList_pred_true_text_postid, 1):
        pred = item[0]  # predicted cluster id
        true = item[1]
        words = item[2].split(' ')
        postId = item[3]
        bi_terms = construct_biterms(words)

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        # Zero vector unless semantic embeddings are enabled.
        text_Vec = [0] * embedDim
        if isSemantic:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        clusterId = int(pred)
        dicTrain_pred__trues.setdefault(clusterId, []).append(int(true))

        dic_clus__id[clusterId] = clusterId
        current_txt_id = int(postId)

        txtId_txt[current_txt_id] = item

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterIds,
         dic_word__clusterIds) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterIds, dic_word__clusterIds)

        eval_pred_true_txt.append([clusterId, item[1], item[2]])

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

            t_diff = datetime.now() - t11
            # timedelta.seconds excludes whole days; fold them in so the
            # printed value really is the total.
            print("total time diff secs=",
                  t_diff.days * 86400 + t_diff.seconds)

    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, dic_clus__id, dic_biterm__clusterIds,
        dic_word__clusterIds, dicTrain_pred__trues
    ]
# Example no. 8
# 0
def test_cluster_bitermMapping_buffer_framework(
        list_CPost_test, c_CFVector, dic_txtId__CPost,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        max_hitindex, oCSimilarityFlgas, wordVectorsDic):
    """Write one TSV row per test post describing its first label match.

    For each test post, candidate training posts are gathered from two
    sources: posts sharing an n-gram with the test post
    (``aggregateTextItems_framework``, longest grams first) followed by
    posts from close clusters (``findCloseClustersIds_framework`` /
    ``findTextItems_framework``).  Candidates are walked in that order until
    one has the same true label or more than ``max_hitindex`` candidates
    have been inspected.  The resulting row is printed and also written to
    the file named by the module-level ``outfileName``; a miss is reported
    with train post id and hit rank ``-100``.

    Args:
        list_CPost_test: iterable of test posts; ``trueLabel``, ``tagWords``,
            ``titleWords``, ``bodyWords``, ``soPostId`` and ``createtime``
            are read from each.
        c_CFVector: passed to ``findCloseClustersIds_framework`` and
            ``findTextItems_framework``.
        dic_txtId__CPost: passed to ``findTextItems_framework`` and
            ``aggregateTextItems_framework``.
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds: per-field indexes passed to
            ``findTargetClusters``.
        dic_ngram__txtIds: passed to ``aggregateTextItems_framework``.
        min_gram, max_gram: passed to ``generateGramsConsucetive``.
        max_hitindex: candidate budget per test post.
        oCSimilarityFlgas: ``isTagSim`` / ``isTitleSim`` / ``isBodySim``
            select which fields contribute features.
        wordVectorsDic: forwarded to ``generate_sent_vecs_toktextdata`` when
            the module-level ``isSemantic`` flag is set.

    Returns:
        None.
    """
    header = "\t".join([
        "testpostId", "trainPostId", "similarity", "Proposed_hitrank",
        "Proposed_hit_duration_micro", "Proposed_TestTrueLabel", "testText",
        "trainText", "testCreateTime", "TrainCreateTime", "DaysDiff",
        "OriginalRank"
    ])

    def scaled_elapsed(started):
        """Return the time since *started*, in microseconds / microDivide.

        ``timedelta.microseconds`` alone is only the sub-second component,
        so days and seconds are folded in to avoid wrapping after 1 s.
        """
        delta = datetime.now() - started
        total_micro = ((delta.days * 86400 + delta.seconds) * 1000000 +
                       delta.microseconds)
        return total_micro / float(microDivide)

    def field_features(words, biterm_index, grams, cluster_ids):
        """Extend *grams* and *cluster_ids* in place for one text field.

        Returns ``(biterm Counter, number of biterms, vector or None)``.
        """
        biterms = construct_biterms(words)
        grams.extend(generateGramsConsucetive(words, min_gram, max_gram))
        freqs = Counter(biterms)
        cluster_ids.extend(findTargetClusters(freqs, biterm_index))
        vec = None
        if isSemantic:
            vec = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                                 embedDim)[0]
        return freqs, len(biterms), vec

    # The context manager closes the file even if a helper raises mid-run.
    with open(outfileName, 'w') as fileWrite:
        fileWrite.write(header + "\n")
        print(header)

        for oCPost in list_CPost_test:
            t11 = datetime.now()

            testTrue = oCPost.trueLabel
            tagWords = oCPost.tagWords
            titleWords = oCPost.titleWords
            bodyWords = oCPost.bodyWords
            testpostId = oCPost.soPostId
            testCreatetime = oCPost.createtime

            testWords = tagWords  # this can be changed

            # Fields whose similarity flag is off keep these placeholders.
            txtBitermsFreqs_Tag, bi_terms_len_Tag, text_VecTag = None, 0, None
            txtBitermsFreqs_Title, bi_terms_len_Title = None, 0
            text_VecTitle = None
            txtBitermsFreqs_Body, bi_terms_len_Body = None, 0
            text_VecBody = None
            targetClusterIds = []
            grams = []

            # text -> biterms -> target cluster ids (per enabled field)
            if oCSimilarityFlgas.isTagSim:
                (txtBitermsFreqs_Tag, bi_terms_len_Tag,
                 text_VecTag) = field_features(tagWords,
                                               dic_bitermTag__clusterIds,
                                               grams, targetClusterIds)

            if oCSimilarityFlgas.isTitleSim:
                (txtBitermsFreqs_Title, bi_terms_len_Title,
                 text_VecTitle) = field_features(titleWords,
                                                 dic_bitermTitle__clusterIds,
                                                 grams, targetClusterIds)

            if oCSimilarityFlgas.isBodySim:
                (txtBitermsFreqs_Body, bi_terms_len_Body,
                 text_VecBody) = field_features(bodyWords,
                                                dic_bitermBody__clusterIds,
                                                grams, targetClusterIds)

            oCPostProcessed = CPostProcessed(
                txtBitermsFreqs_Tag, bi_terms_len_Tag, txtBitermsFreqs_Title,
                bi_terms_len_Title, txtBitermsFreqs_Body, bi_terms_len_Body,
                text_VecTag, text_VecTitle, text_VecBody)

            targetClusterIds = set(targetClusterIds)
            closeClusterIds = findCloseClustersIds_framework(
                oCPostProcessed, targetClusterIds, c_CFVector,
                oCSimilarityFlgas)
            train_cluster_CPosts = findTextItems_framework(
                closeClusterIds, c_CFVector, dic_txtId__CPost)

            sortedGrams = sorted(grams, key=len, reverse=True)
            train_gram_CPosts = aggregateTextItems_framework(
                sortedGrams, dic_ngram__txtIds, dic_txtId__CPost)

            # n-gram candidates first, cluster candidates after them.
            train_gram_CPosts.extend(train_cluster_CPosts)

            # Guard the rank normalisation: cluster candidates can exist
            # even when no n-grams were generated.
            gram_count = max(1, len(sortedGrams))

            pathCount = 0
            flag = False
            for trainCPost in train_gram_CPosts:
                trainTrue = int(str(trainCPost.trueLabel))
                train_words = trainCPost.tagWords  # this can be changed
                trainPostId = trainCPost.soPostId
                trainCreateTime = trainCPost.createtime

                pathCount += 1

                if str(testTrue) == str(trainTrue):
                    ProposedHitRank_val = int(
                        max(1, math.floor(pathCount / gram_count)))

                    duration = scaled_elapsed(t11)
                    text_sim, commonCount = computeTextSimCommonWord_WordDic(
                        Counter(testWords), Counter(train_words),
                        len(testWords), len(train_words))

                    date_diff = (trainCreateTime - testCreatetime).days

                    # Column order matches the header written above.
                    row = "\t".join([
                        str(testpostId), str(trainPostId), str(text_sim),
                        str(ProposedHitRank_val), str(duration),
                        str(testTrue), ' '.join(testWords),
                        ' '.join(train_words), str(testCreatetime),
                        str(trainCreateTime), str(date_diff), str(pathCount)
                    ])
                    print(row)
                    fileWrite.write(row + "\n")

                    flag = True
                    break

                if pathCount > max_hitindex:
                    break

            if not flag:
                # Miss: -100 marks the absent train post id and hit rank;
                # the five trailing columns stay empty.
                row = "\t".join([
                    str(testpostId), "-100", "0", str(-100),
                    str(scaled_elapsed(t11)), str(testTrue),
                    ' '.join(testWords), "", "", "", "", ""
                ])
                print(row)
                fileWrite.write(row + "\n")