Example #1
    # NOTE: the snippet starts mid-function; the enclosing "def" line (which
    # sets up `lines` and `querylist`) is not shown on the page. The loop
    # parses candidate lines of the form "label||word1\tword2||count||url1\turl2".
    urlset = set()
    for line in lines:
        # print line
        elements = line.strip().split('||')
        if len(elements) != 4:
            continue
        label = int(elements[0])
        words = elements[1].split("\t")
        count = elements[2]
        if len(elements[3]) > 1:
            urls = elements[3].split("\t")
        else:
            urls = [elements[3].strip()]
        for url in urls:
            # the original compared url (a str) to the int 0, which never
            # matches; the placeholder string "0" is almost certainly meant
            if url == "0":
                continue
            urlset.add(url)
        # filter in one pass: calling words.remove() while iterating over
        # words skips elements, so rebuild the list instead
        words = [word for word in words if isGoodWord(word)]

        querylist.append((label, words, count, urls))

    return querylist, urlset


if __name__ == '__main__':
    doclist = getDocList(
        "/users/songwei/xuwenbin/subtopic/ntcir11/doc/0001.txt")
    iIndex = Index.BuildIndex(doclist)
    original_query, candidates, term_weight_sum, word2idWeight = getQueryListNoTopic(
        "/users/songwei/xuwenbin/subtopic/ntcir11/candidate/0001.txt",
        "/users/songwei/xuwenbin/subtopic/ntcir11/topic/0001.txt", iIndex)
    print len(candidates)
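
The parser above expects each line to pack four "||"-separated fields: a numeric label, tab-separated words, a count, and tab-separated URLs. A minimal, self-contained sketch of that format (the sample line and the isGoodWord stub are illustrative assumptions, not taken from the original project):

# Python 2 sketch of the line format parsed in Example #1.
def isGoodWord(word):
    # stand-in for the project's real word filter
    return len(word.strip()) > 1

line = "1||subtopic\tmining\ta||23||http://a.example\thttp://b.example"
elements = line.strip().split('||')
label = int(elements[0])                                       # 1
words = [w for w in elements[1].split("\t") if isGoodWord(w)]  # drops "a"
urls = elements[3].split("\t")
print label, words, urls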
Example #3
import os
import pickle
# Index, WordRepresentation, PhraseRepresentation, getDocList and
# getQueryListNoTopic are imported from the project's own modules (not shown).


def getCandidatesRepresentations(queryfilename, topicfilename, docfilename, filepath, read_or_not):
    """getCandidatesRepresentations

    Returns
    -------
    candidateReps: the query candidate representations
    cRepsForCluser: wordReps for the expanded words and the words in the candidates
    candembedding: similar words expanded from each candidate string
    word2idWeight: word weights and IDs

    """
    topic_ID = queryfilename.split("/")[-1]
    pickle_path = filepath + "subtopic_cand_pickle/"
    if not os.path.exists(pickle_path):
        os.mkdir(pickle_path)
    pickle_file = pickle_path + topic_ID
    if os.path.exists(pickle_file) and os.path.isfile(pickle_file):
        file_pick = open(pickle_file, 'rb')
        candidateReps = pickle.load(file_pick)
        cRepsForCluser = pickle.load(file_pick)
        candembedding = pickle.load(file_pick)
        file_pick.close()
        return candidateReps, cRepsForCluser, candembedding
    else:
        doclist = getDocList(docfilename)
        iIndex = Index.BuildIndex(doclist)
        # each candidate tuple looks like (label, wordlist, candstr, urlvec, ...)
        original_query, candidates, term_weight_sum, word2idWeight = getQueryListNoTopic(
            queryfilename, topicfilename, iIndex)
        print "this is the original candidates,", len(candidates)
        candidateReps = []
        cRepsForCluser = {}
        emwordlist = []
        emworddict = {}
        word2Embeddingdict = {}
        wid = 0
        eminstance = WordRepresentation("", word2idWeight, iIndex)
        wordkeys = []
        candembedding = {}
        
        for cand in candidates:
            words = cand[1]
            wordlist = []
            wordkeys.extend(words)
            candstr = cand[2]
            if candstr in candembedding:
                continue
            for word in words:
                if word.strip()=="":continue
                if word ==  original_query:continue
                if not emworddict.has_key(word):
                    emworddict[word] = wid
                    wid += 1
                wordlist.append(word)
                if word2Embeddingdict.has_key(word):
                    continue
                else:
                    wordrep =  WordRepresentation(word, word2idWeight, iIndex)
                    word2Embeddingdict[word] = wordrep.GetSimilarWordWeights()
            
            if len(wordlist) > 0:
                wordweight = eminstance.GetSimilarWordWeights(wordlist)
                if wordweight is None:
                    continue
                for term, m in wordweight:
                    emwordlist.append(term)
                    if term not in emworddict:
                        emworddict[term] = wid
                        wid += 1  # the original "wid +1" was a no-op; an increment is clearly intended
                candembedding[candstr] = wordweight
                
        for word in emworddict.keys():
            wordrep = WordRepresentation(word, word2idWeight, iIndex)
            if wordrep.GetEmbeddingVec() is not None and wordrep.GetCategoryVec() is not None:
                if word not in cRepsForCluser:
                    cRepsForCluser[word] = wordrep

        candstrs_filter = set()

        for cand in candidates:
            label = cand[0]
            words = cand[1]
            candstr = cand[2]
            if candstr in candstrs_filter:
                continue
            candstrs_filter.add(candstr)
            urlvec = cand[3]
            wordlist = []
            local_term_weight = []
            local_sum = 0.0
            wordreps = []
            docsum = 0.0
            for word in words:
                if word == original_query:
                    wordlist.append(word)
                    continue
                wordrep = WordRepresentation(word, word2idWeight, iIndex)
                wordrep.SetEmbeddingwordlist(word2Embeddingdict[word])
                wordreps.append(wordrep)
                wordlist.append(word)
                local_term_weight.append(word2idWeight[word][1])
                local_sum += word2idWeight[word][1]
                doccount = len(iIndex.GetDocListByKey(word))*1.0
                docsum += doccount
            if docsum == 0.0:
                continue
            for i in range(len(local_term_weight)):
                local_term_weight[i] = local_term_weight[i] / local_sum  # *(doccounts[i]/docsum)

            # average_weight is computed here but never used downstream
            average_weight = [1.0 / len(local_term_weight) for i in range(len(local_term_weight))]

            phraseRep = PhraseRepresentation(label, candstr, wordreps, wordlist, local_term_weight,
                                             original_query, urlvec, emworddict, iIndex, cand[4])
            vsmVec = phraseRep.GetVSM() #phraseRep.GetEmbeddingVec()
            cateVec = phraseRep.GetCategoryVec()
            embeddingVec = phraseRep.GetEmbeddingVec()
            iDocVec = phraseRep.GetInvertedDocVec()
            URLVec = phraseRep.GetURLVec()
#             ewVec = phraseRep.GetEmbeddingWordVec()
            if (vsmVec is not None and cateVec is not None and embeddingVec is not None
                    and iDocVec is not None and URLVec is not None):  # and ewVec is not None
                candidateReps.append(phraseRep)
            else:
                print "skipped candidate: a representation vector was None"  # original message: "candidate representation l126"
            
        print "prepared subtopic candidates:", len(candidateReps)
    
        output = open(pickle_file, 'wb')  # same path as computed above
        pickle.dump(candidateReps, output)
        pickle.dump(cRepsForCluser, output)
        pickle.dump(candembedding, output)
        output.close()

    return candidateReps, cRepsForCluser, candembedding  # word2idWeight  # similardict
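
The function above memoizes its three return values by dumping them sequentially into one pickle file and reading them back with three pickle.load calls, so the load order must mirror the dump order exactly. A minimal, self-contained sketch of that cache pattern (all names and payloads are illustrative placeholders, not the project's real data):

# Python 2 sketch of the dump/load-in-order pickle cache used above.
import os
import pickle

def cached_compute(pickle_file):
    if os.path.exists(pickle_file) and os.path.isfile(pickle_file):
        f = open(pickle_file, 'rb')
        reps = pickle.load(f)        # must mirror the dump order exactly:
        clusters = pickle.load(f)    # 1st dump -> 1st load, and so on
        embeddings = pickle.load(f)
        f.close()
        return reps, clusters, embeddings
    # placeholder for the expensive computation
    reps, clusters, embeddings = ["rep"], {"w": 1}, {"cand": [("w", 0.5)]}
    f = open(pickle_file, 'wb')
    pickle.dump(reps, f)
    pickle.dump(clusters, f)
    pickle.dump(embeddings, f)
    f.close()
    return reps, clusters, embeddings

print cached_compute("demo_cache.pickle")  # a second call reads from the cache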