Example #1
def read_subtopic_from_files(basepath, topicID):
    """
    Read the subtopics of the original query from basepath/subtopic/topicID.
    Each line has the form:
        label||word/pos<TAB>word/pos...||frequency||url<TAB>url...
    ------------
    return:
    ------------
    subtopic_list: a list of (subtopicID, label, terms, frequency, urls) tuples
    topic_word_set: the set of words in the original topic (query)
    query_probability_dict: the term-probability vector of the original query
    """
    subtopic_path = basepath + "subtopic/"
    topic_path = basepath + "topic/"
    with open(subtopic_path + topicID) as f:
        lines = f.readlines()
    query_probability_dict = get_orinal_query_vec(basepath, topicID)
    subtopic_list = []
    urlset = set()  # collected for bookkeeping; not returned by this function

    topic_word_set = get_topic_words(topic_path + topicID)
    for line in lines:
        elements = line.strip().split('||')
        if len(elements) != 4:
            continue
        label = int(elements[0])
        words = elements[1].split("\t")
        subtopic_frequency = int(elements[2])
        if len(elements[3]) > 1:
            urls = elements[3].split("\t")
        else:
            urls = [elements[3].strip()]
        # collect the URLs attached to this subtopic; "0" is assumed to mark a
        # missing URL (the original compared against the int 0, which never
        # matches a string)
        for url in urls:
            if url == "0":
                continue
            urlset.add(url)
        # keep terms that pass the word filter and are not topic (query) words
        terms = []
        subtopicID = "".join([word.split("/")[0] for word in words])
        for word in words:
            if (not isGoodWord(word)) or word in topic_word_set:
                continue
            term, pos = word.split("/")
            terms.append(term)

        if len(terms) == 0:
            continue

        subtopic_list.append(
            (subtopicID, label, terms, subtopic_frequency, urls))

    return subtopic_list, topic_word_set, query_probability_dict
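A minimal usage sketch, assuming a hypothetical basepath "./data/" and topic ID "0001", a subtopic file whose lines follow the label||word/pos<TAB>word/pos||frequency||url format parsed above, and that the project helpers get_orinal_query_vec, get_topic_words, and isGoodWord are importable:

subtopics, topic_words, query_vec = read_subtopic_from_files("./data/", "0001")
for subtopic_id, label, terms, frequency, urls in subtopics:
    # each entry is (subtopicID, label, terms, frequency, urls)
    print(subtopic_id, label, terms, frequency, len(urls))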
Example #5
import os
import pickle


def subtopic_expansion(basepath, topicID, expansion_method, mining_method):
    """
    Provide prepared (and optionally expanded) subtopic information for a
    topic, caching the result as a pickle under basepath/pickle_expansion/.
    """
    print(topicID, "subtopic expansion", expansion_method, mining_method)
    pickle_dir = (basepath + "/pickle_expansion/" + topicID + "-" +
                  mining_method + "/")
    os.makedirs(pickle_dir, exist_ok=True)
    pickle_file = pickle_dir + expansion_method
    if os.path.isfile(pickle_file):
        # Reuse a previously computed expansion.
        with open(pickle_file, "rb") as infile:
            subtopic_list = pickle.load(infile)
            word_details = pickle.load(infile)
            topic_words = pickle.load(infile)
            query_dict = pickle.load(infile)
    else:
        if mining_method == "1":
            subtopic_candidates, topic_words, query_dict = read_subtopic_from_files(
                basepath, topicID)
        else:
            subtopic_candidates, topic_words, query_dict = read_subtopic_from_mining(
                basepath, topicID)
        word_embedding = WordEmbeddingFramework()
        subtopic_list = []  # (subtopicID, label, terms, subtopic_frequency, urls)
        word_details = {}  # {word: [embedding vector, frequency]}
        for subtopic_str, label, terms, subtopic_frequency, urls in subtopic_candidates:
            term_for_expansion = []
            subtopic_words = []
            if expansion_method.startswith("E"):
                # E2, E4 and E6 also require the topic (query) words for expansion
                if expansion_method in ("E2", "E4", "E6"):
                    for word in topic_words:
                        vec = word_embedding.get_embedding_vector(word)
                        if vec is not None:
                            term_for_expansion.append(word)
                for word in terms:
                    vec = word_embedding.get_embedding_vector(word)
                    if vec is None:
                        continue
                    if word not in word_details:
                        word_details[word] = [vec, subtopic_frequency]
                    term_for_expansion.append(word)
                    subtopic_words.append(word)

                if len(term_for_expansion) == 0:
                    print(subtopic_str)
                    continue
                sim_word_weight = word_embedding.get_expansion_words(
                    term_for_expansion)
                if sim_word_weight is None:
                    continue
                i = 0
                # E1 and E2 keep only the expansion words as the subtopic words
                if expansion_method in ("E1", "E2"):
                    subtopic_words = []
                for s_word, weight in sim_word_weight:
                    if not isGoodWord(s_word + "/n"):
                        continue
                    i += 1
                    if weight < 0.6:
                        continue
                    if s_word in word_details:
                        word_details[s_word][1] += subtopic_frequency
                    else:
                        vec = word_embedding.get_embedding_vector(s_word)
                        if vec is None:
                            continue
                        word_details[s_word] = [vec, subtopic_frequency]
                    # E5 and E6 add the expansion words to the original words
                    if expansion_method in ("E5", "E6"):
                        subtopic_words.append(s_word)
                        if i > 3:
                            break
                    if expansion_method in ("E1", "E2"):
                        subtopic_words.append(s_word)
                        if i > 5:
                            break
            else:
                # no expansion: keep only the original terms that have vectors
                for word in terms:
                    vec = word_embedding.get_embedding_vector(word)
                    if vec is None:
                        continue
                    if word not in word_details:
                        word_details[word] = [vec, subtopic_frequency]
                    subtopic_words.append(word)

            subtopic_list.append([subtopic_str, label, subtopic_words,
                                  subtopic_frequency, urls])
        # cache the result for subsequent runs
        with open(pickle_file, "wb") as outfile:
            pickle.dump(subtopic_list, outfile)
            pickle.dump(word_details, outfile)
            pickle.dump(topic_words, outfile)
            pickle.dump(query_dict, outfile)

    return subtopic_list, word_details, topic_words, query_dict
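A usage sketch with hypothetical arguments. A first call with mining_method "1" reads the subtopics from files, runs the chosen expansion, and caches everything under basepath/pickle_expansion/; an identical second call is served from the pickle cache:

subtopic_list, word_details, topic_words, query_dict = subtopic_expansion(
    "./data/", "0001", "E2", "1")
for word, (vector, frequency) in word_details.items():
    # word_details maps each word to [embedding vector, frequency]
    print(word, frequency)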
Example #6
import os


def get_subtopic_candidate(basepath, topicID):
    """
    Read the subtopics of the original query and build the term statistics
    needed by the later processing steps.
    ------------
    return:
    ------------
    subtopic_list: a list of (subtopicID, label, terms, frequency, urls) tuples
    urlset: the set of URLs attached to any subtopic
    word2id_weight: {term: [word id, total frequency,
                            [[subtopicID, frequency], ...]]}
    subtopic_length_average: average subtopic length, weighted by frequency
    subtopic_term_number_all: total term occurrences over all subtopics
    topic_word_set: the set of words in the original topic (query)
    query_probability_dict: the term-probability vector of the original query
    subtopic_number: total subtopic frequency
    """
    subtopic_path = basepath + "subtopic/"
    topic_path = basepath + "topic/"
    with open(subtopic_path + topicID) as f:
        lines = f.readlines()
    query_probability_dict = get_orinal_query_vec(basepath, topicID)
    subtopic_term_number_all = 0.0
    subtopic_number = 0
    subtopic_length_sum = 0.0
    word2id_weight = {}
    subtopic_list = []
    urlset = set()
    wid = 0

    topic_word_set = get_topic_words(topic_path + topicID)
    for line in lines:
        elements = line.strip().split('||')
        if len(elements) != 4:
            continue
        label = int(elements[0])
        words = elements[1].split("\t")
        subtopic_frequency = int(elements[2])
        if len(elements[3]) > 1:
            urls = elements[3].split("\t")
        else:
            urls = [elements[3].strip()]
        # collect the URLs attached to this subtopic; "0" is assumed to mark
        # a missing URL (the original compared against the int 0)
        for url in urls:
            if url == "0":
                continue
            urlset.add(url)
        terms = []
        subtopicID = "".join([word.split("/")[0] for word in words])

        for word in words:
            if (not isGoodWord(word)) or word in topic_word_set:
                continue
            term, pos = word.split("/")
            terms.append(term)
            subtopic_term_number_all += subtopic_frequency
            if term in word2id_weight:
                word2id_weight[term][1] += subtopic_frequency
                word2id_weight[term][2].append(
                    [subtopicID, subtopic_frequency])
            else:
                word2id_weight[term] = [
                    wid, subtopic_frequency,
                    [[subtopicID, subtopic_frequency]]
                ]
                wid += 1
        if len(terms) == 0:
            continue
        subtopic_length_sum += subtopic_frequency * len(terms)
        subtopic_number += subtopic_frequency

        subtopic_list.append(
            (subtopicID, label, terms, subtopic_frequency, urls))
    # guard against an empty subtopic file
    subtopic_length_average = (subtopic_length_sum / subtopic_number
                               if subtopic_number else 0.0)

    return subtopic_list, urlset, word2id_weight, subtopic_length_average, \
        subtopic_term_number_all, topic_word_set, query_probability_dict, \
        subtopic_number
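A usage sketch with hypothetical arguments, unpacking the eight return values. word2id_weight maps each term to [word id, total frequency, [[subtopicID, frequency], ...]]:

(subtopics, urlset, word2id_weight, avg_len, term_total,
 topic_words, query_vec, n_subtopics) = get_subtopic_candidate("./data/", "0001")
for term, (wid, frequency, appearances) in word2id_weight.items():
    print(term, wid, frequency, len(appearances))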
def doc_preprocessing(basepath, topicID, word2id_weight, topic_words):
    """
    Read the segmented documents of a topic and build per-document term
    vectors, restricted to the terms that appear in word2id_weight.
    ------------
    return:
    ------------
    document_list: a list of Document objects
    document_term_frequency: {term: [collection frequency, [doc IDs]]}
    average_document_len: average number of good words per document
    topic_word_count: total occurrences of topic (query) words
    """
    document_path = basepath + "documents_seg/" + topicID + "/"
    documents = os.listdir(document_path)
    doc_subtopic_path = basepath + "doc_subtopic_relation/"

    # per-term statistics over the whole collection: the term frequency plus
    # the IDs of the documents the term appears in
    document_term_frequency = {}
    document_list = []

    sum_document_len = 0.0
    topic_word_count = 0.0
    doc_sid = get_document_relate_subtopicID(doc_subtopic_path, topicID)
    for document in documents:
        with open(document_path + document) as o:
            lines = o.readlines()

        terms = []
        terms_pos = []
        for line in lines:
            line = line.replace("\n", "")
            words = line.split("\t")

            for word in words:
                if word in topic_words:
                    topic_word_count += 1
                if isGoodWord(word):
                    sum_document_len += 1
                else:
                    continue
                term, pos = word.split("/")
                if term in document_term_frequency:
                    document_term_frequency[term][0] += 1
                    if document not in document_term_frequency[term][1]:
                        document_term_frequency[term][1].append(document)
                else:
                    document_term_frequency[term] = [1, [document]]
                # keep only terms that belong to the subtopic vocabulary
                if term in word2id_weight:
                    terms.append(term)
                    terms_pos.append(pos)

        doc = Document()
        doc.set_doc_str(" ".join(terms))
        doc.set_id(document)
        doc.set_term_vec(terms)
        doc.set_pos_vec(terms_pos)
        doc.set_true_rank("1")  # true-rank lookup is disabled; use a constant
        doc.set_related_sid(doc_sid[document])
        document_list.append(doc)

    average_document_len = sum_document_len / len(document_list)

    return document_list, document_term_frequency, average_document_len, topic_word_count
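A sketch of how the two functions above chain together, assuming the hypothetical layout basepath/documents_seg/topicID/ with one segmented file per document, and a doc_subtopic_relation/ file that covers every document ID:

(subtopics, urlset, word2id_weight, avg_len, term_total,
 topic_words, query_vec, n_subtopics) = get_subtopic_candidate("./data/", "0001")
docs, term_freq, avg_doc_len, topic_word_count = doc_preprocessing(
    "./data/", "0001", word2id_weight, topic_words)
print(len(docs), avg_doc_len, topic_word_count)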