from pathConfig import get_base_path


def read_all_id_list():
    # read one post id per line from post_id_list.txt
    path_of_id_list = get_base_path() + '/data/post_id_list.txt'
    id_list = []
    with open(path_of_id_list) as file:
        for line in file:
            id_list.append(line.strip())
    return id_list


def read_query_for_testing():
    # query.txt stores each query as three consecutive lines:
    # query id, query text, and a space-separated list of relevant post ids
    path_of_query = get_base_path() + '/data/query.txt'
    query_list = []
    query_id = -1
    query = ''
    with open(path_of_query) as file:
        for count, line in enumerate(file):
            line = line.strip()
            if count % 3 == 0:
                query_id = line
            elif count % 3 == 1:
                query = line
            else:
                relevant_id_list = line.split(' ')
                query_list.append([query_id, query, relevant_id_list])
    return query_list
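For context, a minimal usage sketch of the two readers above; it assumes the data files exist under get_base_path() + '/data' in the formats parsed by these functions:

# Hypothetical usage; file names and layout are taken from the functions above.
id_list = read_all_id_list()
queries = read_query_for_testing()
for query_id, query, relevant_id_list in queries:
    print(query_id, query, len(relevant_id_list))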
Example #3
from pathConfig import get_base_path


def load_Step1_result(approach_name, topnum):
    dir_of_result = get_base_path() + '/_1_Result/Baseline_' + approach_name
    result = []
    # one result file per query: the first line is the query text,
    # followed by repeating groups of (post id, title, similarity score)
    for i in range(100):
        path_of_result = dir_of_result + '/' + str(i) + '.txt'
        top_dq = []
        with open(path_of_result) as file:
            for linenum, line in enumerate(file):
                line = line.strip()
                if linenum == 0:
                    query = line
                elif linenum % 3 == 1:
                    post_id = line
                elif linenum % 3 == 2:
                    title = line
                else:
                    sim = float(line)
                    top_dq.append([post_id, title, sim])
                    if len(top_dq) >= topnum:
                        break
        result.append([query, top_dq])
    return result
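A hedged usage sketch for the loader above; 'Lucene' is an assumed approach name, standing in for any Baseline_<approach> directory containing 100 result files:

# Hypothetical call; the approach name is illustrative only.
result = load_Step1_result('Lucene', 10)
for query, top_dq in result:
    print(query)
    for post_id, title, sim in top_dq:
        print('  %s  %s  %.4f' % (post_id, title, sim))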
Example #4
from utils.file_util import write_file
from pathConfig import get_base_path

path_of_voc = get_base_path() + '/_2_sentence_selection/Entropy/idf_voc.txt'


def read_voc():
    # each line of idf_voc.txt holds a word and its idf value,
    # separated by three spaces
    voc = {}
    with open(path_of_voc) as file:
        for line in file:
            word_idf = line.split('   ')
            word = word_idf[0]
            idf = float(word_idf[1].strip())
            voc[word] = idf
    return voc


if __name__ == '__main__':
    # rewrite the vocabulary file in the same word<3 spaces>idf format
    voc = read_voc()
    voc_str = ''
    for word, idf in voc.items():
        voc_str += word + '   ' + str(idf) + '\n'
    write_file(path_of_voc, voc_str.strip())
    print('Done.')
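For reference, a minimal sketch of the three-space-delimited format read_voc() expects; the words and idf values shown are invented:

# Illustrative idf_voc.txt contents (word, three spaces, idf):
# java   2.3026
# thread   4.6052
voc = read_voc()
print(voc.get('java'))  # -> 2.3026 with the sample line above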
# -*- coding: UTF-8 -*-
from pathConfig import get_base_path
from nltk import word_tokenize

path_of_stopwords_EN = get_base_path() + '/utils/StopWords_EN.txt'


def read_EN_stopwords():
    # load the English stopword list, one word per line
    sw_set = set()
    with open(path_of_stopwords_EN) as f:
        for line in f:
            sw_set.add(line.strip())
    return sw_set


def remove_stopwords(sent, sw):
    # accept either a raw sentence (tokenized here) or a pre-tokenized word list
    if isinstance(sent, str):
        wlist = word_tokenize(sent)
    elif isinstance(sent, list):
        wlist = sent
    else:
        raise TypeError("Wrong type for removing stopwords!")
    sent_words = []
    for w in wlist:
        if w == '':
            continue
        if w not in sw:
            sent_words.append(w)
    return sent_words
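A small usage sketch, assuming NLTK's punkt tokenizer data has been downloaded (nltk.download('punkt')); the sentence is made up:

sw = read_EN_stopwords()
print(remove_stopwords('How do I sort a list in Python?', sw))
# a pre-tokenized word list is accepted as well
print(remove_stopwords(['how', 'to', 'sort', 'a', 'list'], sw))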