def read_all_id_list():
    """Read all post IDs from data/post_id_list.txt, one ID per line.

    Returns:
        list: stripped post-id strings, in file order.
    """
    path_of_id_list = get_base_path() + '/data/post_id_list.txt'
    # 'with' guarantees the handle is closed (the original never closed it)
    with open(path_of_id_list) as f:
        return [line.strip() for line in f]
def read_query_for_testing():
    """Parse data/query.txt, which stores each record as a 3-line group:
    line 0: query id, line 1: query text, line 2: space-separated
    relevant post ids.

    Returns:
        list: one [query_id, query, relevant_id_list] entry per record.
    """
    path_of_query = get_base_path() + '/data/query.txt'
    query_list = []
    query_id = -1
    query = ''
    # 'with' guarantees the handle is closed (the original never closed it);
    # enumerate replaces the hand-maintained counter.
    with open(path_of_query) as f:
        for count, line in enumerate(f):
            line = line.strip()
            if count % 3 == 0:
                query_id = line
            elif count % 3 == 1:
                query = line
            else:
                relevant_id_list = line.split(' ')
                query_list.append([query_id, query, relevant_id_list])
    return query_list
def load_Step1_result(approach_name, topnum):
    """Load the ranked results produced by a Step-1 baseline approach.

    Each file _1_Result/Baseline_<approach_name>/<i>.txt (i in 0..99)
    holds the query on line 0, followed by repeating 3-line groups:
    post id (linenum % 3 == 1), post title (== 2), similarity (== 0).

    Args:
        approach_name: baseline name; selects the result directory.
        topnum: keep at most this many [id, title, sim] triples per query.

    Returns:
        list: one [query, top_dq] pair per result file, where top_dq is a
        list of [id, title, sim] with sim as float.
    """
    dir_of_result = get_base_path() + '/_1_Result/Baseline_' + approach_name
    result = []
    for i in range(100):
        path_of_result = dir_of_result + '/' + str(i) + '.txt'
        # 'with' closes the handle even on the early 'break' below
        # (the original leaked one handle per file).
        with open(path_of_result) as f:
            linenum = 0
            top_dq = []
            query = ''
            post_id = ''  # renamed from 'id', which shadowed the builtin
            title = ''
            for line in f:
                line = line.strip()
                if linenum == 0:
                    query = line
                elif linenum % 3 == 1:
                    post_id = line
                elif linenum % 3 == 2:
                    title = line
                else:
                    sim = float(line)
                    top_dq.append([post_id, title, sim])
                    if len(top_dq) >= topnum:
                        break
                linenum += 1
        result.append([query, top_dq])
    return result
from utils.file_util import write_file
from pathConfig import get_base_path

path_of_voc = get_base_path() + '/_2_sentence_selection/Entropy/idf_voc.txt'


def read_voc():
    """Load the IDF vocabulary file into a dict.

    Each line of idf_voc.txt is '<word> <idf>'.

    Returns:
        dict: word -> idf (float).
    """
    voc = {}
    # 'with' guarantees the handle is closed (the original never closed it)
    with open(path_of_voc) as f:
        for line in f:
            word_idf = line.split(' ')
            voc[word_idf[0]] = float(word_idf[1].strip())
    return voc


if __name__ == '__main__':
    # Round-trip the vocabulary file: read it, re-serialize, write back.
    voc = read_voc()
    # join() instead of quadratic '+=' string building; items() avoids a
    # second dict lookup per key. (Removed the unused 'reponum' local.)
    lines = [key + ' ' + str(idf) for key, idf in voc.items()]
    write_file(path_of_voc, '\n'.join(lines))
    print('Done.')
# -*- coding: UTF-8 -*-
from pathConfig import get_base_path
from nltk import word_tokenize

path_of_stopwords_EN = get_base_path() + '/utils/StopWords_EN.txt'


def read_EN_stopwords():
    """Load the English stopword list (one word per line) into a set.

    Returns:
        set: stripped stopword strings.
    """
    # 'with' guarantees the handle is closed (the original never closed it)
    with open(path_of_stopwords_EN) as f:
        return set(line.strip() for line in f)


def remove_stopwords(sent, sw):
    """Drop stopwords and empty tokens from a sentence.

    Args:
        sent: the sentence — either a raw string (tokenized here with
            nltk's word_tokenize) or an already-tokenized list of words.
        sw: container of stopwords to filter out (a set for O(1) tests).

    Returns:
        list: the tokens of sent that are non-empty and not in sw.

    Raises:
        Exception: if sent is neither a string nor a list.
    """
    # isinstance instead of 'type(...) is ...': also accepts subclasses
    if isinstance(sent, str):
        wlist = word_tokenize(sent)
    elif isinstance(sent, list):
        wlist = sent
    else:
        raise Exception("Wrong type for removing stopwords!")
    return [w for w in wlist if w != '' and w not in sw]