class ExtendedBoolean:
    """Extended-boolean retrieval model.

    Decodes a JSON command and either builds a TF/ITF index from a
    directory of documents or processes a query against it.

    Attributes:
        index: list of ``{'key': term, 'value': {'itf': ..., 'documents': [...]}}``
            entries produced by :meth:`calculate_itf`.
        vocabulary: set of every distinct term seen across all documents.
        terms: mapping of document name -> token list (after ``build``,
            replaced by document name -> {term: normalized tf}).
        index_mod: downstream IndexModule that persists the built index.
    """

    def __init__(self, json):
        """Decode the JSON command string and dispatch on its 'action' field.

        Args:
            json: JSON string with an 'action' key; 'build' requires 'path',
                anything else requires 'query' and 'count'.
        """
        self.index = []
        self.vocabulary = set()
        request = JSONDecoder().decode(json)
        self.index_mod = IndexModule()
        if request['action'] == "build":
            self.build(request['path'])
        else:
            self.process_query(request['query'], request['count'])

    def build(self, path):
        """Tokenize every file under *path*, compute TF and ITF, and hand
        the resulting index to the index module as a JSON 'build' command.
        """
        pp = PathProcessor(path)
        self.terms = pp.process_files()
        self.calculate_tf()
        self.calculate_itf()
        self.index_mod.process_json(
            JSONEncoder().encode({'action': 'build', 'data': self.index}))

    def process_query(self, query, count):
        """Answer *query* returning up to *count* results (not implemented yet)."""
        pass

    def calculate_tf(self):
        """Replace each document's token list with {term: tf} where tf is the
        term count normalized by the document's maximum term frequency.

        Also accumulates every term into ``self.vocabulary``.
        """
        for doc_name, tokens in self.terms.items():
            counts = {}
            for w in tokens:
                self.vocabulary.add(w)
                counts[w] = counts.get(w, 0) + 1
            # Empty documents yield no counts, so no division takes place.
            max_frec = max(counts.values()) if counts else 0
            # Normalize each DISTINCT term exactly once.  (The original code
            # iterated the raw token list, dividing repeated terms multiple
            # times and producing wrong TF values.)
            for w in counts:
                counts[w] /= float(max_frec)
            print('max_frec', str(max_frec))
            self.terms[doc_name] = counts

    def calculate_itf(self):
        """Append one index entry per vocabulary term.

        itf = (total number of documents) / (number of documents containing
        the term); each entry lists the per-document normalized tf.
        Requires :meth:`calculate_tf` to have run first.
        """
        for w in self.vocabulary:
            # Every vocabulary term occurs in at least one document, so the
            # denominator below cannot be zero.
            docs_with_term = [name for name, doc_terms in self.terms.items()
                              if w in doc_terms]
            itf = len(self.terms) / float(len(docs_with_term))
            self.index.append({
                'key': w,
                'value': {
                    'itf': itf,
                    'documents': [{'tf': self.terms[d][w], 'document': d}
                                  for d in docs_with_term],
                },
            })
def __init__(self, json):
    """Decode the incoming JSON command and dispatch it.

    A 'build' action triggers index construction from the given path;
    any other action is treated as a query with a result count.
    """
    self.index = []
    self.vocabulary = set()
    request = JSONDecoder().decode(json)
    self.index_mod = IndexModule()
    if request['action'] == "build":
        self.build(request['path'])
    else:
        self.process_query(request['query'], request['count'])
# print(max_page) return (max_page) def crawling(): print('-----start crawling time: %s-----'%(datetime.today())) config = configparser.ConfigParser() config.read('../config.ini', 'utf-8') root = 'http://news.sohu.com/1/0903/61/subject212846158' max_page = get_max_page(root + '.shtml') news_pool = get_news_pool(root, max_page, max_page - 5) crawl_news(news_pool, 140, config['DEFAULT']['doc_dir_path'], config['DEFAULT']['doc_encoding']) if __name__ == "__main__": print('-----start time:%s-----'%(datetime.today())) # 抓取新闻数据 # crawling() # 构建索引 print('-----start indexing time: %s-----'%(datetime.today())) im = IndexModule('../config.ini', 'utf-8') im.construct_postings_lists() # 推荐阅读 print('-----start recommending time: %s-----'%(datetime.today())) rm = RecommendationModule('../config.ini', 'utf-8') rm.find_k_nearest(5, 25) print('-----finish time: %s-----'%(datetime.today()))