class ExtendedBoolean:
    """Term-frequency index builder driven by a JSON request string.

    The constructor decodes the request and dispatches on its ``action``
    field: ``"build"`` indexes the documents found under ``path``; any other
    action is treated as a query carrying ``query`` and ``count`` fields.

    Attributes set by this class:
        index       -- list of ``{'key': term, 'value': {...}}`` entries
                       filled in by ``calculate_itf``.
        vocabulary  -- set of every term seen by ``calculate_tf``.
        terms       -- mapping of document name to its per-term data; raw
                       term sequences before ``calculate_tf``, normalised
                       frequencies afterwards.
        index_mod   -- ``IndexModule`` instance that receives the built index.
    """

    def __init__(self, json):
        """Decode the request and run the action it names.

        :param json: JSON-encoded string with an ``action`` key, plus
            ``path`` when the action is ``"build"`` or ``query`` and
            ``count`` for anything else.
        """
        self.index = []
        self.vocabulary = set()
        # Keep the decoded request under its own name instead of rebinding
        # the ``json`` parameter to a different type.
        request = JSONDecoder().decode(json)
        self.index_mod = IndexModule()

        if request['action'] == "build":
            self.build(request['path'])
        else:
            self.process_query(request['query'], request['count'])

    def build(self, path):
        """Compute the index for the documents under *path* and publish it.

        ``PathProcessor(path).process_files()`` is expected to return a
        mapping of document name to an iterable of terms, since that is how
        ``calculate_tf`` consumes it. The finished index is JSON-encoded and
        passed to ``self.index_mod.process_json``.
        """
        pp = PathProcessor(path)
        self.terms = pp.process_files()
        self.calculate_tf()
        self.calculate_itf()
        payload = JSONEncoder().encode({'action': 'build', 'data': self.index})
        self.index_mod.process_json(payload)

    def process_query(self, query, count):
        """Placeholder for query handling; currently does nothing."""
        pass

    def calculate_tf(self):
        """Replace each document's term sequence with normalised frequencies.

        For every document in ``self.terms`` the raw occurrence count of each
        term is divided by the count of the document's most frequent term, so
        the most frequent term gets 1.0. Every term is also added to
        ``self.vocabulary``. A document without terms maps to an empty dict.
        """
        from collections import Counter

        # Snapshot the items so the mapping can be updated while looping.
        for doc_name, terms in list(self.terms.items()):
            counts = Counter(terms)
            self.vocabulary.update(counts)
            # The maximum is taken over all counts, so a document whose terms
            # are all unique yields 1 rather than 0.
            max_frec = max(counts.values()) if counts else 0
            # Single-argument call form prints the same text on Python 2 and 3.
            print('max_frec %s' % max_frec)
            peak = float(max_frec)
            # Normalise once per distinct term, never once per occurrence.
            self.terms[doc_name] = {w: c / peak for w, c in counts.items()}

    def calculate_itf(self):
        """Append one index entry per vocabulary term to ``self.index``.

        Each entry has the form
        ``{'key': term, 'value': {'itf': ratio, 'documents': [...]}}`` where
        ``ratio`` is the total number of documents divided by the number of
        documents containing the term (no logarithm is applied), and
        ``documents`` lists ``{'tf': ..., 'document': name}`` for each
        containing document. Must run after ``calculate_tf``.
        """
        total_docs = len(self.terms)

        # One pass over the documents collects every term's postings, instead
        # of rescanning all documents for each vocabulary term.
        postings = {}
        for doc_name, doc_terms in self.terms.items():
            for w, tf in doc_terms.items():
                postings.setdefault(w, []).append({'tf': tf, 'document': doc_name})

        for w in self.vocabulary:
            documents = postings.get(w)
            if not documents:
                # A term found in no document has no defined ratio; skip it
                # rather than divide by zero.
                continue
            itf = total_docs / float(len(documents))
            self.index.append({'key': w, 'value': {'itf': itf,
                                                   'documents': documents}})
Example #3
0
	# print(max_page)
	return (max_page)

def crawling():
	"""Crawl news pages from the hard-coded Sohu subject root.

	Reads ``../config.ini`` as UTF-8, asks ``get_max_page`` for the page
	count of the subject's ``.shtml`` page, builds a news pool from
	``get_news_pool`` using that count and the count minus five, and passes
	the pool to ``crawl_news`` together with the ``doc_dir_path`` and
	``doc_encoding`` settings from the DEFAULT config section.
	"""
	started_at = datetime.today()
	print('-----start crawling time: %s-----' % started_at)

	settings = configparser.ConfigParser()
	settings.read('../config.ini', 'utf-8')

	subject_root = 'http://news.sohu.com/1/0903/61/subject212846158'
	last_page = get_max_page(subject_root + '.shtml')
	pool = get_news_pool(subject_root, last_page, last_page - 5)

	# Look the settings up only after the pool exists, as the original call did.
	defaults = settings['DEFAULT']
	target_dir = defaults['doc_dir_path']
	target_encoding = defaults['doc_encoding']
	crawl_news(pool, 140, target_dir, target_encoding)

if __name__ == "__main__":
	# Both modules below are constructed from the same config file and encoding.
	config_path, config_encoding = '../config.ini', 'utf-8'

	print('-----start time:%s-----' % datetime.today())

	# Crawl the news data (step currently disabled).
	# crawling()

	# Build the index.
	print('-----start indexing time: %s-----' % datetime.today())
	indexer = IndexModule(config_path, config_encoding)
	indexer.construct_postings_lists()

	# Recommended reading.
	print('-----start recommending time: %s-----' % datetime.today())
	recommender = RecommendationModule(config_path, config_encoding)
	recommender.find_k_nearest(5, 25)

	print('-----finish time: %s-----' % datetime.today())