Example 1
 def info(self, info='', col_bits=5, pagenum=100):
     keywords = PreDeal.seg(info)
     # 1. Extract keywords
     keys = jieba.analyse.textrank(info,
                                   topK=10,
                                   withWeight=False,
                                   allowPOS=('ns', 'n', 'vn', 'v'))
     # 2. Use a search engine to crawl related web pages
     # 2.1 Crawl the result links
     spider_link = SpiderLink(keys, self.root)
     spider_link.crawl(pagenum)
     # 2.2 Crawl the page content
     filename = '_'.join(keys) + '.html'
     spider_to = SpiderTo(filename)
     spider_to.crawl()
     # 3. Preprocess the text: deduplicate, remove stop words, segment, and keep the url and keyword set
     p = PreDeal()
     filepath = os.path.join(config.spidertext, '_'.join(keys))
     propath = os.path.join(config.prepapath, '_'.join(keys))
     p.savetexts(filepath=filepath, prepath=propath)
     # 4. Build the index and retrieve web pages that contain the keyword information
     # 4.1 Build the index
     indexpath = os.path.join(config.indexpath, '_'.join(keys))
     Index.build(datapath=propath, indexpath=indexpath)
     search = Search(keys=keys, pindexp=indexpath)
     # 4.2 Search and save the results
     search.retrieve(keywords=keywords)
     # 5. Pick the best pages, describe their location information, and encode it
     info_kws = keywords[:]
     loc = Location(keywords=info_kws, col_bits=col_bits)
     name = '_'.join(keys)
     res_list = loc.describe(name)
     return res_list
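
The keyword-extraction step in both examples uses jieba's built-in TextRank extractor. A minimal standalone sketch of that call, with an illustrative sample sentence that is not part of the original project:

 import jieba.analyse

 text = '自然语言处理是人工智能领域的一个重要方向'
 # Top-10 keywords, limited to place names, nouns, verbal nouns, and verbs.
 keys = jieba.analyse.textrank(text,
                               topK=10,
                               withWeight=False,
                               allowPOS=('ns', 'n', 'vn', 'v'))
 print(keys)
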
Example 2
 def info(self, fi='', pagenum=100):
     info = FileUtil.readfile(fi)
     keywords = PreDeal.seg(info)
     # 1. Extract keywords
     keys = jieba.analyse.textrank(info,
                                   topK=10,
                                   withWeight=False,
                                   allowPOS=('ns', 'n', 'vn', 'v'))
     # 2. Use a search engine to crawl related web pages
     # 2.1 Crawl the result links
     spider_link = SpiderLink(keys, self.root)
     spider_link.crawl(pagenum)
     # 2.2 Crawl the page content
     filename = '_'.join(keys) + '.html'
     spider_to = SpiderTo(filename)
     spider_to.crawl()
     # 3. Preprocess the text: deduplicate, remove stop words, segment, and keep the url and keyword set
     p = PreDeal()
     filepath = os.path.join(config.spidertext, '_'.join(keys))
     prepath = os.path.join(config.prepapath, '_'.join(keys))
     p.savetexts(filepath=filepath, prepath=prepath)
     # 4. Build the index and retrieve web pages that contain the keyword information
     # 4.1 Build the index
     indexpath = os.path.join(config.indexpath, '_'.join(keys))
     idx = Index()
     idx.build(datapath=prepath, indexpath=indexpath)
     search = Search1(filename=fi, pindexp=indexpath)
     # 4.2 Search and save the results
     info_k = keywords[:]
     num = search.retrieve(keywords=info_k)
     return keywords, num
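
Neither example shows the enclosing class or its constructor, so a call site can only be sketched. In the hypothetical snippet below, the class name Analyzer and the root search URL are assumptions, not part of the original code:

 # Hypothetical usage; Analyzer and its root argument are assumed,
 # since the examples only show the info() method itself.
 analyzer = Analyzer(root='https://www.baidu.com/s?wd=')
 keywords, num = analyzer.info(fi='query.txt', pagenum=50)
 print(keywords, num)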