def all_CR(self, infopath, extpath):
    CR = []
    file_list = os.listdir(infopath)
    for name in file_list:
        origin_info = FileUtil.readfile(filename=os.path.join(infopath, name))
        ext_info = FileUtil.readfile(filename=os.path.join(extpath, name))
        cr = self.CR(origin_info, ext_info)
        CR.append(cr)
    return CR
def info(self, fi='', pagenum=100):
    info = FileUtil.readfile(fi)
    keywords = PreDeal.seg(info)
    # 1. Keyword extraction
    keys = jieba.analyse.textrank(info, topK=10, withWeight=False,
                                  allowPOS=('ns', 'n', 'vn', 'v'))
    # 2. Crawl related web pages via the search engine
    # 2.1 Crawl the result links
    spider_link = SpiderLink(keys, self.root)
    spider_link.crawl(pagenum)
    # 2.2 Crawl the page contents
    filename = '_'.join(keys) + '.html'
    spider_to = SpiderTo(filename)
    spider_to.crawl()
    # 3. Text preprocessing: deduplicate, remove stop words, segment,
    #    and keep the url together with the keyword set
    p = PreDeal()
    filepath = os.path.join(config.spidertext, '_'.join(keys))
    prepath = os.path.join(config.prepapath, '_'.join(keys))
    p.savetexts(filepath=filepath, prepath=prepath)
    # 4. Build the index, then retrieve the pages that contain the keywords
    # 4.1 Build the index
    indexpath = os.path.join(config.indexpath, '_'.join(keys))
    idx = Index()
    idx.build(datapath=prepath, indexpath=indexpath)
    search = Search1(filename=fi, pindexp=indexpath)
    # 4.2 Search and save the results
    info_k = keywords[:]
    num = search.retrieve(keywords=info_k)
    return keywords, num
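# Standalone illustration of step 1 above (keyword extraction with jieba's
# TextRank). The sample sentence is made up for illustration; the call itself
# is the same jieba.analyse API used by info().
import jieba.analyse

sample = '自然语言处理是人工智能领域中的一个重要方向。'
keys = jieba.analyse.textrank(sample, topK=10, withWeight=False,
                              allowPOS=('ns', 'n', 'vn', 'v'))
print(keys)  # e.g. ['自然语言', '人工智能', ...] (actual output depends on the text)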
def get_url_titles(self):
    parse_list = []
    html_str = FileUtil.readfile(self.filename)
    linktr = etree.HTML(text=html_str).xpath('//tr')
    for item in linktr:
        url = item.xpath('string(./td[1])')
        title = item.xpath('string(./td[2])')
        parse_list.append({'url': url, 'title': title})
    return parse_list
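# For reference: get_url_titles() expects the crawled result file to contain an
# HTML table with the url in the first <td> and the title in the second. A
# minimal, self-contained sketch of the same XPath extraction (the sample HTML
# below is made up for illustration):
from lxml import etree

sample_html = '''
<table>
  <tr><td>http://example.com/a</td><td>Page A</td></tr>
  <tr><td>http://example.com/b</td><td>Page B</td></tr>
</table>
'''
rows = etree.HTML(sample_html).xpath('//tr')
parsed = [{'url': row.xpath('string(./td[1])'),
           'title': row.xpath('string(./td[2])')} for row in rows]
print(parsed)  # [{'url': 'http://example.com/a', 'title': 'Page A'}, ...]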
def build(self, datapath, indexpath):
    self.logger.info('Creating the full-text index')
    schema = Schema(title=TEXT(stored=True),
                    path=TEXT(stored=True),
                    content=TEXT(analyzer=SpaceSeparatedTokenizer()))
    if not os.path.exists(indexpath):  # index storage path
        os.makedirs(indexpath)
    ix = create_in(indexpath, schema)  # create the index
    writer = ix.writer()
    for filename in os.listdir(datapath):
        filepath = os.path.join(datapath, filename)
        content = FileUtil.readfile(filepath)
        writer.add_document(path=filepath, title=filename, content=content)
    writer.commit()
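# build() only writes the Whoosh index; retrieval happens elsewhere (Search1 in
# info(), not shown in this excerpt). A minimal sketch of querying such an
# index with the standard Whoosh API -- an assumption about how retrieval could
# look, not the project's actual Search1 implementation:
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir('path/to/indexpath')  # placeholder path
with ix.searcher() as searcher:
    query = QueryParser('content', ix.schema).parse('关键词1 关键词2')
    results = searcher.search(query, limit=20)
    for hit in results:
        print(hit['path'], hit['title'])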
class PreDeal(object):
    """ Extract Chinese characters, remove stop words, segment, and save """

    stopwords = FileUtil.readfile(config.stopwordpath).splitlines()

    def __init__(self):
        logging.basicConfig(
            level=logging.INFO,
            format=
            '%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(message)s'
        )
        self.logger = logging.getLogger("Text manipulation")

    @classmethod
    def seg(cls, sentences):
        sentences = cls._renostr(sentences)
        kws = []
        stopwords = cls.stopwords
        # departs = jieba.cut_for_search(sentences, HMM=True)  # search-engine mode segmentation
        departs = jieba.cut(sentences, HMM=True)  # standard mode
        for word in departs:
            if word not in stopwords:  # remove stop words
                kws.append(word)
        return kws

    @staticmethod
    def _renostr(strings):
        """
        Extract all Chinese characters
        :param strings:
        :return:
        """
        pattern = re.compile('[\u4e00-\u9fa5]+')
        strs = re.findall(pattern, strings)
        return ''.join(strs)

    def savetexts(self, filepath, prepath):
        """
        Save the preprocessed texts
        :param filepath: path of the html files
        :param prepath: output path
        :return:
        """
        self.logger.info('init pretreatment directory:"{0}"'.format(prepath))
        FileUtil.init_path(prepath)
        try:
            file_lists = os.listdir(filepath)  # all files and directories under the path, as strings
            for filename in file_lists:
                file = os.path.join(filepath, filename)
                if os.path.isfile(file):
                    # 1. get the url and the text
                    url, text = FileUtil.get_url_text(file)
                    # 2. keyword information
                    kws = PreDeal.seg(text)
                    self.logger.info(
                        "Store pretreatment texts content:{0}".format(
                            filename))
                    FileUtil.writefile(url + '\t'.join(kws),
                                       os.path.join(prepath, filename))
            self.logger.info('Text pretreatment End!')
        except Exception as e:
            print(e)
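# Minimal usage sketch of PreDeal; the directory names are placeholders, and
# FileUtil/config come from the project's own utility modules (assumed to be
# importable, with config.stopwordpath pointing at a stop-word list).
p = PreDeal()

# Segment a raw string and drop stop words
kws = PreDeal.seg('今天天气很好，适合出门散步。')
print(kws)

# Preprocess a directory of crawled html files and save the results
p.savetexts(filepath='spidertext/some_keywords', prepath='prepared/some_keywords')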