def savetexts(self, filepath, prepath):
    """Segment every stored text file under *filepath* and save the keyword
    output under *prepath*.

    :param filepath: directory holding the source files (one file per page)
    :param prepath: directory the pretreated keyword files are written into
    :return: None
    """
    self.logger.info('init pretreatment directory:"{0}"'.format(prepath))
    FileUtil.init_path(prepath)
    try:
        # Every entry in the source directory; non-files are skipped below.
        file_lists = os.listdir(filepath)
        for filename in file_lists:
            file = os.path.join(filepath, filename)
            if os.path.isfile(file):
                # 1. Split the stored record back into its URL and body text.
                url, text = FileUtil.get_url_text(file)
                # 2. Segment the body text into keywords.
                kws = PreDeal.seg(text)
                self.logger.info(
                    "Store pretreatment texts content:{0}".format(
                        filename))
                # NOTE(review): url is concatenated directly with the joined
                # keywords, with no separator in between — presumably
                # FileUtil.get_url_text returns the url with its own trailing
                # delimiter; confirm against that helper.
                FileUtil.writefile(url + '\t'.join(kws),
                                   os.path.join(prepath, filename))
        self.logger.info('Text pretreatment End!')
    except Exception:
        # Fix: errors were previously print()-ed to stdout and lost; record
        # the failure (with traceback) through the class logger instead.
        self.logger.exception('Text pretreatment failed')
def init_path(self):
    """Derive and prepare the working directories for the current key set.

    :return: tuple ``(savepath, kwpath)`` — the hide-text directory and the
        hide-keyword directory, both named after ``'_'.join(self.keys)``.
    """
    joined_keys = '_'.join(self.keys)
    savepath = os.path.join(config.hidepath, joined_keys)
    kwpath = os.path.join(config.hidekwpath, joined_keys)
    # Existing directory: hand it to FileUtil.init_path to (re-)initialise;
    # otherwise create it fresh.
    # NOTE(review): kwpath is returned but never created here — verify the
    # caller is responsible for creating it.
    if os.path.exists(savepath):
        FileUtil.init_path(savepath)
    else:
        os.makedirs(savepath)
    return savepath, kwpath
def crawl(self):
    """Run the downloader, parse every saved HTML page, and persist each
    page's url+content as a ``.txt`` file under the spider-text directory.

    :return: None
    """
    self.download.download()
    readpath = os.path.join(config.spiderhtml, self.filename)
    savepath = os.path.join(config.spidertext, self.filename)
    FileUtil.init_path(savepath)
    for filename in os.listdir(readpath):
        file = os.path.join(readpath, filename)
        url, content = self.parse.parse(file)
        # Fix: str.rstrip('.html') strips any trailing run of the CHARACTERS
        # '.', 'h', 't', 'm', 'l' — e.g. 'math.html' -> 'ma' — mangling
        # filenames. Remove the literal '.html' suffix instead.
        if filename.endswith('.html'):
            filename = filename[:-len('.html')]
        filename += '.txt'
        self.logger.info("Save spider url and content:{0}".format(url))
        # NOTE(review): url and content are concatenated without a separator —
        # presumably the parser returns url with a trailing delimiter; confirm.
        FileUtil.writefile(url + content, os.path.join(savepath, filename))
    print('crawl web contents end!')
def init_config(self):
    """(Re-)initialise the configured save directory via FileUtil."""
    target = self.savepath
    FileUtil.init_path(target)