def savetexts(self, filepath, prepath):
    """
    Save the pretreated texts.
    :param filepath: directory of the crawled HTML text files
    :param prepath: directory to save the pretreated results
    :return:
    """
    self.logger.info('init pretreatment directory:"{0}"'.format(prepath))
    FileUtil.init_path(prepath)
    try:
        # list every file and directory name (as strings) under filepath
        file_lists = os.listdir(filepath)
        for filename in file_lists:
            file = os.path.join(filepath, filename)
            if os.path.isfile(file):
                # 1. extract the URL and the plain text of the page
                url, text = FileUtil.get_url_text(file)
                # 2. segment the text into keywords
                kws = PreDeal.seg(text)
                self.logger.info('Store pretreatment texts content:{0}'.format(filename))
                # the URL is written first, followed by the tab-joined keywords
                FileUtil.writefile(url + '\t'.join(kws), os.path.join(prepath, filename))
        self.logger.info('Text pretreatment end!')
    except Exception as e:
        self.logger.error(e)
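# A minimal usage sketch for savetexts(). Assumptions (not shown in this file):
# FileUtil.get_url_text() returns a (url, plain_text) pair for one saved page,
# and PreDeal.seg() returns the segmented keyword list for that text. The
# output file then holds the URL followed by the tab-joined keywords:
#
#     >>> url = 'http://example.com/page1\n'   # hypothetical URL; whether it
#     >>> kws = ['graph', 'search', 'engine']  # carries a trailing newline
#     >>> url + '\t'.join(kws)                 # depends on FileUtil
#     'http://example.com/page1\ngraph\tsearch\tengine'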
def query(self, keywords, kwpath=''):
    """
    Joint keyword search: keep extending the query with the next keyword while
    the index still returns hits; on a miss, fall back to similar words, and
    if that also fails, close the current group and start a new one.
    Note: `keywords` is consumed in place.
    """
    path = []       # document hit lists found so far (None for an unmatched keyword)
    num = []        # how many keywords each matched group combines
    unmatch = 0     # number of unmatched keywords
    maxh = 0        # keywords joined into the current group
    q = ''          # the joint keyword query
    flag = True     # mismatch flag
    hidekey = []
    while keywords:
        kw = keywords[0]
        paper = Index.search(self.pindexp, q + ' ' + kw, limit=None)
        if paper:
            keywords.pop(0)
            hidekey.append(kw)
            q = q + ' ' + kw
            maxh += 1
        else:
            # when the joint search can go no further, fall back to similar keywords
            simikeys = WV.similarwords(kw)
            t_paper = []
            if not simikeys:
                print('Failed to find similar words!')
                flag = False
            else:
                for skw, similarity in simikeys:
                    sq = q + ' ' + skw
                    t_paper = Index.search(self.pindexp, sq, limit=None)
                    if t_paper:
                        hidekey.append(skw)
                        keywords.pop(0)
                        q = sq
                        maxh += 1
                        break
                if not t_paper:
                    # similar words exist, but the joint search still fails
                    flag = False  # mismatch
        if not flag:
            # close the current group: search the query accumulated so far
            doc = Index.search(self.pindexp, q, limit=None)
            if not doc:
                print("The keyword '%s' is unmatched!" % kw)
                unmatch += 1
                hidekey.append('0')
                keywords.pop(0)
                path.append(None)
                # flag = True
            else:
                path.append(doc)
                num.append(maxh)
            maxh = 0
            q = ''
            flag = True
        if not keywords:
            path.append(paper)
    hide_string = ' '.join(hidekey)
    FileUtil.writefile(hide_string, kwpath)
    return path
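# Usage sketch (hypothetical setup). Index.search() is assumed to take the index
# path, a whitespace-joined query string, and `limit`, returning the matching
# documents; WV.similarwords() is assumed to return (word, similarity) pairs
# from a word-vector model. The class name below is hypothetical:
#
#     searcher = Searcher()
#     hits = searcher.query(['machine', 'learning', 'graph'], kwpath='./hidekey.txt')
#     # query() consumes the keywords list in place; each element of `hits`
#     # is a document hit list, or None for a keyword that never matched.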
def crawl(self):
    """Download the pages, parse each saved HTML file, and store its URL and text."""
    self.download.download()
    readpath = os.path.join(config.spiderhtml, self.filename)
    savepath = os.path.join(config.spidertext, self.filename)
    FileUtil.init_path(savepath)
    for filename in os.listdir(readpath):
        file = os.path.join(readpath, filename)
        url, content = self.parse.parse(file)
        # replace the .html extension with .txt; rstrip('.html') is wrong here
        # because rstrip strips a character set, not a suffix
        filename = os.path.splitext(filename)[0] + '.txt'
        self.logger.info('Save spider url and content:{0}'.format(url))
        FileUtil.writefile(url + content, os.path.join(savepath, filename))
    self.logger.info('Crawl web contents end!')
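# The extension fix above matters for any filename whose stem ends in one of
# the characters '.', 'h', 't', 'm', 'l' -- str.rstrip() treats its argument
# as a character set, not a suffix:
#
#     >>> 'math.html'.rstrip('.html') + '.txt'       # the old code
#     'ma.txt'
#     >>> import os
#     >>> os.path.splitext('math.html')[0] + '.txt'  # the fix
#     'math.txt'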