def get_html_article(self, response):
    """Extract the article paragraphs from an HTML response.

    Pipeline: run readability's ``Document`` to isolate the main content,
    strip HTML tags and empty lines via the sibling helpers, split on
    newlines, then trim the navigation noise readability lets through by
    locating the first and last paragraphs that look like article text
    (paragraph length compared against ``IS_ARTICLE_SIZE``).

    :param response: raw HTML content (whatever ``Document`` accepts).
    :return: list of paragraph strings from the detected start through the
        detected end of the article; empty list when no article is found.
    """
    readable_article = Document(response).summary()
    readable_article = self.remove_html_tag(readable_article)
    readable_article = self.remove_empty_line(readable_article)
    paragraphs = readable_article.split('\n')

    # Article start: the first paragraph longer than IS_ARTICLE_SIZE.
    begin = None
    for index, paragraph in enumerate(paragraphs):
        if len(paragraph) > IS_ARTICLE_SIZE:
            begin = index
            break
    if begin is None:
        # No paragraph is long enough — nothing recognizable as an article.
        return []

    # Article end: scanning backwards from the very last paragraph, the
    # first long paragraph ending in a Chinese full stop (\u3002 = 。) or
    # exclamation mark (\uff01 = ！) — typical Chinese sentence terminators.
    #
    # FIX: the original reused the forward loop index for the backward
    # scan, so it started at back-offset begin+1 and never examined the
    # last begin+1 paragraphs; it also compared the front index `begin`
    # against the back offset `end` (`begin == end`), which fired on
    # unrelated paragraphs. Everything is now kept in front-index
    # coordinates, and the scan breaks as soon as the end is found.
    end = len(paragraphs) - 1  # fall back to "article runs to the bottom"
    for index in range(len(paragraphs) - 1, begin - 1, -1):
        paragraph = paragraphs[index]
        if (paragraph and paragraph[-1] in u'\u3002\uff01'
                and len(paragraph) > IS_ARTICLE_SIZE):
            end = index
            break

    # Inclusive slice from the detected start through the detected end;
    # when begin == end this correctly yields the single paragraph.
    return paragraphs[begin:end + 1]