Example #1
0
    def get_html_article(self, response):
        """
        Extract the article paragraphs from an HTML document.

        The HTML is run through readability's ``Document(...).summary()``,
        then HTML tags and empty lines are removed. The extracted text can
        still contain navigation fragments, so it is split on newlines and
        trimmed to the span that looks like real article text:

        * the article starts at the first line longer than
          ``IS_ARTICLE_SIZE`` characters;
        * the article ends at the last line, at or after the start, that is
          longer than ``IS_ARTICLE_SIZE`` and ends with Chinese
          sentence-final punctuation. When no such line exists, everything
          from the start line to the end of the text is kept.

        :param response: HTML content passed to ``Document``.
        :return: list of lines making up the article (start and end lines
                 inclusive), or an empty list when no line is long enough
                 to count as article text.
        """

        readable_article = Document(response).summary()
        readable_article = self.remove_html_tag(readable_article)
        readable_article = self.remove_empty_line(readable_article)

        article_split = readable_article.split('\n')

        # Forward scan: the first sufficiently long line marks the start of
        # the article.
        begin = None
        for index, line in enumerate(article_split):
            if len(line) > IS_ARTICLE_SIZE:
                begin = index
                break

        if begin is None:
            return []

        # Full-width full stop, exclamation mark and question mark: the
        # usual ways a Chinese sentence ends.
        sentence_endings = (u'\u3002', u'\uff01', u'\uff1f')

        # Backward scan from the very last line down to (and including) the
        # start line. Both positions are absolute indices, so the scan can
        # never cross the start, and a one-paragraph article naturally
        # yields begin == end. Default: keep everything up to the last line.
        end = len(article_split) - 1
        for index in range(len(article_split) - 1, begin - 1, -1):
            line = article_split[index]
            # endswith() is safe on empty strings, so no separate emptiness
            # check is needed.
            if len(line) > IS_ARTICLE_SIZE and line.endswith(sentence_endings):
                end = index
                break

        return article_split[begin:end + 1]