def parse_content(self, html_file_path):
    """
    Extract paragraph text from an HTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``. Every ``<div type="paragraph">`` that does not itself
    contain another ``<div type="paragraph">`` is treated as a leaf
    paragraph. For each leaf paragraph, the text of every descendant
    ``<div type="content">`` is passed through ``TextUtils.clean_text`` and
    the results are concatenated with no separator.

    :param html_file_path: path of the HTML file to parse
    :return: list of strings in document order, one per leaf paragraph
        that has at least one content div
    """

    def _divs_of_type(node, wanted):
        # All descendant <div> tags of `node` whose `type` attribute equals `wanted`.
        return [tag for tag in node.find_all('div') if tag.get('type') == wanted]

    with codecs.open(html_file_path, encoding='utf-8', mode='r') as handle:
        soup = BeautifulSoup(handle.read(), "html.parser")

    texts = []
    for block in _divs_of_type(soup, 'paragraph'):
        # A paragraph that wraps other paragraphs is skipped: its nested
        # paragraphs appear in the same scan and are handled on their own.
        if _divs_of_type(block, 'paragraph'):
            continue
        pieces = [
            TextUtils.clean_text(content.text)
            for content in _divs_of_type(block, 'content')
        ]
        # Paragraphs without any content div contribute nothing.
        if pieces:
            texts.append(''.join(pieces))
    return texts
def parse_content(self, html_file_path):
    """
    Parse the paragraph text contained in an HTML file.

    Leaf paragraphs are ``<div type="paragraph">`` elements with no nested
    ``<div type="paragraph">``. For each one, the text of its descendant
    ``<div type="content">`` elements is run through
    ``TextUtils.clean_text`` and joined into a single string.

    :param html_file_path: path of the HTML file, read as UTF-8
    :return: list of joined paragraph strings in document order; paragraphs
        with no content div are omitted
    """

    def has_type(tag, expected):
        # True when the tag's `type` attribute equals `expected`.
        return tag.get('type') == expected

    with codecs.open(html_file_path, encoding='utf-8', mode='r') as source:
        document = BeautifulSoup(source.read(), "html.parser")

    # Collect every div marked as a paragraph.
    candidates = [
        tag for tag in document.find_all('div') if has_type(tag, 'paragraph')
    ]

    merged = []
    for candidate in candidates:
        inner_divs = candidate.find_all('div')
        # If the paragraph contains a sub-paragraph, skip it: the
        # sub-paragraph is itself in `candidates` and is visited separately.
        if any(has_type(inner, 'paragraph') for inner in inner_divs):
            continue
        # Gather the cleaned text of each content div in this paragraph.
        lines = [
            TextUtils.clean_text(inner.text)
            for inner in inner_divs
            if has_type(inner, 'content')
        ]
        # Join each paragraph's content lines into one string.
        if lines:
            merged.append(''.join(lines))
    return merged