def fetchInfoFromURL(URL):
    """Scrape the infobox of a wiki page and append it to the data file.

    The character name is built from the last ``/``-separated segment of
    *URL*: the segment is split on ``_`` and every word is prefixed with a
    space (so the name starts with a space).  Label cells and value cells are
    selected from the ``aside`` inside ``#mw-content-text``; for every pair one
    line ``<<<label<<<group<<<group...`` is produced and the whole record is
    appended to ``../Data/CollectedData.txt``.

    :param URL: page address, e.g.
        ``http://gameofthrones.wikia.com/wiki/Gregor_Clegane``
    :return: ``1`` when *URL* has fewer than three ``/``-separated parts, or
        when the page yields no label cells or a different number of label
        and value cells; ``0`` once the record has been written.
    """
    url_parts = URL.split('/')
    if len(url_parts) < 3:
        return 1
    character = "".join(" " + word for word in url_parts[-1].split('_'))

    d = PyQuery(url=URL)
    labels = d("#mw-content-text>aside h3.pi-data-label")
    values = d("#mw-content-text>aside div.pi-data-value")
    if len(labels) != len(values) or len(labels) == 0:
        return 1

    # Build the complete record first so that a failure while reading the
    # page cannot leave a half-written record in the data file.
    lines = ["<<<<<" + character + "\n"]
    for i in range(len(values)):
        cell = PyQuery(values.eq(i))
        row = "<<<" + labels.eq(i).text()
        group = ""
        nodes = cell.contents()
        for j, node in enumerate(nodes):
            if node == ' ':
                # A text node that is exactly one space carries no data.
                continue
            text = nodes.eq(j).text()
            if text != "":
                group = group + " " + text
            elif group:
                # A node without text closes the value group collected so far.
                row = row + "<<<" + group
                group = ""
        if group:
            row = row + "<<<" + group
        lines.append(row + "\n")
    lines.append("\n\n")

    # The context manager closes the file even if the write fails.
    with open('../Data/CollectedData.txt', 'a') as f:
        f.writelines(lines)
    return 0
def get_pq_object_inner_text(el: PyQuery):
    """Return the innerText of an element.

    Text nodes are concatenated in the order ``contents()`` yields them and
    every other node is expanded recursively.  An element for which
    ``is_pq_object_visible`` is false contributes an empty string.
    """
    if not is_pq_object_visible(el):
        return ''
    return ''.join(
        node if isinstance(node, _ElementUnicodeResult)
        else get_pq_object_inner_text(PyQuery(node))
        for node in el.contents()
    )
def _render_unorder_list(self, p: Paragraph, pq: PyQuery):
    """
    Render an unordered list.

    Each node returned by ``pq.contents()`` gets its own 'List Bullet'
    paragraph, added to the parent of ``p``; the node's own contents are then
    rendered into that paragraph.

    :param p: paragraph whose parent receives the bullet paragraphs
    :param pq: the list element
    :return: None
    """
    for entry in pq.contents():
        bullet = p._parent.add_paragraph(style='List Bullet')
        entry_children = pq(entry).contents()
        self._render_children(bullet, entry_children,
                              skip_br=True, is_root=True)
def _match_xpaths(pos: int, parent_node: PyQuery, xpath_dict: Dict[str, int]) -> int:
    """Record the text offset of every element whose xpath is in ``xpath_dict``.

    The subtree below ``parent_node`` is walked depth-first.  ``pos`` grows by
    the length of each text node met; when an element's xpath is already a key
    of ``xpath_dict`` its value is overwritten with the offset reached so far.
    A node for which ``is_pq_object_visible`` is false is skipped entirely.

    Returns the offset reached after the subtree.
    """
    if is_pq_object_visible(parent_node):
        for node in parent_node.contents():
            if isinstance(node, _ElementUnicodeResult):
                # Plain text: only advances the offset.
                pos += len(node)
                continue
            node_xpath = element_to_xpath(node)
            if node_xpath in xpath_dict:
                xpath_dict[node_xpath] = pos
            pos = _match_xpaths(pos, PyQuery(node), xpath_dict)
    return pos
def __render_inline_element(self, p: Paragraph, pq: PyQuery, bold=False, italic=False, sub=False, sup=False, underline=False, font_size=None, strike=False):
    """
    Render an inline element.

    Child elements are handed to ``_render_element`` together with the
    formatting flags; every other child is added to ``p`` as a run that
    carries the flags.

    :param p: paragraph that receives the runs
    :param pq: element to render
    :param bold: bold
    :param italic: italic
    :param sub: subscript
    :param sup: superscript
    :param underline: underline
    :param font_size: font size; only applied to a run when truthy
    :param strike: strike-through
    :return: None
    """
    element_types = (HtmlElement, _Element)
    for node in pq.contents():
        if not isinstance(node, element_types):
            run = p.add_run(node)
            self.__force_simsun(run)
            run.underline = underline
            run.bold = bold
            run.italic = italic
            run.font.superscript = sup
            run.font.subscript = sub
            if font_size:
                run.font.size = font_size
            run.font.strike = strike
        else:
            self._render_element(p, node, bold=bold, italic=italic,
                                 underline=underline, strike=strike,
                                 sup=sup, sub=sub, font_size=font_size)
def get_dom_node_start_pos(parent_node: PyQuery, target_el: PyQuery) -> Tuple[int, bool]:
    """
    Find where the text of ``target_el`` starts within the text of
    ``parent_node``.

    Examples
    --------
    doc = PyQuery(html_str)
    selected_item = doc(jquery_selector)
    start_pos, is_find = get_dom_node_start_pos(doc, selected_item)

    Parameters
    ----------
    parent_node
        Node whose subtree is searched, a PyQuery object.
    target_el
        Node to look for, a PyQuery object.

    Returns
    -------
    Tuple[int, bool]
        int: text offset reached by the traversal; it can be a positive
        number even when the target was not found.
        bool: whether the target element was found.
    """
    if not is_pq_object_visible(parent_node):
        return 0, False

    offset = 0
    nodes = parent_node.contents()
    wanted_xpath = element_to_xpath(target_el)
    for node in nodes:
        if isinstance(node, _ElementUnicodeResult):
            # Plain text: only advances the offset.
            offset += len(node)
            continue
        if element_to_xpath(node) == wanted_xpath:
            return offset, True
        inner_offset, found = get_dom_node_start_pos(PyQuery(node), target_el)
        offset += inner_offset
        if found:
            return offset, True
    return offset, False
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    """Convert an element and its subtree into a nested dict.

    An element with child elements gets a list of child dicts as its
    ``contents`` (its children are first passed through
    ``fix_unwrapped_text`` unless ``already_wrapped`` is true); an element
    without child elements gets its inner HTML.  ``PRE_TAG_MATCH`` and
    ``PRE_CONTENTS_MATCH`` may replace the tag name and the contents.

    Returns a dict with the keys ``type``, ``attrs``, ``layout``,
    ``contents`` and ``extra``.
    """
    node = PyQuery(elem)
    child_nodes = list(node.contents())

    if len(node.children()) > 0:
        if not already_wrapped:
            # Text that sits directly between child elements gets wrapped.
            child_nodes = fix_unwrapped_text(node).contents()
        child_dicts = (
            build_dict_from_sane_json(child_node, already_wrapped=True)
            for child_node in child_nodes
        )
        contents = [child_dict for child_dict in child_dicts if child_dict]
    else:
        contents = node.html()

    # Only tables keep their HTML (used later to extract the relevant data).
    extra = {'original_html': str(node)} if node.is_("table") else {}
    first = node[0]
    for attr_name in ('src', 'href'):
        if attr_name in first.attrib:
            extra[attr_name] = node.attr(attr_name)

    tag_type = first.tag
    return {
        'type': PRE_TAG_MATCH.get(tag_type, tag_type),
        'attrs': [],
        'layout': {},
        'contents': PRE_CONTENTS_MATCH.get(tag_type, contents),
        'extra': extra
    }
def _fix_unwrapped_text(children: PyQuery, do_not_wrap=False) -> List[PyQuery]:
    """Add spans over all elements and their sub elements except other spans.

    :param children: nodes to process; the code handles both ``str`` items
        (text) and element items
    :param do_not_wrap: when true and ``children`` holds exactly one node,
        that node is returned without being passed to ``_wrap``
    :return: list of processed nodes.  Besides PyQuery objects the list can
        contain plain strings (the two single-node cases below), and whatever
        ``_wrap`` returns.
    """
    ret = []
    # Single node below a tag listed in HTML_NOT_WRAPABLES: keep text as it
    # is, and let fix_unwrapped_text handle an element.
    if do_not_wrap and len(children) == 1:
        for i in children:
            if isinstance(i, str):
                ret.append(i)
            else:
                for fixed in fix_unwrapped_text(PyQuery(i)):
                    ret.append(PyQuery(fixed))
        return ret
    # A lone text node is returned unchanged.
    if len(children) == 1 and isinstance(children[0], str):
        return [children[0]]
    for child in children:
        # Text that has sibling nodes is passed to _wrap.
        if isinstance(child, str) and len(children) > 1:
            ret.append(_wrap(child))
            continue
        tag = child.tag
        # Serialise the attributes so the element can be rebuilt from markup.
        # NOTE(review): values are interpolated without escaping; a value
        # containing a double quote would presumably break the rebuilt
        # markup - confirm.
        attribs = "".join([f'{k}="{v}" ' for k, v in child.attrib.items()])
        child = PyQuery(child)
        # Process the subtree first, then concatenate the result into HTML.
        descendants = _fix_unwrapped_text(child.contents(), do_not_wrap=tag in HTML_NOT_WRAPABLES)
        descendants_html = ""
        for i in descendants:
            if isinstance(i, str):
                descendants_html += i
            else:
                descendants_html += i.outer_html()
        if tag in HTML_NOT_WRAPABLES:
            # Keep the element itself and only replace its inner HTML.
            child.html(descendants_html)
            ret.append(child)
        else:
            # Rebuild the element from markup and pass it to _wrap.
            child = PyQuery(f'<{tag} {attribs}>{descendants_html}</{tag}>')
            ret.append(_wrap(child))
    return ret
def render_contents(self, p: Paragraph or _Cell, body: str, flat_p=False):
    """
    Render HTML content.

    Carriage returns, line feeds and tabs are removed from ``body`` before it
    is parsed; every top-level node is then passed to ``_render_element``.

    :param flat_p: flatten the p tag.  Has no effect when the content is
        nothing but sibling p elements, e.g. <p>..</p><p>..</p><p>..</p>:
        the value is then forced to False
    :param p: target passed on to ``_render_element``
    :param body: HTML markup; nothing is rendered when it is empty
    :return: None
    """
    if not body:
        return

    markup = body
    for control_char in ('\r', '\n', '\t'):
        markup = markup.replace(control_char, '')
    nodes = PyQuery(markup).contents()

    # True when the content is a run of sibling p elements and nothing else.
    only_paragraphs = all(
        isinstance(node, _Element) and node.tag == 'p' for node in nodes
    )
    as_root = False if only_paragraphs else flat_p
    for node in nodes:
        self._render_element(p, node, is_root=as_root)
def parse(self, content: str):
    """
    Parse HTML into parsed objects.

    When the parsed document has no child elements, its first node is parsed
    on its own.  Otherwise every node returned by ``contents()`` is parsed,
    and truthy results are collected.  A child whose parsing raises is logged
    and skipped, so one bad node does not abort the whole parse.

    :param content: HTML markup
    :return: self, with the results stored in ``self.parsed_objects``
    """
    import logging

    d = PyQuery(content)
    element_list = []
    children = d.contents()
    if len(d.children()) == 0:
        # No child elements: parse the first node itself.
        element_list.append(self.__parse__(d[0]))
    else:
        for child in children:
            try:
                parsed = self.__parse__(child)
            except Exception:
                # Best effort: keep going, but leave a trace of the failure
                # instead of dropping it silently.
                logging.getLogger(__name__).warning(
                    "Skipping child node %r: parsing failed", child,
                    exc_info=True)
            else:
                if parsed:
                    element_list.append(parsed)
    self.parsed_objects = element_list
    return self
def _render_element(self, p: Paragraph, element: str or Element, is_root=False, bold=False, italic=False, strike=False, underline=False, font_size=None, sup=False, sub=False):
    """
    Convert one HTML node into Word content.

    A string is added to ``p`` as a single run carrying the formatting flags.
    An element is dispatched on its tag: ``p`` is flattened (nested
    paragraphs are not supported), inline tags switch on one formatting flag
    and keep the others, ``span`` is delegated to ``_render_span`` with all
    flags (including ``sub``/``sup``), and lists, tables and images go to
    their dedicated renderers.  Any other tag is rendered into a new
    paragraph.

    :param p: paragraph being rendered into
    :param element: text or element node
    :param is_root: for a ``p`` element, render its children into ``p``
        itself instead of into a new paragraph
    :param bold: bold
    :param italic: italic
    :param strike: strike-through
    :param underline: underline
    :param font_size: font size; only applied when truthy
    :param sup: superscript
    :param sub: subscript
    :return: None
    """
    if isinstance(element, str):
        run = p.add_run(self._clear_text(element))
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.underline = underline
        run.font.subscript = sub
        run.font.superscript = sup
        if font_size:
            run.font.size = font_size
        self.__force_simsun(run)
        return

    pq = PyQuery(element)
    # Formatting inherited from the caller; an inline tag overrides one entry.
    inherited = dict(bold=bold, italic=italic, strike=strike,
                     underline=underline, font_size=font_size,
                     sub=sub, sup=sup)

    def render_inline(**overrides):
        self.__render_inline_element(p, pq, **dict(inherited, **overrides))

    if pq.is_('p'):
        # Nested p is not supported, so it is flattened automatically.
        contents = pq.contents()
        align = self._get_pq_style(pq, 'text-align')
        if align == 'center':
            alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == 'right':
            alignment = WD_ALIGN_PARAGRAPH.RIGHT
        else:
            alignment = WD_ALIGN_PARAGRAPH.LEFT
        # The current paragraph is aligned in both cases.
        p.alignment = alignment
        if is_root:
            self._render_children(p, contents)
        else:
            sub_p = p._parent.add_paragraph()
            sub_p.alignment = alignment
            self._render_children(sub_p, contents)
    elif pq.is_('u'):
        # underline
        render_inline(underline=True)
    elif pq.is_('strong') or pq.is_('b'):
        # bold
        render_inline(bold=True)
    elif pq.is_('i') or pq.is_('em'):
        # italic
        render_inline(italic=True)
    elif pq.is_('sub'):
        # subscript
        render_inline(sub=True)
    elif pq.is_('sup'):
        # superscript
        render_inline(sup=True)
    elif pq.is_('var'):
        # old-style formula, rendered in italic
        render_inline(italic=True)
    elif pq.is_('span'):
        # sub/sup are forwarded too, so a span inside <sub>/<sup> keeps its
        # script formatting.
        self._render_span(p, pq, bold=bold, italic=italic, strike=strike,
                          underline=underline, font_size=font_size,
                          sub=sub, sup=sup)
    elif pq.is_("br"):
        p.add_run().add_break()
    elif pq.is_("div"):
        # A div starts on a new line within the same paragraph.
        p.add_run().add_break()
        self._render_children(p, pq.contents())
    elif pq.is_('ul'):
        self._render_unorder_list(p, pq)
    elif pq.is_('ol'):
        self._render_order_list(p, pq)
    elif pq.is_('table'):
        self._render_table(p, pq)
    elif pq.is_('img'):
        # image
        self._render_img(p, pq)
    elif element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        sub_p = p._parent.add_paragraph()
        # NOTE(review): headings are always bold AND italic at 12pt,
        # whatever the inherited flags are - confirm italic is intended.
        self.__render_inline_element(sub_p, pq, bold=True, font_size=Pt(12),
                                     underline=underline, italic=True,
                                     strike=strike, sub=sub, sup=sup)
    else:
        sub_p = p._parent.add_paragraph()
        self._render_children(sub_p, pq.contents())
def _render_span(self, p: Paragraph, pq: PyQuery, bold=False, italic=False, strike=False, underline=False, font_size=None, sub=False, sup=False):
    """
    Convert a span.

    Formula spans (a ``data-latex`` attribute, class ``math-tex`` or class
    ``afanti-latex``) are converted to OMML and appended to the paragraph
    element.  If the conversion raises ``EquationConvertError``, the ``img``
    element(s) inside the span are passed to ``_render_img`` instead
    (change 19.5.3).

    Any other span is rendered as text: the flags passed in are combined with
    the span's own style (font-weight, font-style, text-decoration,
    text-decoration-line, font-size, font-name), child elements go to
    ``_render_element`` and text nodes become runs.

    :param p: paragraph being rendered into
    :param pq: the span element
    :param bold: bold
    :param italic: italic
    :param strike: strike-through
    :param underline: underline
    :param font_size: font size; replaced by the span's own ``font-size``
        when that ends in ``px`` or ``pt``
    :param sub: subscript
    :param sup: superscript
    :return: None
    """
    def append_formula(latex):
        # Convert LaTeX to OMML, declare the math namespace and append the
        # markup to the paragraph element.
        omml_str = converter.to_omml(self.mini_trim(latex))
        omml_str = omml_str.replace(
            '<m:oMath',
            '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
        )
        pq(p._element).append(omml_str)

    try:
        data_latex = pq.attr('data-latex')
        if data_latex:
            # formula
            append_formula(data_latex)
            return
        if pq.has_class("math-tex"):
            # formula; data-latex is known to be falsy here, so the source is
            # always the span's inner HTML
            inner_html = pq.html()
            latex = html.unescape(inner_html) if inner_html is not None else ''
            latex = latex.replace(r'\(', '').replace(r'\)', '')
            append_formula(latex)
            return
        # afanti formula
        if pq.has_class('afanti-latex'):
            metadata = AftQuestion(pq).parse_element()
            if metadata.startswith('^') or metadata.startswith('_'):
                # A leading script marker needs a base: move the last
                # character of the paragraph's last child into the formula.
                # NOTE(review): assumes the paragraph already has a child
                # with non-empty text - confirm.
                last_ele = pq(p._element).children()[-1]
                metadata = last_ele.text[-1] + metadata
                last_ele.text = last_ele.text[:-1]
            append_formula(metadata)
            return
    except EquationConvertError:
        img = PyQuery('img', pq)
        self._render_img(p, img)
        return

    # Each style property is read once and merged with the inherited flags.
    font_weight = self._get_pq_style(pq, 'font-weight')
    font_style = self._get_pq_style(pq, 'font-style')
    decoration = self._get_pq_style(pq, 'text-decoration')
    decoration_line = self._get_pq_style(pq, 'text-decoration-line')
    bold = bool(bold) or font_weight == 'bold' or font_weight == 'bolder'
    italic = bool(italic) or font_style == 'italic'
    strike = (bool(strike) or decoration == 'line-through'
              or decoration_line == 'line-through')
    underline = (bool(underline) or decoration == 'underline'
                 or decoration_line == 'underline')

    size = self._get_pq_style(pq, 'font-size')
    if size:
        if size.endswith('px'):
            font_size = self.get_pt(int(float(size[:-2])))
        elif size.endswith('pt'):
            font_size = Pt(float(size[:-2]))

    font_name = self._get_pq_style(pq, 'font-name')
    for item in pq.contents():
        if isinstance(item, (HtmlElement, _Element)):
            # sup/sub are forwarded so nested elements keep the script
            # formatting of the enclosing span.
            self._render_element(p, item, is_root=True, bold=bold,
                                 italic=italic, strike=strike,
                                 underline=underline, font_size=font_size,
                                 sup=sup, sub=sub)
            continue
        run = p.add_run(self._clear_text(item))
        self.__force_simsun(run)
        if font_name:
            run.font.name = font_name
        if font_size:
            run.font.size = font_size
        run.underline = underline
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.superscript = sup
        run.font.subscript = sub