Esempio n. 1
0
def fetchInfoFromURL(URL):
    """Scrape label/value pairs from a wiki page and append them to the data file.

    The character name is derived from the last '/'-separated segment of
    ``URL`` by replacing underscores with spaces. Labels are read from
    ``#mw-content-text>aside h3.pi-data-label`` and values from
    ``#mw-content-text>aside div.pi-data-value``; one record is appended to
    ``../Data/CollectedData.txt``.

    Record layout: a ``<<<<<`` header line with the character name, then one
    line per label of the form ``<<<label<<<value<<<value...``, then two
    blank lines.

    :param URL: address of the page to scrape, e.g.
        "http://gameofthrones.wikia.com/wiki/Gregor_Clegane"
    :return: 0 on success; 1 when ``URL`` has fewer than three
        '/'-separated parts, or when no labels are found or the label and
        value counts differ
    """
    url_parts = URL.split('/')
    if len(url_parts) < 3:
        return 1

    # Every name part is prefixed with a space, so the stored name keeps the
    # leading blank that existing records in the data file already have.
    character = "".join(" " + part for part in url_parts[-1].split('_'))

    d = PyQuery(url=URL)
    labels = d("#mw-content-text>aside h3.pi-data-label")
    values = d("#mw-content-text>aside div.pi-data-value")
    if len(labels) != len(values) or len(labels) == 0:
        return 1

    # Build every row before touching the file: if scraping fails half-way,
    # no partial record is written.
    rows = []
    for i in range(len(values)):
        nodes = PyQuery(values.eq(i)).contents()
        row = "<<<" + labels.eq(i).text()
        attr = ""
        for j in range(len(nodes)):
            if nodes[j] != ' ':
                text = nodes.eq(j).text()
                if text != "":
                    attr = attr + " " + text
                elif len(attr) > 0:
                    # A node without text (e.g. a line break) ends the
                    # current value and starts a new one.
                    row = row + "<<<" + attr
                    attr = ""
        if attr != "":
            row = row + "<<<" + attr
        rows.append(row + "\n")

    # The context manager closes the file even when a write raises.
    with open('../Data/CollectedData.txt', 'a') as f:
        f.write("<<<<<" + character + "\n")
        f.writelines(rows)
        f.write("\n\n")
    return 0
def get_pq_object_inner_text(el: PyQuery):
    """Return the innerText of an element.

    An element for which ``is_pq_object_visible`` is false yields an empty
    string. Otherwise the direct text nodes and the recursively computed
    text of child elements are concatenated in document order.
    """
    if not is_pq_object_visible(el):
        return ''
    return ''.join(
        node if isinstance(node, _ElementUnicodeResult)
        else get_pq_object_inner_text(PyQuery(node))
        for node in el.contents())
 def _render_unorder_list(self, p: Paragraph, pq: PyQuery):
     """
     Render an unordered (bulleted) list.

     Every node in ``pq.contents()`` gets its own 'List Bullet' paragraph,
     added to the parent of ``p``, and the node's contents are rendered
     into that paragraph.

     :param p: paragraph whose parent receives the bullet paragraphs
     :param pq: the list element whose contents are rendered
     :return: None
     """
     container = p._parent
     for entry in pq.contents():
         bullet = container.add_paragraph(style='List Bullet')
         entry_children = pq(entry).contents()
         self._render_children(bullet,
                               entry_children,
                               skip_br=True,
                               is_root=True)
def _match_xpaths(pos: int, parent_node: PyQuery,
                  xpath_dict: Dict[str, int]) -> int:
    """Walk ``parent_node`` depth-first and record text offsets of known xpaths.

    ``pos`` is the running count of text characters seen so far. Whenever a
    child element's xpath is already a key of ``xpath_dict``, its value is
    overwritten with the current offset. Nodes for which
    ``is_pq_object_visible`` is false contribute nothing.

    :param pos: text offset at which ``parent_node`` starts
    :param parent_node: node whose contents are traversed
    :param xpath_dict: mapping updated in place (xpath -> offset)
    :return: the offset after all of ``parent_node``'s text
    """
    if is_pq_object_visible(parent_node):
        for node in parent_node.contents():
            if not isinstance(node, _ElementUnicodeResult):
                node_xpath = element_to_xpath(node)
                if node_xpath in xpath_dict:
                    xpath_dict[node_xpath] = pos
                pos = _match_xpaths(pos, PyQuery(node), xpath_dict)
            else:
                # plain text node: advance by its length
                pos += len(node)
    return pos
 def __render_inline_element(self, p: Paragraph, pq: PyQuery, bold=False,
                             italic=False, sub=False, sup=False,
                             underline=False, font_size=None, strike=False):
     """
     Render an inline element.

     Child elements are delegated to ``_render_element`` with the same
     formatting flags; text nodes become runs on ``p`` carrying the flags.

     :param p: paragraph
     :param pq: element to render
     :param bold: bold
     :param italic: italic
     :param sub: subscript
     :param sup: superscript
     :param underline: underline
     :param font_size: font size applied to text runs; left unset when falsy
     :param strike: strikethrough
     :return: None
     """
     element_types = (HtmlElement, _Element)
     formatting = dict(bold=bold,
                       italic=italic,
                       underline=underline,
                       strike=strike,
                       sup=sup,
                       sub=sub,
                       font_size=font_size)
     for node in pq.contents():
         if isinstance(node, element_types):
             self._render_element(p, node, **formatting)
         else:
             run = p.add_run(node)
             self.__force_simsun(run)
             run.underline = underline
             run.bold = bold
             run.italic = italic
             font = run.font
             font.superscript = sup
             font.subscript = sub
             if font_size:
                 font.size = font_size
             font.strike = strike
def get_dom_node_start_pos(parent_node: PyQuery,
                           target_el: PyQuery) -> Tuple[int, bool]:
    """
    Find the text offset at which target_el starts within parent_node's text.

    Examples
    --------
        doc = PyQuery(html_str)
        selected_item = doc(jquery_selector)
        start_pos,is_find = get_dom_node_start_pos(doc, selected_item)

    Parameters
    ----------
    parent_node
        The node to search in, a PyQuery object

    target_el
        The node to look for, a PyQuery object

    Returns
    -------
    Tuple[int, bool]
       int: the text position reached by the traversal; it may be a
            positive number even when the target was not found
       bool: whether the target element was found
    """

    offset = 0
    if not is_pq_object_visible(parent_node):
        return offset, False

    nodes = parent_node.contents()
    wanted_xpath = element_to_xpath(target_el)
    for node in nodes:
        if isinstance(node, _ElementUnicodeResult):  # text node
            offset += len(node)
            continue
        if element_to_xpath(node) == wanted_xpath:
            return offset, True
        inner_offset, inner_found = get_dom_node_start_pos(
            PyQuery(node), target_el)
        if inner_found:
            return offset + inner_offset, True
        # not inside this child: skip over all of its text
        offset += inner_offset
    return offset, False
Esempio n. 7
0
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    """Convert an element (and, recursively, its children) into a nested dict.

    The result has the keys ``type``, ``attrs``, ``layout``, ``contents``
    and ``extra``. ``contents`` is a list of child dicts when the element
    has child elements, otherwise the element's inner HTML; either may be
    replaced by an entry of ``PRE_CONTENTS_MATCH`` for the tag. ``extra``
    carries ``original_html`` for tables and ``src`` / ``href`` when the
    element has those attributes.

    :param elem: element to convert (wrapped in ``PyQuery`` on entry)
    :param already_wrapped: when False, the children are first passed
        through ``fix_unwrapped_text``
    :return: the dict described above
    """
    elem = PyQuery(elem)
    children = list(elem.contents())

    if len(elem.children()) > 0:
        if not already_wrapped:
            # Fix unwrapped children
            children = fix_unwrapped_text(elem).contents()
        converted = (build_dict_from_sane_json(child, already_wrapped=True)
                     for child in children)
        contents = [entry for entry in converted if entry]
    else:
        contents = elem.html()

    # Only tables need the HTML (to use later for extraction of relevant data)
    extra = {'original_html': str(elem)} if elem.is_("table") else {}

    root = elem[0]
    for attr_name in ('src', 'href'):
        if attr_name in root.attrib:
            extra[attr_name] = elem.attr(attr_name)

    tag_type = root.tag
    return {
        'type': PRE_TAG_MATCH.get(tag_type, tag_type),
        'attrs': [],
        'layout': {},
        'contents': PRE_CONTENTS_MATCH.get(tag_type, contents),
        'extra': extra
    }
Esempio n. 8
0
def _fix_unwrapped_text(children: PyQuery, do_not_wrap=False) -> List[PyQuery]:
    """Wrap loose text nodes and elements, recursing into sub elements.

    Text nodes that have siblings are passed through ``_wrap``. Elements are
    rebuilt from their tag, attributes and recursively fixed descendants;
    elements whose tag is in ``HTML_NOT_WRAPABLES`` are updated in place and
    returned unwrapped, all others are wrapped with ``_wrap``.
    (Presumably ``_wrap`` adds a ``<span>`` -- confirm against its
    definition.)

    :param children: nodes to process (text nodes and elements)
    :param do_not_wrap: when True and there is exactly one child, that child
        is returned without wrapping (an element child is expanded through
        ``fix_unwrapped_text``)
    :return: list of processed nodes; entries are strings or PyQuery objects
    """
    # Local import: only needed to rebuild attribute markup safely.
    from html import escape

    ret = []
    if do_not_wrap and len(children) == 1:
        for i in children:
            if isinstance(i, str):
                ret.append(i)
            else:
                for fixed in fix_unwrapped_text(PyQuery(i)):
                    ret.append(PyQuery(fixed))
        return ret

    # A lone text node needs no wrapper.
    if len(children) == 1 and isinstance(children[0], str):
        return [children[0]]

    for child in children:
        if isinstance(child, str) and len(children) > 1:
            ret.append(_wrap(child))
            continue

        tag = child.tag
        # Attribute values come back from the parser unescaped; escape them
        # again so a value containing '"', '&' or '<' cannot break the
        # markup that is re-parsed below.
        attribs = "".join(f'{k}="{escape(str(v), quote=True)}" '
                          for k, v in child.attrib.items())
        child = PyQuery(child)
        descendants = _fix_unwrapped_text(child.contents(),
                                          do_not_wrap=tag
                                          in HTML_NOT_WRAPABLES)
        descendants_html = "".join(
            i if isinstance(i, str) else i.outer_html()
            for i in descendants)

        if tag in HTML_NOT_WRAPABLES:
            child.html(descendants_html)
            ret.append(child)
        else:
            child = PyQuery(f'<{tag} {attribs}>{descendants_html}</{tag}>')
            ret.append(_wrap(child))

    return ret
 def render_contents(self, p: Paragraph or _Cell, body: str, flat_p=False):
     """
     Render content.

     :param flat_p: flatten <p> tags. Has no effect when the content is
         nothing but a run of sibling <p> elements, e.g.
         <p>..</p><p>..</p><p>..</p><p>..</p>; in that case it is forced
         to False
     :param p: target handed to ``_render_element`` for every top-level node
     :param body: HTML source; carriage returns, newlines and tabs are
         removed before parsing. Nothing is rendered when it is empty
     :return: None
     """
     if not body:
         return
     markup = body
     for ch in ('\r', '\n', '\t'):
         markup = markup.replace(ch, '')
     nodes = PyQuery(markup).contents()

     # True when every top-level node is a <p> element
     only_paragraphs = all(
         isinstance(node, _Element) and node.tag == 'p' for node in nodes)
     if only_paragraphs:
         flat_p = False

     for node in nodes:
         self._render_element(p, node, is_root=flat_p)
Esempio n. 10
0
    def parse(self, content: str):
        """
        Parse html to parsed object

        The parsed document is handed to ``self.__parse__``: as a single
        node when it has no child elements, otherwise child by child. The
        results are stored in ``self.parsed_objects``.

        :param content: HTML source
        :return: self, to allow call chaining
        """
        # Local import keeps this method self-contained.
        import logging

        d = PyQuery(content)
        element_list = []
        if not d:
            # Nothing was parsed; d[0] below would raise IndexError.
            pass
        elif len(d.children()) == 0:
            # if no children, parse first one
            element_list.append(self.__parse__(d[0]))
        else:
            for child in d.contents():
                try:
                    parsed = self.__parse__(child)
                except Exception:
                    # Parsing stays best-effort: a child that cannot be
                    # parsed is skipped. The failure is logged at debug
                    # level so it can be diagnosed without adding noise.
                    logging.getLogger(__name__).debug(
                        "skipping child that could not be parsed",
                        exc_info=True)
                else:
                    if parsed:
                        element_list.append(parsed)

        self.parsed_objects = element_list
        return self
    def _render_element(self,
                        p: Paragraph,
                        element: str or Element,
                        is_root=False,
                        bold=False,
                        italic=False,
                        strike=False,
                        underline=False,
                        font_size=None,
                        sup=False,
                        sub=False):
        """
        Convert an HTML node into Word content.

        Text nodes become a run on ``p``. Elements are dispatched on their
        tag: inline formatting tags switch on one flag and recurse, block
        tags add paragraphs or breaks, and lists, tables, images and spans
        are delegated to dedicated renderers.

        :param p: paragraph being rendered into
        :param element: a text node (str) or an element
        :param is_root: when True a <p> element is rendered into ``p``
            itself instead of into a new paragraph
        :param bold: inherited bold flag
        :param italic: inherited italic flag
        :param strike: inherited strikethrough flag
        :param underline: inherited underline flag
        :param font_size: inherited font size; left unset when falsy
        :param sup: inherited superscript flag
        :param sub: inherited subscript flag
        :return: None
        """
        if isinstance(element, str):
            run = p.add_run(self._clear_text(element))
            run.bold = bold
            run.italic = italic
            run.font.strike = strike
            run.font.underline = underline
            run.font.subscript = sub
            run.font.superscript = sup
            if font_size:
                run.font.size = font_size
            self.__force_simsun(run)
            return

        pq = PyQuery(element)

        if pq.is_('p'):  # nested <p> is not supported; flattened automatically
            contents = pq.contents()
            align = self._get_pq_style(pq, 'text-align')
            if align == 'center':
                alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == 'right':
                alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                alignment = WD_ALIGN_PARAGRAPH.LEFT

            # NOTE(review): the alignment is applied to ``p`` even when the
            # content ends up in a new paragraph below; kept as in the
            # original behavior -- confirm this is intended.
            p.alignment = alignment
            if is_root:
                self._render_children(p, contents)
            else:
                sub_p = p._parent.add_paragraph()
                sub_p.alignment = alignment
                self._render_children(sub_p, contents)
            return

        # Formatting inherited from the caller, passed on to children.
        inherited = dict(bold=bold,
                         italic=italic,
                         strike=strike,
                         underline=underline,
                         font_size=font_size,
                         sub=sub,
                         sup=sup)

        # Inline tags: each one switches a single flag on and keeps the rest.
        inline_flags = (
            ('u', 'underline'),  # underline
            ('strong', 'bold'),  # bold
            ('b', 'bold'),
            ('i', 'italic'),  # italic
            ('em', 'italic'),
            ('sub', 'sub'),  # subscript
            ('sup', 'sup'),  # superscript
            ('var', 'italic'),  # legacy formula markup
        )
        for selector, flag in inline_flags:
            if pq.is_(selector):
                self.__render_inline_element(
                    p, pq, **dict(inherited, **{flag: True}))
                return

        if pq.is_('span'):
            # sub/sup are forwarded too, so a span nested inside <sub>/<sup>
            # keeps that formatting.
            self._render_span(p, pq, **inherited)
        elif pq.is_("br"):
            p.add_run().add_break()
        elif pq.is_("div"):
            # A div starts on a new line of the same paragraph.
            p.add_run().add_break()
            self._render_children(p, pq.contents())
        elif pq.is_('ul'):
            self._render_unorder_list(p, pq)
        elif pq.is_('ol'):
            self._render_order_list(p, pq)
        elif pq.is_('table'):
            self._render_table(p, pq)
        elif pq.is_('img'):  # image
            self._render_img(p, pq)
        elif element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            # Headings get their own paragraph, rendered bold + italic at 12pt.
            sub_p = p._parent.add_paragraph()
            self.__render_inline_element(sub_p,
                                         pq,
                                         bold=True,
                                         font_size=Pt(12),
                                         underline=underline,
                                         italic=True,
                                         strike=strike,
                                         sub=sub,
                                         sup=sup)
        else:
            # Unknown tag: render its children into a fresh paragraph.
            sub_p = p._parent.add_paragraph()
            contents = pq.contents()
            self._render_children(sub_p, contents)
    def _render_span(self,
                     p: Paragraph,
                     pq: PyQuery,
                     bold=False,
                     italic=False,
                     strike=False,
                     underline=False,
                     font_size=None,
                     sub=False,
                     sup=False):
        """
        Convert a <span>.

        change 19.5.3
            When formula conversion fails, the image is used directly.

        Formula spans (a ``data-latex`` attribute, or the ``math-tex`` /
        ``afanti-latex`` classes) are converted to OMML and appended to the
        paragraph XML. Any other span is rendered as formatted text, with
        the span's inline style merged into the inherited formatting.

        :param p: paragraph being rendered into
        :param pq: the span element
        :param bold: inherited bold flag
        :param italic: inherited italic flag
        :param strike: inherited strikethrough flag
        :param underline: inherited underline flag
        :param font_size: inherited font size; a px/pt ``font-size`` style
            on the span overrides it
        :param sub: inherited subscript flag
        :param sup: inherited superscript flag
        :return: None
        """

        def append_formula(latex):
            # Convert LaTeX to OMML and attach it to the paragraph XML. The
            # math namespace is declared on the <m:oMath> tag itself.
            omml_str = converter.to_omml(self.mini_trim(latex))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)

        try:
            if pq.attr('data-latex'):  # formula
                append_formula(pq.attr('data-latex'))
                return

            if pq.has_class("math-tex"):  # formula
                # data-latex is empty here (handled above), so the formula
                # source is the span's inner HTML.
                inner_html = pq.html()
                latex = html.unescape(
                    inner_html) if inner_html is not None else ''
                latex = latex.replace(r'\(', '').replace(r'\)', '')
                append_formula(latex)
                return

            # afanti formula
            if pq.has_class('afanti-latex'):
                metadata = AftQuestion(pq).parse_element()
                if metadata.startswith(('^', '_')):
                    # A leading script marker needs a base: take the last
                    # character of the previous child's text.
                    last_ele = pq(p._element).children()[-1]
                    metadata = last_ele.text[-1] + metadata
                    last_ele.text = last_ele.text[:-1]
                append_formula(metadata)
                return
        except EquationConvertError:
            # Conversion failed: fall back to the <img> inside the span.
            img = PyQuery('img', pq)
            self._render_img(p, img)
            return

        # Merge the span's inline style into the inherited formatting.
        font_weight = self._get_pq_style(pq, 'font-weight')
        font_style = self._get_pq_style(pq, 'font-style')
        decorations = (self._get_pq_style(pq, 'text-decoration'),
                       self._get_pq_style(pq, 'text-decoration-line'))
        bold = bool(bold) or font_weight in ('bold', 'bolder')
        italic = bool(italic) or font_style == 'italic'
        strike = bool(strike) or 'line-through' in decorations
        underline = bool(underline) or 'underline' in decorations

        size = self._get_pq_style(pq, 'font-size')
        if size:
            if size.endswith('px'):
                font_size = self.get_pt(int(float(size[:-2])))
            elif size.endswith('pt'):
                font_size = Pt(float(size[:-2]))

        font_name = self._get_pq_style(pq, 'font-name')
        for item in pq.contents():
            if isinstance(item, (HtmlElement, _Element)):
                # sub/sup are forwarded so nested elements keep the
                # subscript/superscript state of this span.
                self._render_element(p,
                                     item,
                                     is_root=True,
                                     bold=bold,
                                     italic=italic,
                                     strike=strike,
                                     underline=underline,
                                     font_size=font_size,
                                     sub=sub,
                                     sup=sup)
                continue
            run = p.add_run(self._clear_text(item))
            self.__force_simsun(run)
            if font_name:
                run.font.name = font_name
            if font_size:
                run.font.size = font_size

            run.underline = underline

            run.bold = bold
            run.italic = italic
            run.font.strike = strike
            run.font.superscript = sup
            run.font.subscript = sub