Beispiel #1
0
    def _extract_fields(self, html_str: str) -> List[str]:
        """Extract the front and back fields of a card from page HTML.

        The ``div`` with ``class="di-body"`` becomes the back of the card and
        the ``div`` with ``class="di-title"`` found inside it becomes the
        front. The back is then cleaned (titles, phrases/idioms, links, share
        block, extra examples, scripts and ad slots are removed or unwrapped),
        its media URLs are made absolute, and it is collapsed when longer
        than ``THRESHOLD_COLLAPSE``.

        Args:
            html_str: HTML source to extract the fields from.

        Returns:
            A two-element list ``[front, back]``.

        Raises:
            ExtractError: If any extraction step raises. The original
                exception is passed as the second argument and is also
                chained as ``__cause__``.
        """

        def remove_tag(h):
            # Keep only capture group 2 of ``parse_tag`` (defined elsewhere;
            # presumably the element's inner content -- confirm there).
            return parse_tag.sub(r'\g<2>', h)

        try:
            back = htmls.find(html_str, 'div', 'class="di-body"')
            front = htmls.find(back, 'div', 'class="di-title"')

            # remove titles
            back = htmls.removeall(back, 'div', 'class="di-title"')
            # support online audios: turn root-relative media URLs into
            # absolute ones under URL_ROOT. A plain string replace is used
            # because both sides are literals, not regex patterns.
            back = back.replace('src="/zhs/media',
                                'src="{}zhs/media'.format(URL_ROOT))
            # remove phrases and idioms
            # NOTE(review): the attribute text has no closing quote --
            # presumably a deliberate prefix match; confirm against htmls.
            back = htmls.removeall(back, 'div', 'class="xref')
            # seems useless
            back = htmls.removeall(back, 'div', 'class="cid"')
            back = htmls.removeall(back, 'div', 'class="dwl hax"')

            # remove links
            back = htmls.sub(back, remove_tag, 'a', 'class="query"')
            back = htmls.sub(back, remove_tag, 'a', 'href=')

            # remove share
            back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')
            # remove more examples
            back = htmls.removeall(back, 'div', 'class="daccord"')
            # remove js
            back = htmls.removeall(back, 'script')
            # remove underlines
            back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')
            # remove ads
            back = htmls.removeall(back, 'div', 'ad_contentslot')
            back = htmls.removeall(back, 'div', 'class="bb hax"')
            # collapse long cards
            if len(back) > THRESHOLD_COLLAPSE:
                back = self._collapse(back)
            return [front, back]
        except Exception as e:
            # Chain explicitly so the traceback shows the root cause as the
            # direct cause of the ExtractError.
            raise ExtractError('can\'t extract fields', e) from e
Beispiel #2
0
    def _extract_fields(self, html_str: str) -> List[str]:
        """Extract the front and back fields of a card from page HTML.

        The ``div`` with ``class="di-body"`` (with newlines stripped) becomes
        the back of the card and the ``div`` with ``class="di-title"`` found
        inside it becomes the front. The back is then cleaned (titles,
        audios, amp-access anchors, links, share block, extra examples,
        scripts and ad slots are removed or unwrapped) and collapsed when
        longer than ``THRESHOLD_COLLAPSE``.

        Args:
            html_str: HTML source to extract the fields from.

        Returns:
            A two-element list ``[front, back]``.

        Raises:
            ExtractError: If any extraction step raises. The original
                exception is passed as the second argument and is also
                chained as ``__cause__``.
        """

        def remove_tag(h):
            # Keep only capture group 2 of ``parse_tag`` (defined elsewhere;
            # presumably the element's inner content -- confirm there).
            return parse_tag.sub(r'\g<2>', h)

        try:
            # Newlines are dropped from the body before any further matching.
            back = htmls.find(html_str, 'div',
                              'class="di-body"').replace('\n', '')
            front = htmls.find(back, 'div', 'class="di-title"')

            # remove titles
            back = htmls.removeall(back, 'div', 'class="di-title"')
            # remove audios
            back = htmls.removeall(back, 'span', 'class="daud"')
            # remove amp-access
            back = htmls.removeall(back, 'a', 'amp-access=')

            # remove links
            back = htmls.sub(back, remove_tag, 'a', 'class="query"')
            back = htmls.sub(back, remove_tag, 'a', 'href=')

            # remove share
            back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')
            # remove more examples
            back = htmls.removeall(back, 'div', 'class="daccord"')
            # remove js
            back = htmls.removeall(back, 'script')
            # remove underlines
            back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')
            # remove ads
            back = htmls.removeall(back, 'div', 'ad_contentslot')
            back = htmls.removeall(back, 'div', 'class="bb hax"')
            # collapse long cards
            if len(back) > THRESHOLD_COLLAPSE:
                back = self._collapse(back)
            return [front, back]
        except Exception as e:
            # Chain explicitly so the traceback shows the root cause as the
            # direct cause of the ExtractError.
            raise ExtractError('can\'t extract fields', e) from e
Beispiel #3
0
 def collapse1(h):
     """Render ``h`` through the ``HTML_COLLAPSE1`` template.

     The template's first field is the ``trans dtrans dtrans-se`` span that
     ``htmls.find`` returns from within the ``def-body ddef_b`` div of ``h``;
     the second field is ``h`` itself.
     """
     definition_body = htmls.find(h, 'div', 'def-body ddef_b')
     translation = htmls.find(definition_body,
                              'span', 'trans dtrans dtrans-se')
     return HTML_COLLAPSE1.format(translation, h)
Beispiel #4
0
 def test_find(self):
     """``htmls.find(self.HTML, 'a')`` yields the expected anchor markup."""
     expected = '<a href="http://example.org/">example.org</a>'
     found = htmls.find(self.HTML, 'a')
     self.assertEqual(expected, found)