Beispiel #1
0
    def _collapse(self, html_str: str) -> str:
        """Wrap selected divs of ``html_str`` in the collapse templates.

        Three ``htmls.sub`` passes are applied in order: one over
        ``def-block ddef_block`` divs using ``HTML_COLLAPSE1``, then one
        each over the phrasal-verb and idiom ``xref`` divs using
        ``HTML_COLLAPSE``.

        :param html_str: HTML to rewrite.
        :return: the HTML after all three substitution passes.
        """
        def wrap_definition(block):
            # The span found inside the block's 'def-body' div is passed
            # as the first template argument, the whole block as the second.
            body = htmls.find(block, 'div', 'def-body ddef_b')
            heading = htmls.find(body, 'span', 'trans dtrans dtrans-se')
            return HTML_COLLAPSE1.format(heading, block)

        def wrap_inner(block):
            # Only group 2 of the parse_tag match goes through the template;
            # groups 1 and 3 are re-attached around it unchanged.
            match = parse_tag.match(block)
            head, inner, tail = match.group(1, 2, 3)
            Log.d(TAG, '{}\n{}\n{}'.format(head, inner, tail))
            return head + HTML_COLLAPSE.format(inner) + tail

        result = htmls.sub(html_str, wrap_definition, 'div',
                           'def-block ddef_block')

        xref_classes = (
            'xref phrasal_verbs hax dxref-w lmt-25 lmb-25',
            'xref idioms hax dxref-w lmt-25 lmb-25',
        )
        for xref_class in xref_classes:
            result = htmls.sub(result, wrap_inner, 'div', xref_class)
        return result
Beispiel #2
0
    def _extract_fields(self, html_str: str) -> List[str]:
        """Split the page HTML into the card's front and back fields.

        The ``di-body`` div is taken as the back. The ``di-title`` div found
        inside it becomes the front and is then stripped from the back,
        together with the other elements removed below. Media ``src``
        attributes are rewritten to start with ``URL_ROOT``.

        :param html_str: HTML to extract the fields from.
        :return: ``[front, back]``.
        :raises ExtractError: if any extraction step raises; the original
            exception is passed as the second argument and chained as the
            cause.
        """
        try:
            back = htmls.find(html_str, 'div', 'class="di-body"')
            front = htmls.find(back, 'div', 'class="di-title"')

            # remove titles
            back = htmls.removeall(back, 'div', 'class="di-title"')
            # audio spans ('class="daud"') are deliberately kept; their
            # site-relative media paths are prefixed with URL_ROOT instead
            # to support online audio
            back = re.sub(r'src="/zhs/media',
                          'src="{}zhs/media'.format(URL_ROOT), back)
            # remove phrases and idioms
            back = htmls.removeall(back, 'div', 'class="xref')
            # seems useless
            back = htmls.removeall(back, 'div', 'class="cid"')
            back = htmls.removeall(back, 'div', 'class="dwl hax"')

            def remove_tag(h):
                # Replace each parse_tag match with its second group
                # (presumably the tag's inner content; parse_tag is
                # defined elsewhere).
                return parse_tag.sub(r'\g<2>', h)

            # remove links
            back = htmls.sub(back, remove_tag, 'a', 'class="query"')
            back = htmls.sub(back, remove_tag, 'a', 'href=')

            # remove share
            back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')
            # remove more examples
            back = htmls.removeall(back, 'div', 'class="daccord"')
            # remove js
            back = htmls.removeall(back, 'script')
            # remove underlines
            back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')
            # remove ads
            back = htmls.removeall(back, 'div', 'ad_contentslot')
            back = htmls.removeall(back, 'div', 'class="bb hax"')
            # collapse long cards
            if len(back) > THRESHOLD_COLLAPSE:
                back = self._collapse(back)
            return [front, back]
        except Exception as e:
            # Chain explicitly so the traceback shows the root cause as the
            # direct cause of the ExtractError.
            raise ExtractError('can\'t extract fields', e) from e
Beispiel #3
0
    def _extract_fields(self, html_str: str) -> List[str]:
        """Split the page HTML into the card's front and back fields.

        The ``di-body`` div, with newlines removed, is taken as the back.
        The ``di-title`` div found inside it becomes the front and is then
        stripped from the back, together with the other elements removed
        below.

        :param html_str: HTML to extract the fields from.
        :return: ``[front, back]``.
        :raises ExtractError: if any extraction step raises; the original
            exception is passed as the second argument and chained as the
            cause.
        """
        try:
            back = htmls.find(html_str, 'div',
                              'class="di-body"').replace('\n', '')
            front = htmls.find(back, 'div', 'class="di-title"')

            # remove titles
            back = htmls.removeall(back, 'div', 'class="di-title"')
            # remove audios
            back = htmls.removeall(back, 'span', 'class="daud"')
            # remove amp-access
            back = htmls.removeall(back, 'a', 'amp-access=')

            def remove_tag(h):
                # Replace each parse_tag match with its second group
                # (presumably the tag's inner content; parse_tag is
                # defined elsewhere).
                return parse_tag.sub(r'\g<2>', h)

            # remove links
            back = htmls.sub(back, remove_tag, 'a', 'class="query"')
            back = htmls.sub(back, remove_tag, 'a', 'href=')

            # remove share
            back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')
            # remove more examples
            back = htmls.removeall(back, 'div', 'class="daccord"')
            # remove js
            back = htmls.removeall(back, 'script')
            # remove underlines
            back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')
            # remove ads
            back = htmls.removeall(back, 'div', 'ad_contentslot')
            back = htmls.removeall(back, 'div', 'class="bb hax"')
            # collapse long cards
            if len(back) > THRESHOLD_COLLAPSE:
                back = self._collapse(back)
            return [front, back]
        except Exception as e:
            # Chain explicitly so the traceback shows the root cause as the
            # direct cause of the ExtractError.
            raise ExtractError('can\'t extract fields', e) from e
Beispiel #4
0
    def test_sub(self):
        """Log what ``htmls.sub`` returns for the example.com link in HTML.

        The callback handed to ``htmls.sub`` keeps only the text captured
        between a leading ``<...>`` and a trailing ``<...>``.
        """
        def strip_outer_tag(fragment):
            pattern = r'<[\s\S]*?>([\s\S]*)<[\s\S]*>'
            return re.sub(pattern, r'\g<1>', fragment)

        replaced = htmls.sub(self.HTML, strip_outer_tag, 'a',
                             'href="http://example.com/"')
        Log.d(TAG, replaced)