def _collapse(self, html_str: str) -> str:
    """Rewrite selected sections of the card HTML through the collapse templates.

    Every ``def-block ddef_block`` div is replaced by ``HTML_COLLAPSE1``
    filled with a header and the block itself.  The phrasal-verb and idiom
    cross-reference divs are then rebuilt with their middle part passed
    through ``HTML_COLLAPSE``.

    Args:
        html_str: HTML of the card to process.

    Returns:
        The HTML with the sections above replaced.
    """
    def fold_definition(block):
        # The header comes from the translation span found inside the
        # definition body of this block.
        body = htmls.find(block, 'div', 'def-body ddef_b')
        header = htmls.find(body, 'span', 'trans dtrans dtrans-se')
        return HTML_COLLAPSE1.format(header, block)

    def fold_xref(block):
        # NOTE(review): groups 1-3 of parse_tag are presumably the opening
        # tag, the inner content and the closing tag - confirm against the
        # definition of parse_tag.
        found = parse_tag.match(block)
        head, inner, tail = found.group(1, 2, 3)
        Log.d(TAG, '{}\n{}\n{}'.format(head, inner, tail))
        return head + HTML_COLLAPSE.format(inner) + tail

    html_str = htmls.sub(html_str, fold_definition, 'div', 'def-block ddef_block')

    # Phrasal verbs first, then idioms: same order as the original passes.
    xref_classes = (
        'xref phrasal_verbs hax dxref-w lmt-25 lmb-25',
        'xref idioms hax dxref-w lmt-25 lmb-25',
    )
    for css_class in xref_classes:
        html_str = htmls.sub(html_str, fold_xref, 'div', css_class)
    return html_str
def _extract_fields(self, html_str: str) -> List[str]:
    """Extract the front and back card fields from a dictionary page.

    The back field starts as the ``di-body`` div of the page and is then
    cleaned step by step; the front field is the ``di-title`` div found
    inside it.  Audio elements are kept, with their media paths rewritten
    against ``URL_ROOT``.

    Args:
        html_str: HTML of the page.

    Returns:
        A two-item list ``[front, back]``.

    Raises:
        ExtractError: If any step fails.  The original exception is passed
            as the second argument and also chained as the cause.
    """
    def remove_tag(h):
        # Replace the element with group 2 of parse_tag.
        # NOTE(review): group 2 is presumably the inner content - confirm
        # against the definition of parse_tag.
        return parse_tag.sub(r'\g<2>', h)

    try:
        back = htmls.find(html_str, 'div', 'class="di-body"')
        front = htmls.find(back, 'div', 'class="di-title"')

        # remove titles (the title was already captured as the front field)
        back = htmls.removeall(back, 'div', 'class="di-title"')

        # support online audios: prefix site-relative media paths with
        # URL_ROOT.  A plain str.replace is enough for this literal prefix
        # and does not interpret backslashes in URL_ROOT.
        back = back.replace('src="/zhs/media',
                            'src="{}zhs/media'.format(URL_ROOT))

        # remove phrases and idioms
        back = htmls.removeall(back, 'div', 'class="xref')

        # blocks the original author marked as "seems useless"
        back = htmls.removeall(back, 'div', 'class="cid"')
        back = htmls.removeall(back, 'div', 'class="dwl hax"')

        # remove links
        back = htmls.sub(back, remove_tag, 'a', 'class="query"')
        back = htmls.sub(back, remove_tag, 'a', 'href=')

        # remove share
        back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')

        # remove more examples
        back = htmls.removeall(back, 'div', 'class="daccord"')

        # remove js
        back = htmls.removeall(back, 'script')

        # remove underlines
        back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')

        # remove ads
        back = htmls.removeall(back, 'div', 'ad_contentslot')
        back = htmls.removeall(back, 'div', 'class="bb hax"')

        # collapse long cards
        if len(back) > THRESHOLD_COLLAPSE:
            back = self._collapse(back)

        return [front, back]
    except Exception as e:
        # Chain explicitly so the traceback shows the real cause.
        raise ExtractError('can\'t extract fields', e) from e
def _extract_fields(self, html_str: str) -> List[str]:
    """Extract the front and back card fields from a dictionary page.

    The back field starts as the ``di-body`` div of the page with all
    newline characters removed, and is then cleaned step by step; the
    front field is the ``di-title`` div found inside it.

    Args:
        html_str: HTML of the page.

    Returns:
        A two-item list ``[front, back]``.

    Raises:
        ExtractError: If any step fails.  The original exception is passed
            as the second argument and also chained as the cause.
    """
    def remove_tag(h):
        # Replace the element with group 2 of parse_tag.
        # NOTE(review): group 2 is presumably the inner content - confirm
        # against the definition of parse_tag.
        return parse_tag.sub(r'\g<2>', h)

    try:
        back = htmls.find(html_str, 'div', 'class="di-body"').replace('\n', '')
        front = htmls.find(back, 'div', 'class="di-title"')

        # remove titles (the title was already captured as the front field)
        back = htmls.removeall(back, 'div', 'class="di-title"')

        # remove audios
        back = htmls.removeall(back, 'span', 'class="daud"')

        # remove amp-access
        back = htmls.removeall(back, 'a', 'amp-access=')

        # remove links
        back = htmls.sub(back, remove_tag, 'a', 'class="query"')
        back = htmls.sub(back, remove_tag, 'a', 'href=')

        # remove share
        back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')

        # remove more examples
        back = htmls.removeall(back, 'div', 'class="daccord"')

        # remove js
        back = htmls.removeall(back, 'script')

        # remove underlines
        back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')

        # remove ads
        back = htmls.removeall(back, 'div', 'ad_contentslot')
        back = htmls.removeall(back, 'div', 'class="bb hax"')

        # collapse long cards
        if len(back) > THRESHOLD_COLLAPSE:
            back = self._collapse(back)

        return [front, back]
    except Exception as e:
        # Chain explicitly so the traceback shows the real cause.
        raise ExtractError('can\'t extract fields', e) from e
def test_sub(self):
    """Run ``htmls.sub`` over ``self.HTML`` and log what it returns.

    The callback keeps only the first captured group of the tag pattern.
    It is applied for ``a`` elements selected by the example.com ``href``.
    Nothing is asserted; the result is only written to the debug log.
    """
    tag_pattern = r'<[\s\S]*?>([\s\S]*)<[\s\S]*>'

    def keep_first_group(fragment):
        return re.sub(tag_pattern, r'\g<1>', fragment)

    result = htmls.sub(
        self.HTML, keep_first_group, 'a', 'href="http://example.com/"')
    Log.d(TAG, result)