async def example(q):
    """Fetch a French Wikipedia page and return its visible text.

    The page at ``https://fr.wikipedia.org/wiki/<q>`` is downloaded with
    ``asks``, parsed and cleaned with ``html_text``, and converted to plain
    text. The extracted text is also printed to stdout.

    Args:
        q: Article title; it is appended to the wiki URL as-is.

    Returns:
        The text extracted from the cleaned page tree.
    """
    response = await asks.get('https://fr.wikipedia.org/wiki/' + q)
    tree = html_text.parse_html(response.text)
    cleaned_tree = html_text.cleaner.clean_html(tree)
    # Convert the tree once and reuse the result for both the console output
    # and the return value instead of running the extraction twice.
    text = html_text.etree_to_text(cleaned_tree)
    print(text)
    return text
def chord():
    """Return the chord sheet matching the ``query`` request argument.

    The first search hit on app.chordindonesia.com is looked up, its post is
    fetched, and the post's HTML content is reduced to plain text.

    Returns:
        A dict with ``code`` ``"200"`` and the chord text under ``result`` on
        success, or a dict with ``code`` ``"404"`` and a ``message`` when the
        argument is missing or the lookup fails.
    """
    from urllib.parse import quote_plus

    query = request.args.get('query')
    if not query:
        return {
            "code": "404",
            "status": "error",
            "message": "Masukan parameter query"
        }

    try:
        # quote_plus turns spaces into '+' (as before) and additionally
        # escapes characters such as '&' or '#' that would otherwise break
        # the query string.
        search_url = 'http://app.chordindonesia.com/?json=get_search_results&exclude=date,modified,attachments,comment_count,comment_status,thumbnail,thumbnail_images,author,excerpt,content,categories,tags,comments,custom_fields&search=%s' % quote_plus(query)
        post_id = get(search_url).json()['posts'][0]['id']
        post = get('http://app.chordindonesia.com/?json=get_post&id=%s' % post_id).json()
        result = html_text.parse_html(post['post']['content']).text_content()
    except Exception:
        # Any failure (network error, no hits, unexpected payload) is
        # reported as "not found". Unlike a bare ``except`` this lets
        # SystemExit and KeyboardInterrupt propagate.
        return {
            "code": "404",
            "status": "error",
            "message": "Chord yang anda minta tidak dapat ditemukan"
        }
    return {
        "code": "200",
        "status": "sukses",
        "result": result
    }
def test_broken_cfemail():
    """Every sample of the fixture must yield exactly one ``audit_etree`` item."""
    fixture = """ <span class="__cf_email__" data-cfemail="*****@*****.**">Sales at MAQSoftware dot com</span> """
    for snippet in split_text(fixture):
        findings = list(audit_etree(parse_html(snippet)))
        assert len(findings) == 1
def cordIndo(q):
    """Return the plain-text chord sheet for the search term ``q``.

    The first search hit on app.chordindonesia.com is looked up, its post is
    fetched, and the post's HTML content is reduced to plain text.

    Args:
        q: Search term. Words may be separated by spaces or by ``+``.

    Returns:
        The chord text, or a fixed apology message when anything goes wrong
        (the exception is printed to stdout).
    """
    from urllib.parse import quote_plus

    try:
        # Escape the term before placing it in the query string so that
        # characters such as '&' or '#' cannot truncate or alter the request.
        # '+' is kept literal so terms that are already '+'-joined still work.
        term = quote_plus(str(q), safe='+')
        search_url = 'http://app.chordindonesia.com/?json=get_search_results&exclude=date,modified,attachments,comment_count,comment_status,thumbnail,thumbnail_images,author,excerpt,content,categories,tags,comments,custom_fields&search=%s' % term
        id_ = get(search_url).json()['posts'][0]['id']
        post = get('http://app.chordindonesia.com/', params={"json": "get_post", "id": id_}).json()
        return html_text.parse_html(post['post']['content']).text_content()
    except Exception as e:
        print(e)
        return "[❗] Maaf chord yang anda cari tidak dapat saya temukan"
def test_cfemail():
    """Each fixture sample must produce exactly one distinct result containing ``@``."""
    # NOTE(review): the fixture is rendered on a single line in this view;
    # split_text presumably expects one sample per line - confirm the
    # literal's line breaks against the real file.
    fixture = """ <a href="/cdn-cgi/l/email-protection#e48d8a828ba496819091968ac98b8ac98d8a978d838c90ca878b89"><span class="__cf_email__" data-cfemail="cfa6a1a9a08fbdaabbbabda1e2a0a1e2a6a1bca6a8a7bbe1aca0a2">[email protected]</span></a> <a href="/cdn-cgi/l/email-protection#761f1810193602121711041903065815191b"><span class="__cf_email__" data-cfemail="a2cbccc4cde2d6c6c3c5d0cdd7d28cc1cdcf">[email protected]</span></a> <a href="/cdn-cgi/l/email-protection#7b131e1717143b081e091e151f120b120f024955181416"><span class="__cf_email__" data-cfemail="a5cdc0c9c9cae5d6c0d7c0cbc1ccd5ccd1dc978bc6cac8">[email protected]</span></a> <a class='underline' href="/cdn-cgi/l/email-protection#452c2b232a052824372e2031373c2c2b266b262a28"><span class="__cf_email__" data-cfemail="a3cacdc5cce3cec2d1c8c6d7d1dacacdc08dc0ccce">[email protected]</span></a> """
    for snippet in split_text(fixture):
        # Duplicates are collapsed before counting.
        distinct = list(set(audit_etree(parse_html(snippet))))
        context = (snippet, distinct)
        assert len(distinct) == 1 and '@' in distinct[0], context
def test_webpages(page, extracted):
    """Compare text extracted from a stored page with its expected text.

    Both ``extract_text`` on the raw HTML and ``etree_to_text`` on the parsed,
    cleaned tree must equal the contents loaded from ``extracted``.
    """
    source = _load_file(page)
    if not six.PY3:
        # FIXME: Python 2 yields '\xa0' where Python 3 yields ' '. The test
        # deliberately ignores that difference; which behaviour is correct
        # is still an open question.
        # NOTE(review): as rendered here both arguments look like a plain
        # space, which would make this call a no-op - confirm the intended
        # first argument.
        source = source.replace(' ', ' ')
    wanted = _load_file(extracted)

    # Path 1: straight from the HTML string.
    assert extract_text(source) == wanted

    # Path 2: explicit parse -> clean -> convert.
    cleaned = cleaner.clean_html(parse_html(source))
    assert etree_to_text(cleaned) == wanted
def test_etree_mailto():
    """Each fixture sample must produce exactly one result containing ``@``."""
    # NOTE(review): the fixture is rendered on a single line in this view;
    # split_text presumably expects one sample per line - confirm the
    # literal's line breaks against the real file.
    fixture = """ <a href="mailto:[email protected]"> <a href="mailto:%66%6f%6f%40%62%61%72%2e%63%6f%6d"> <a href="mailto:silvan3@tilllate.com"> <a href="mailto:%73%69%6c%76%61%6e%34%40%74%69%6c%6c%6c%61%74%65%2e%63%6f%6d"> <a href="mailto:%75%73%65%72%40%64%6f%6d%61%69%6e%2e%74%6c%64"> <a href="mailto:user@domain.tld"> <a href='mailto:%4Aohn.Doe@e%78ample%2E%63om'> <a href='mailto:john@yahoo.com'> <meta itemprop="email" content="*****@*****.**" /> """
    for snippet in split_text(fixture):
        found = list(audit_etree(parse_html(snippet)))
        context = (snippet, found)
        assert len(found) == 1 and '@' in found[0], context
def brainly(url: str):
    """Scrape the question and first answer from a Brainly page.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    The question is read from the first ``<h1 data-test="question-box-text">``
    and the answer from the first ``<div data-test="answer-box-text">``, with
    ``<br/>`` converted to newlines before the markup is reduced to text.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with keys ``"soal"`` (question) and ``"jawaban"`` (answer).
        A value is ``None`` when the corresponding element is not on the page.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Start from None so a page lacking either element still produces a
    # well-formed result instead of failing on an unassigned local.
    soal = None
    jawaban = None

    # Look each element up once rather than re-running the same query.
    question = soup.find("h1", attrs={"data-test": "question-box-text"})
    if question is not None:
        soal = question.span.text.strip()

    answer = soup.find("div", attrs={"data-test": "answer-box-text"})
    if answer is not None:
        # Debug output: the nested <div> elements of the answer box.
        print(answer("div"))
        answer_markup = str(answer).replace("<br/>", "\n")
        jawaban = html_text.parse_html(answer_markup).text_content()

    return {"soal": soal, "jawaban": jawaban}
def chord():
    """Return the chord sheet matching the ``lagu`` request argument.

    The first search hit on app.chordindonesia.com is looked up, its post is
    fetched, and the post's HTML content is reduced to plain text.

    Returns:
        ``{'status': 200, 'result': <text>}`` on success. When the argument
        is missing or the lookup fails, a dict with ``'status': False`` and a
        message under ``'pesan'`` or ``'error'`` respectively (the exception
        is printed to stdout in the latter case).
    """
    from urllib.parse import quote_plus

    lagu = request.args.get('lagu')
    if not lagu:
        # The message names the parameter that is actually read.
        return {'status': False, 'pesan': 'Masukkan parameter lagu'}

    try:
        # quote_plus turns spaces into '+' (as before) and additionally
        # escapes characters such as '&' or '#' that would otherwise break
        # the query string.
        search_url = 'http://app.chordindonesia.com/?json=get_search_results&exclude=date,modified,attachments,comment_count,comment_status,thumbnail,thumbnail_images,author,excerpt,content,categories,tags,comments,custom_fields&search=%s' % quote_plus(lagu)
        post_id = get(search_url).json()['posts'][0]['id']
        post = get('http://app.chordindonesia.com/?json=get_post&id=%s' % post_id).json()
        result = html_text.parse_html(post['post']['content']).text_content()
        return {'status': 200, 'result': result}
    except Exception as e:
        print(e)
        # ``False`` (not ``false``): the lowercase name is undefined in
        # Python and made this error path raise NameError.
        return {
            'status': False,
            'error': '[❗] Maaf chord yang anda cari tidak dapat saya temukan!'
        }
def test_extract_text_from_node(all_options):
    """``extract_text`` on the ``<p>`` element returns only the paragraph text.

    ``all_options`` is forwarded to ``extract_text`` as keyword arguments.
    """
    markup = (
        u'<html><style>.div {}</style>'
        '<body><p>Hello, world!</p></body></html>'
    )
    paragraph = parse_html(markup).xpath('//p')[0]
    text = extract_text(paragraph, **all_options)
    assert text == u'Hello, world!'
def test_extract_text_from_tree(all_options):
    """``extract_text`` on a parsed tree returns the body text, not the CSS.

    ``all_options`` is forwarded to ``extract_text`` as keyword arguments.
    """
    markup = (
        u'<html><style>.div {}</style>'
        '<body><p>Hello, world!</body></html>'
    )
    document = parse_html(markup)
    text = extract_text(document, **all_options)
    assert text == u'Hello, world!'
def test_extract_text_from_tree():
    """``extract_text`` on a parsed tree returns the body text, not the CSS."""
    document = parse_html(
        u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
    )
    text = extract_text(document)
    assert text == u'Hello, world!'
def test_extract_text_from_fail_html():
    """``extract_text`` on a ``<frameset>`` element yields an empty string."""
    markup = "<html><frameset><frame></frameset></html>"
    frameset = parse_html(markup).xpath('/html/frameset')[0]
    text = extract_text(frameset)
    assert text == u''