Beispiel #1
0
async def example(q):
    """Fetch a French Wikipedia page and return its extracted text.

    The page is downloaded with ``asks``, parsed and cleaned with
    ``html_text``, and the resulting text is printed before being returned.

    Args:
        q: Page title, appended verbatim to the wiki URL.
            NOTE(review): no URL-quoting is applied; presumably callers
            pass an already URL-safe title -- confirm.

    Returns:
        The value produced by ``html_text.etree_to_text`` for the cleaned
        tree.
    """
    response = await asks.get('https://fr.wikipedia.org/wiki/' + q)
    tree = html_text.parse_html(response.text)
    cleaned_tree = html_text.cleaner.clean_html(tree)
    # Extract once; the same text is both printed and returned, so there is
    # no need to walk the tree a second time.
    text = html_text.etree_to_text(cleaned_tree)
    print(text)
    return text
Beispiel #2
0
def chord():
    """Look up a chord sheet for the ``query`` request argument.

    Searches app.chordindonesia.com for the query, fetches the first
    matching post and returns its content with the HTML markup stripped.

    Returns:
        A dict with ``code``/``status``/``result`` on success, or
        ``code``/``status``/``message`` when the parameter is missing or
        the lookup fails for any reason.
    """
    from urllib.parse import quote_plus

    raw_query = request.args.get('query')
    if not raw_query:
        return {
            "code": "404",
            "status": "error",
            "message": "Masukan parameter query"
        }

    try:
        # quote_plus turns spaces into '+' (as before) and also escapes
        # characters such as '&', '#' and '=' that would otherwise corrupt
        # or inject parameters into the upstream query string.
        query = quote_plus(raw_query)
        search = get('http://app.chordindonesia.com/?json=get_search_results&exclude=date,modified,attachments,comment_count,comment_status,thumbnail,thumbnail_images,author,excerpt,content,categories,tags,comments,custom_fields&search=%s' % query).json()['posts'][0]['id']
        chord = get('http://app.chordindonesia.com/?json=get_post&id=%s' % search).json()
        result = html_text.parse_html(chord['post']['content']).text_content()
    except Exception:
        # Boundary handler: network errors, malformed JSON and empty search
        # results all map to the same "not found" reply. ``Exception``
        # (rather than a bare ``except``) lets SystemExit/KeyboardInterrupt
        # propagate.
        return {
            "code": "404",
            "status": "error",
            "message": "Chord yang anda minta tidak dapat ditemukan"
        }

    return {
        "code": "200",
        "status": "sukses",
        "result": result
    }
Beispiel #3
0
def test_broken_cfemail():
    """Each sample of the broken ``data-cfemail`` fixture yields one result.

    Every sample produced by ``split_text`` is parsed and passed to
    ``audit_etree``; exactly one item must come back per sample.
    """
    markup = """
<span class="__cf_email__" data-cfemail="*****@*****.**">Sales at MAQSoftware dot com</span>
"""
    for snippet in split_text(markup):
        findings = list(audit_etree(parse_html(snippet)))
        assert len(findings) == 1
Beispiel #4
0
def cordIndo(q):
    """Return the plain-text chord sheet for the search term *q*.

    Searches app.chordindonesia.com, fetches the first matching post and
    strips the HTML markup from its content.

    Args:
        q: Search term (song title and/or artist).

    Returns:
        The chord text, or a fixed apology message when the lookup fails
        for any reason (the exception is printed).
    """
    try:
        # Pass the search term through ``params`` (as the second request
        # below already does) so that it is URL-encoded instead of being
        # spliced raw into the query string.
        search_response = get(
            'http://app.chordindonesia.com/?json=get_search_results&exclude=date,modified,attachments,comment_count,comment_status,thumbnail,thumbnail_images,author,excerpt,content,categories,tags,comments,custom_fields',
            params={"search": q},
        )
        id_ = search_response.json()['posts'][0]['id']
        chord = get('http://app.chordindonesia.com/', params={"json": "get_post", "id": id_}).json()
        return html_text.parse_html(chord['post']['content']).text_content()
    except Exception as e:
        # Best-effort lookup: any failure (network, JSON, no results) is
        # reported on stdout and turned into a user-facing message.
        print(e)
        return "[❗] Maaf chord yang anda cari tidak dapat saya temukan"
Beispiel #5
0
def test_cfemail():
    """Each Cloudflare-protected link sample yields one result containing '@'.

    Results from ``audit_etree`` are de-duplicated per sample before the
    check, so repeated hits for the same value count once.
    """
    MAILTO = """
<a href="/cdn-cgi/l/email-protection#e48d8a828ba496819091968ac98b8ac98d8a978d838c90ca878b89"><span class="__cf_email__" data-cfemail="cfa6a1a9a08fbdaabbbabda1e2a0a1e2a6a1bca6a8a7bbe1aca0a2">[email&#160;protected]</span></a>
<a href="/cdn-cgi/l/email-protection#761f1810193602121711041903065815191b"><span class="__cf_email__" data-cfemail="a2cbccc4cde2d6c6c3c5d0cdd7d28cc1cdcf">[email&#160;protected]</span></a>
<a href="/cdn-cgi/l/email-protection#7b131e1717143b081e091e151f120b120f024955181416"><span class="__cf_email__" data-cfemail="a5cdc0c9c9cae5d6c0d7c0cbc1ccd5ccd1dc978bc6cac8">[email&#160;protected]</span></a>
<a class='underline' href="/cdn-cgi/l/email-protection#452c2b232a052824372e2031373c2c2b266b262a28"><span class="__cf_email__" data-cfemail="a3cacdc5cce3cec2d1c8c6d7d1dacacdc08dc0ccce">[email&#160;protected]</span></a>
    """
    for snippet in split_text(MAILTO):
        parsed = parse_html(snippet)
        unique_hits = list(set(audit_etree(parsed)))
        assert len(unique_hits) == 1 and '@' in unique_hits[0], (snippet, unique_hits)
Beispiel #6
0
def test_webpages(page, extracted):
    """Compare text extracted from a stored page with its reference text.

    The comparison is made twice: once through ``extract_text`` on the raw
    HTML and once through ``etree_to_text`` on a parsed and cleaned tree.

    Args:
        page: Name passed to ``_load_file`` to obtain the HTML.
        extracted: Name passed to ``_load_file`` to obtain the expected text.
    """
    source = _load_file(page)
    if not six.PY3:
        # FIXME: &nbsp; produces '\xa0' in Python 2, but ' ' in Python 3
        # this difference is ignored in this test.
        # What is the correct behavior?
        source = source.replace('&nbsp;', ' ')
    reference = _load_file(extracted)

    # Path 1: one-shot extraction straight from the HTML string.
    assert extract_text(source) == reference

    # Path 2: explicit parse -> clean -> text pipeline.
    parsed = parse_html(source)
    cleaned = cleaner.clean_html(parsed)
    assert etree_to_text(cleaned) == reference
Beispiel #7
0
def test_etree_mailto():
    """Each mailto/meta-email sample yields exactly one result containing '@'.

    The fixture mixes percent-encoded and HTML-entity-encoded ``mailto:``
    links plus an ``itemprop="email"`` meta tag; every sample produced by
    ``split_text`` is parsed and passed to ``audit_etree``.
    """
    MAILTO = """
<a href="mailto:[email protected]">
<a href="mailto:%66%6f%6f%40%62%61%72%2e%63%6f%6d">
<a href="mailto:silvan3&#64;tilllate&#46;com">
<a href="mailto:%73%69%6c%76%61%6e%34%40%74%69%6c%6c%6c%61%74%65%2e%63%6f%6d">
<a href="mailto:%75%73%65%72%40%64%6f%6d%61%69%6e%2e%74%6c%64">
<a href="&#109&#97&#105&#108&#116&#111&#58&#117&#115&#101&#114&#64&#100&#111&#109&#97&#105&#110&#46&#116&#108&#100">
<a href='m&#97;ilto&#58;%4Aoh&#110;&#46;Doe&#64;e%78a&#109;&#112;le%2E%63&#111;m'>
<a href='&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#106;&#111;&#104;&#110;&#64;&#121;&#97;&#104;&#111;&#111;&#46;&#99;&#111;&#109;'>
<meta itemprop="email" content="*****@*****.**" />
    """
    for snippet in split_text(MAILTO):
        hits = list(audit_etree(parse_html(snippet)))
        assert len(hits) == 1 and '@' in hits[0], (snippet, hits)
Beispiel #8
0
def brainly(url: str):
    """Scrape the question and the first answer from a Brainly page.

    Args:
        url: Address of the question page to download.

    Returns:
        ``{"soal": <question text>, "jawaban": <answer text>}`` when both
        the question heading and an answer box are present, otherwise
        ``None``.
    """
    # NOTE(review): no timeout is passed, so a stalled server blocks the
    # caller indefinitely -- consider adding one.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # ``find`` returns the first match (or None), replacing the repeated
    # ``find_all(...)[0]`` lookups over the whole document.
    question_box = soup.find("h1", attrs={"data-test": "question-box-text"})
    if question_box is None:
        return None
    soal = question_box.span.text.strip()

    answer_box = soup.find("div", attrs={"data-test": "answer-box-text"})
    if answer_box is None:
        return None

    # Turn <br/> into real newlines before stripping the markup so that
    # line breaks in the answer survive text extraction.
    answer_html = str(answer_box).replace("<br/>", "\n")
    jawaban = html_text.parse_html(answer_html).text_content()
    return {"soal": soal, "jawaban": jawaban}
Beispiel #9
0
def chord():
    """Look up a chord sheet for the ``lagu`` request argument.

    Searches app.chordindonesia.com for the song, fetches the first
    matching post and returns its content with the HTML markup stripped.

    Returns:
        ``{'status': 200, 'result': <text>}`` on success;
        ``{'status': False, 'error': <message>}`` when the lookup fails;
        ``{'status': False, 'pesan': <message>}`` when ``lagu`` is missing.
    """
    from urllib.parse import quote_plus

    raw_lagu = request.args.get('lagu')
    if not raw_lagu:
        # The message names the parameter this handler actually reads.
        return {'status': False, 'pesan': 'Masukkan parameter lagu'}

    try:
        # quote_plus turns spaces into '+' (as before) and also escapes
        # characters such as '&', '#' and '=' that would otherwise corrupt
        # the upstream query string.
        lagu = quote_plus(raw_lagu)
        post_id = get(
            'http://app.chordindonesia.com/?json=get_search_results&exclude=date,modified,attachments,comment_count,comment_status,thumbnail,thumbnail_images,author,excerpt,content,categories,tags,comments,custom_fields&search=%s'
            % lagu).json()['posts'][0]['id']
        post = get('http://app.chordindonesia.com/?json=get_post&id=%s' %
                   post_id).json()
        result = html_text.parse_html(post['post']['content']).text_content()
        return {'status': 200, 'result': result}
    except Exception as e:
        # Best-effort lookup: any failure (network, JSON, no results) is
        # printed and reported to the client as "not found".
        print(e)
        return {
            'status': False,
            'error':
            '[❗] Maaf chord yang anda cari tidak dapat saya temukan!'
        }
Beispiel #10
0
def test_extract_text_from_node(all_options):
    """``extract_text`` accepts a single element taken from a parsed tree.

    Args:
        all_options: Keyword arguments forwarded to ``extract_text``
            (presumably supplied by a pytest fixture -- confirm).
    """
    markup = u'<html><style>.div {}</style><body><p>Hello,   world!</p></body></html>'
    paragraph = parse_html(markup).xpath('//p')[0]
    extracted = extract_text(paragraph, **all_options)
    assert extracted == u'Hello, world!'
Beispiel #11
0
def test_extract_text_from_tree(all_options):
    """``extract_text`` accepts an already parsed tree instead of a string.

    Args:
        all_options: Keyword arguments forwarded to ``extract_text``
            (presumably supplied by a pytest fixture -- confirm).
    """
    markup = u'<html><style>.div {}</style><body><p>Hello,   world!</body></html>'
    document = parse_html(markup)
    extracted = extract_text(document, **all_options)
    assert extracted == u'Hello, world!'
Beispiel #12
0
def test_extract_text_from_tree():
    """``extract_text`` with default options accepts a parsed tree."""
    document = parse_html(
        u'<html><style>.div {}</style><body><p>Hello,   world!</body></html>'
    )
    extracted = extract_text(document)
    assert extracted == u'Hello, world!'
Beispiel #13
0
def test_extract_text_from_fail_html():
    """``extract_text`` returns an empty string for a ``<frameset>`` node."""
    document = parse_html("<html><frameset><frame></frameset></html>")
    frameset = document.xpath('/html/frameset')[0]
    extracted = extract_text(frameset)
    assert extracted == u''