def html2article(html, url, selector=False, merge=False, **options):
    """Extract an article dict from raw HTML.

    Args:
        html: raw HTML text of the page.
        url: URL the HTML was fetched from.
        selector: when truthy, merge the extractor's selector dict into
            the returned article.
        merge: when truthy and the article spans multiple pages, fetch
            the remaining pages and merge their content.
        **options: passed straight through to ``ArticleExtractor``.

    Returns:
        The article dict, or ``None`` when extraction found nothing.
    """
    extractor = ArticleExtractor(html, url, **options)
    article = extractor.article
    if article is None:
        return None
    if selector:
        article.update(extractor.selector)
    if merge and article['pages']:
        # Pull every follow-up page and stitch the bodies together.
        merged = ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            **extractor.selector
        )
        article['content'] = merged.content
    return article
def html2article(html, url, selector=False, merge=False, **options):
    """Extract an article dict from raw HTML.

    ``selector`` folds the extractor's selector dict into the article;
    ``merge`` fetches and merges multi-page articles into one content
    string. Extra keyword ``options`` go to ``ArticleExtractor``.
    Returns the article dict, or ``None`` when nothing was extracted.
    """
    parser = ArticleExtractor(html, url, **options)
    result = parser.article
    has_article = result is not None
    if has_article and selector:
        result.update(parser.selector)
    if has_article and result['pages'] and merge:
        pages_html = fetch_urls(parser.pages, handle=get_or_cache)
        result['content'] = ArticleMerger(
            url, parser.title, pages_html, **parser.selector
        ).content
    return result
async def echo(event):
    """Reply to a chat message with a screenshot of each URL it contains.

    For every URL found in ``event.text``: navigate the shared browser
    ``page`` to it, take a screenshot, and send it back as a reply
    attachment. On any failure the original text is echoed back plus a
    truncated error message, and the full traceback is logged.

    Args:
        event: incoming message event (presumably a Telethon event —
            it exposes ``.text``, ``.reply`` and ``.respond``; TODO confirm).
    """
    try:
        urls = fetch_urls(event.text)
        for url in urls:
            logging.info(url)
            await page.goto(url)
            file_name = f'{time.time()}.png'
            try:
                await page.screenshot(path=file_name, fullPage=False)
                await event.reply(event.text, file=file_name)
            finally:
                # BUG FIX: the screenshot file used to leak when reply()
                # raised before os.remove ran; always clean it up.
                if os.path.exists(file_name):
                    os.remove(file_name)
    except Exception as err:
        # Best-effort error report back to the chat; message body is
        # capped at 2000 chars to stay under the platform limit.
        await event.reply(event.text)
        await event.respond(str(err)[:2000])
        logging.exception(err)
    return
def test_article():
    """Debug endpoint: extract an article from the ``url`` query parameter.

    Reads ``url`` and ``debug`` from the request query string, records
    the URL as a test case, extracts the article (merging multi-page
    articles), and returns a JSON string with the url, article and
    selector.

    Returns:
        A JSON string, or a plain error string when ``url`` does not
        start with ``http://``.
    """
    # IDIOM FIX: `True if x == 'true' else False` is a redundant
    # conditional — the comparison already yields a bool.
    debug = request.args.get('debug') == 'true'
    url = request.args.get('url', '')
    # NOTE(review): this rejects https:// URLs as well — confirm that is
    # intentional before widening the check.
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url, print_path=True)
    extractor = ArticleExtractor(html, url, debug=debug)
    article = extractor.article
    selector = extractor.selector
    if extractor.pages and article:
        # Multi-page article: fetch the remaining pages and merge.
        article['content'] = ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            debug=debug,
            **selector
        ).content
    return json.dumps({'url': url, 'article': article, 'selector': selector})
def test_article():
    """Debug endpoint: extract an article from the ``url`` query parameter.

    Records the URL as a test case, runs ``ArticleExtractor`` (with
    ``debug`` taken from the query string), merges multi-page content,
    and returns url/article/selector as a JSON string. Returns a plain
    error string when ``url`` does not start with ``http://``.
    """
    # IDIOM FIX: comparison already produces a bool; the ternary
    # `True if ... else False` was redundant.
    debug = request.args.get('debug') == 'true'
    url = request.args.get('url', '')
    # NOTE(review): https:// URLs are rejected here too — confirm intended.
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url, print_path=True)
    extractor = ArticleExtractor(html, url, debug=debug)
    article = extractor.article
    selector = extractor.selector
    if extractor.pages and article:
        # Stitch follow-up pages into a single content string.
        article['content'] = ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            debug=debug,
            **selector
        ).content
    return json.dumps({'url': url, 'article': article, 'selector': selector})
def test_segment(url):
    """Debug endpoint: extract an article and segment its title/body.

    Strips any fragment and query string from ``url``, records it as a
    test case, extracts (and merges, when multi-page) the content, and
    returns a JSON string with the url, title and segmented words.
    Returns a plain error string when ``url`` does not start with
    ``http://``.
    """
    # Drop fragment first, then the query string.
    url = url.partition('#')[0].partition('?')[0]
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url)
    extractor = ArticleExtractor(html, url)
    content = extractor.content
    if extractor.pages:
        # Multi-page article: merge every page into one content string.
        follow_ups = fetch_urls(extractor.pages, handle=get_or_cache)
        content = ArticleMerger(
            url, extractor.title, follow_ups, **extractor.selector
        ).content
    words = segmentor.seg(extractor.title, html2text(content))
    return json.dumps({
        'url': url,
        'title': extractor.title,
        'words': words,
    })
def test_segment_all():
    """Debug endpoint: segment every recorded test URL.

    For each stored test URL, extracts the article (merging multi-page
    content), segments title plus text (code blocks excluded from the
    text conversion), and returns the per-URL results as a JSON string.
    """
    results = []
    for test_url in get_test_urls():
        html = get_or_cache(test_url)
        extractor = ArticleExtractor(html, test_url)
        content = extractor.content
        if extractor.pages:
            # Merge follow-up pages into a single content string.
            pages_html = fetch_urls(extractor.pages, handle=get_or_cache)
            content = ArticleMerger(
                test_url, extractor.title, pages_html, **extractor.selector
            ).content
        text = html2text(content, code=False)
        results.append({
            'url': test_url,
            'title': extractor.title,
            'words': segmentor.seg(extractor.title, text),
        })
    return json.dumps(results)
def test_segment(url):
    """Debug endpoint: extract an article and segment its title/body.

    Normalizes ``url`` (strips fragment and query string), records it
    as a test case, extracts the article content (merging multi-page
    articles), and returns a JSON string of url, title and segmented
    words. Returns a plain error string when ``url`` does not start
    with ``http://``.
    """
    # Remove the fragment, then the query string.
    url = url.split('#')[0]
    url = url.split('?')[0]
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    page_html = get_or_cache(url)
    extractor = ArticleExtractor(page_html, url)
    content = extractor.content
    if extractor.pages:
        # Multi-page article: fetch the rest and merge.
        merger = ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            **extractor.selector
        )
        content = merger.content
    payload = {
        'url': url,
        'title': extractor.title,
        'words': segmentor.seg(extractor.title, html2text(content)),
    }
    return json.dumps(payload)
def test_segment_all():
    """Debug endpoint: segment every recorded test URL.

    Iterates the stored test URLs, extracts each article (merging
    multi-page content), segments title plus body text (with code
    excluded from the html→text conversion), and returns the collected
    results as a JSON string.
    """
    entries = []
    for current in get_test_urls():
        markup = get_or_cache(current)
        extractor = ArticleExtractor(markup, current)
        body = extractor.content
        if extractor.pages:
            # Stitch follow-up pages into one content string.
            body = ArticleMerger(
                current,
                extractor.title,
                fetch_urls(extractor.pages, handle=get_or_cache),
                **extractor.selector
            ).content
        entry = {
            'url': current,
            'title': extractor.title,
            'words': segmentor.seg(extractor.title, html2text(body, code=False)),
        }
        entries.append(entry)
    return json.dumps(entries)