def test_process_item():
    """Fetch a live item page and run ``process_item`` on its HTML.

    Registers the site root on the router returned by ``get_router()``,
    downloads the page for the item code (``fanhao``) and hands the page
    text, the URL and the code to ``process_item``.

    Requires network access to the target site.
    """
    root_path = 'https://www.busdmm.work'
    url = 'https://www.busdmm.work/SSNI-452'
    fanhao = 'SSNI-452'

    router = get_router()
    router.add_root_path(root_path)

    # Use the session as a context manager so its connection pool is
    # released as soon as the page has been downloaded.
    with HTMLSession() as session:
        response = session.get(url)
        page_text = response.text

    process_item(page_text, url, fanhao)
def test_process_item():
    """Fetch a live item page and run ``process_item`` on its HTML.

    Registers the site root on the router returned by ``get_router()``,
    downloads the page for the item code (``fanhao``) and hands the page
    text, the URL and the code to ``process_item``.

    Requires network access to the target site.
    """
    root_path = 'https://www.cdnbus.bid'
    url = 'https://www.cdnbus.bid/CESD-797'
    fanhao = 'CESD-797'

    router = get_router()
    router.add_root_path(root_path)

    # Use the session as a context manager so its connection pool is
    # released as soon as the page has been downloaded.
    with HTMLSession() as session:
        response = session.get(url)
        page_text = response.text

    process_item(page_text, url, fanhao)
def html():
    """Download a sample item page and return its HTML text.

    Registers the site root (the URL with its last path segment removed) on
    the router returned by ``get_router()``, then fetches the page with
    ``aiohttp`` and returns the decoded body.

    Requires network access to the target site.

    Returns:
        The response body as text; undecodable bytes are dropped
        (``errors='ignore'``).
    """
    # Alternative sample page: 'https://www.cdnbus.bid/SHKD-875'
    url = 'https://www.busdmm.work/DVAJ-419'

    router = get_router()
    # Split only on the LAST slash: a plain rsplit('/') splits on every slash
    # and its first element is just 'https:', not the site root.
    router.add_root_path(url.rsplit('/', 1)[0])

    async def fetch(session, url):
        async with session.get(url) as response:
            return await response.text(errors='ignore')

    async def main():
        async with aiohttp.ClientSession() as session:
            return await fetch(session, url)

    page_text = asyncio.run(main())
    return page_text
"""
Example: scraping the Douban Top 250 movie list.
"""

from collections import namedtuple

from aspider.routeing import get_router
from aspider import aspider
from requests_html import HTML

# One scraped entry of the ranking.
Movie = namedtuple('Movie', ['rank', 'score', 'title'])

router = get_router()
root_url = 'https://movie.douban.com/top250'

# Accumulates every Movie parsed by process_page.
movies_250 = []


# Raw string: '\?' must reach the regex engine as an escaped question mark;
# in a normal string literal it is an invalid escape sequence.
@router.route(r'/top250\?start.+')
def process_page(text):
    """Parse one ranking page and append its movies to ``movies_250``.

    Args:
        text: HTML source of a ``/top250?start=...`` page.

    Each list item found under ``#content ol.grid_view`` contributes one
    ``Movie`` with its rank (int), score (float) and title (str).
    """
    html = HTML(html=text)

    item_css = '#content ol.grid_view > li'
    rank_css = 'em'
    title_css = '.info span.title'
    score_css = '.info .rating_num'

    for item in html.find(item_css):
        rank = int(item.find(rank_css, first=True).text)
        title = item.find(title_css, first=True).text
        score = float(item.find(score_css, first=True).text)
        movies_250.append(Movie(rank, score, title))