Ejemplo n.º 1
0
def test_process_item():
    """Fetch one item page over HTTP and hand it to ``process_item``.

    Smoke test: registers the site root with the router, downloads the page
    for a single item code and passes the page text, its URL and the code to
    ``process_item``. Nothing is asserted, so the test only fails when one of
    the calls raises. It performs a real HTTP request.
    """
    root_path = 'https://www.busdmm.work'
    url = 'https://www.busdmm.work/SSNI-452'
    fanhao = 'SSNI-452'  # item code; same as the last path segment of ``url``

    router = get_router()
    router.add_root_path(root_path)

    # Use the session as a context manager so its pooled connections are
    # released even if the request or the processing step raises.
    # NOTE(review): assumes HTMLSession is requests_html's requests.Session
    # subclass, which supports ``with`` -- confirm against the file's imports.
    with HTMLSession() as session:
        r = session.get(url)
        process_item(r.text, url, fanhao)
Ejemplo n.º 2
0
def test_process_item():
    """Fetch one item page over HTTP and hand it to ``process_item``.

    Smoke test: registers the site root with the router, downloads the page
    for a single item code and passes the page text, its URL and the code to
    ``process_item``. Nothing is asserted, so the test only fails when one of
    the calls raises. It performs a real HTTP request.
    """
    root_path = 'https://www.cdnbus.bid'
    url = 'https://www.cdnbus.bid/CESD-797'
    fanhao = 'CESD-797'  # item code; same as the last path segment of ``url``

    router = get_router()
    router.add_root_path(root_path)

    # Use the session as a context manager so its pooled connections are
    # released even if the request or the processing step raises.
    # NOTE(review): assumes HTMLSession is requests_html's requests.Session
    # subclass, which supports ``with`` -- confirm against the file's imports.
    with HTMLSession() as session:
        r = session.get(url)
        process_item(r.text, url, fanhao)
Ejemplo n.º 3
0
def html():
    """Download one sample item page and return its HTML text.

    Registers the site root (the URL without its last path segment) with the
    router, then fetches the page through an ``aiohttp.ClientSession`` driven
    by ``asyncio.run``.

    Returns:
        The response body decoded as text, with undecodable bytes ignored.
    """
    # Alternative sample page: 'https://www.cdnbus.bid/SHKD-875'
    url = 'https://www.busdmm.work/DVAJ-419'
    router = get_router()
    # Split only once, from the right: a bare rsplit('/') splits on every
    # slash, so [0] would be just 'https:' instead of the site root.
    router.add_root_path(url.rsplit('/', 1)[0])

    async def fetch(session, url):
        async with session.get(url) as response:
            # errors='ignore' drops bytes that cannot be decoded.
            return await response.text(errors='ignore')

    async def main():
        async with aiohttp.ClientSession() as session:
            return await fetch(session, url)

    return asyncio.run(main())
Ejemplo n.º 4
0
'''
Example: scraping the Douban Top 250 movie list
'''
from collections import namedtuple
from aspider.routeing import get_router
from aspider import aspider
from requests_html import HTML

# Record for one scraped movie: its rank, score and title.
Movie = namedtuple('Movie', ['rank', 'score', 'title'])

# Router on which this module's page handlers are registered.
router = get_router()

# URL of the Douban Top 250 listing.
root_url = 'https://movie.douban.com/top250'

# Module-level accumulator; process_page appends one Movie per list entry.
movies_250 = []


# Raw string: '\?' is not a valid escape in a normal string literal and
# triggers a DeprecationWarning/SyntaxWarning; the pattern value is unchanged.
@router.route(r'/top250\?start.+')
def process_page(text):
    """Parse one Top 250 listing page and record every movie found on it.

    Registered on the router for the pattern ``/top250\\?start.+``.

    Args:
        text: HTML source of the listing page.

    Returns:
        None. One ``Movie(rank, score, title)`` per list entry is appended
        to the module-level ``movies_250`` list.

    Raises:
        ValueError: if a rank or score text is not numeric.
        AttributeError: presumably, if an entry lacks one of the expected
            elements (``find(..., first=True)`` would then yield ``None``)
            -- TODO confirm against requests_html.
    """
    html = HTML(html=text)
    item_css = '#content  ol.grid_view > li'
    rank_css = 'em'
    title_css = '.info  span.title'
    score_css = '.info  .rating_num'
    # Each matched <li> is one movie entry; read its fields relative to it.
    for item in html.find(item_css):
        rank = int(item.find(rank_css, first=True).text)
        title = item.find(title_css, first=True).text
        score = float(item.find(score_css, first=True).text)
        movies_250.append(Movie(rank, score, title))