Example #1
import os
import json

import looter as lt


def test_save_as_json():
    # sort_by should order the saved records ascending by the given key
    data = [{'rank': 2, 'name': 'python'}, {'rank': 1, 'name': 'js'}, {'rank': 3, 'name': 'java'}]
    lt.save_as_json(data, sort_by='rank')
    with open('data.json', 'r') as f:
        ordered_data = json.loads(f.read())
    assert ordered_data[0]['rank'] == 1
    os.remove('data.json')
    # no_duplicate should drop repeated records
    dup_data = [{'a': 1}, {'a': 1}, {'b': 2}]
    lt.save_as_json(dup_data, no_duplicate=True)
    with open('data.json', 'r') as f:
        unique_data = json.loads(f.read())
    assert len(dup_data) > len(unique_data)
    os.remove('data.json')
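
The test above also pins down save_as_json's defaults: with no name argument it writes data.json, sort_by orders the records ascending by the given key, and no_duplicate drops repeated dicts. A minimal sketch of the same round trip outside the test harness, assuming only those behaviors:

import json
import looter as lt

rows = [{'rank': 2, 'name': 'python'}, {'rank': 1, 'name': 'js'}]
lt.save_as_json(rows, sort_by='rank')  # writes data.json, ordered by rank
with open('data.json', 'r') as f:
    print(json.loads(f.read())[0]['name'])  # 'js', since rank 1 sorts first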
Example #2
import looter as lt
from concurrent import futures

domain = 'http://www.xicidaili.com'
proxies = []


def crawl(url):
    tree = lt.fetch(url)
    items = tree.cssselect('table tr')[1:]  # skip the table header row
    for item in items:
        scheme = item.cssselect('td')[-5].text.lower()  # the HTTP/HTTPS type column
        ip = item.cssselect('td')[1].text
        port = item.cssselect('td')[2].text
        proxy = f'{scheme}://{ip}:{port}'
        print(proxy)
        proxies.append(proxy)


if __name__ == '__main__':
    tasklist = [f'{domain}/nn/{i}' for i in range(1, 100)]
    with futures.ThreadPoolExecutor(20) as executor:
        executor.map(crawl, tasklist)
    lt.save_as_json(proxies, name='proxies')
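
Appending to the module-level proxies list from worker threads is safe in CPython because list.append is atomic under the GIL, but the same fan-out can avoid shared state entirely by returning each page's proxies from crawl and flattening what executor.map yields. A sketch of that variant, reusing the imports and domain above:

def crawl(url):
    tree = lt.fetch(url)
    rows = tree.cssselect('table tr')[1:]  # skip the header row
    return [
        f"{row.cssselect('td')[-5].text.lower()}://{row.cssselect('td')[1].text}:{row.cssselect('td')[2].text}"
        for row in rows
    ]


if __name__ == '__main__':
    tasklist = [f'{domain}/nn/{i}' for i in range(1, 100)]
    with futures.ThreadPoolExecutor(20) as executor:
        pages = executor.map(crawl, tasklist)
        proxies = [proxy for page in pages for proxy in page]
    lt.save_as_json(proxies, name='proxies')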
Example #3
import asyncio
import looter as lt
from pprint import pprint

domain = 'https://www.qiushibaike.com'
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.cssselect('.article')
    for item in items:
        data = dict()
        data['author'] = item.cssselect('h2')[0].text.strip()
        data['content'] = item.cssselect('.content span')[0].text.strip()
        data['vote'] = int(item.cssselect('.stats-vote .number')[0].text)
        data['comments'] = int(item.cssselect('.stats-comments .number')[0].text)
        data['url'] = domain + item.cssselect('a.contentHerf')[0].get('href')
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/hot/page/{i}/' for i in range(1, 14)]
    loop = asyncio.get_event_loop()
    coros = [crawl(task) for task in tasklist]
    # asyncio.wait() no longer accepts bare coroutines, so gather them instead
    loop.run_until_complete(asyncio.gather(*coros))
    lt.save_as_json(total, name='qsbk', sort_by='vote')
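
On Python 3.7+, the manual event-loop handling above can be replaced with asyncio.run; a minimal sketch of the same fan-out, assuming crawl, total, domain, and the imports as defined above:

async def main():
    tasklist = [f'{domain}/hot/page/{i}/' for i in range(1, 14)]
    await asyncio.gather(*(crawl(task) for task in tasklist))


if __name__ == '__main__':
    asyncio.run(main())
    lt.save_as_json(total, name='qsbk', sort_by='vote')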
Example #4
import json

import looter as lt
from concurrent import futures
from pprint import pprint

total = []


def get_tasklist(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    tasklist = [job['link'] for job in data]
    return tasklist


def crawl(url):
    tree = lt.fetch(url)
    data = dict()
    cols = ['salary', 'place', 'date', 'nature', 'experience', 'degree', 'amount', 'category']
    for i, col in enumerate(cols):
        data[col] = tree.cssselect('ul.terminal-ul li')[i].cssselect('strong')[0].text
    data['salary'] = "".join(data['salary'].split())  # strip embedded whitespace
    del data['place']  # place is not kept in the detail record
    # date and category sit in nested elements, so re-read them precisely
    data['date'] = tree.cssselect('ul.terminal-ul li')[2].cssselect('strong #span4freshdate')[0].text
    data['category'] = tree.cssselect('ul.terminal-ul li')[7].cssselect('strong a')[0].text
    detail = tree.cssselect('.tab-inner-cont p')
    data['detail'] = ''.join([p.text for p in detail if p.text]).strip()
    pprint(data)
    total.append(data)


if __name__ == '__main__':
    tasklist = get_tasklist('zhilian_jobs.json')
    tasklist.reverse()
    with futures.ThreadPoolExecutor(40) as executor:
        executor.map(crawl, tasklist)
    lt.save_as_json(total, name='zhilian_details')
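
The zhilian_jobs.json consumed by get_tasklist is presumably the file written by the crawler in Example #8, which saves with name='zhilian_jobs'. One caveat with the pattern above: because the iterator returned by executor.map is never consumed, any exception raised inside crawl (say, an IndexError on a page missing one of the expected fields) is stored on its future and silently discarded. Consuming the iterator surfaces such failures; a minimal sketch:

with futures.ThreadPoolExecutor(40) as executor:
    for _ in executor.map(crawl, tasklist):
        pass  # retrieving each result re-raises any exception from crawl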
Example #5
import json
import looter as lt
from concurrent import futures

domain = 'https://github.com/tuna/blogroll/blob/master/README.md'
total_rank = []


def get_tasklist(url):
    tree = lt.fetch(url)
    # the blog links sit in the third column of the README's table
    links = tree.cssselect('table tbody tr td:nth-child(3) a')
    return [link.get('href') for link in links]


def crawl(url):
    rank = lt.alexa_rank(url)
    if rank:
        data = {}
        data['site'] = rank[0]
        data['reach'] = int(rank[1])
        data['popularity'] = int(rank[2])
        total_rank.append(data)


if __name__ == '__main__':
    tasklist = get_tasklist(domain)
    with futures.ThreadPoolExecutor(20) as executor:
        executor.map(crawl, tasklist)
    lt.save_as_json(total_rank, name='BlogRank', sort_by='popularity')
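
crawl above assumes lt.alexa_rank returns a (site, reach, popularity) triple, or something falsy when the lookup fails; the if rank: guard is what keeps failed lookups out of total_rank. A minimal sketch of the expected shape (the URL is only illustrative):

rank = lt.alexa_rank('https://github.com')  # expected: (site, reach, popularity) or None
if rank:
    site, reach, popularity = rank
    print(site, int(reach), int(popularity))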
Example #6
from pprint import pprint
import looter as lt

domain = 'https://salttiger.com'


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.car-monthlisting li')
    total = []
    for item in items:
        data = {}
        data['name'] = item.css('a::text').extract_first()
        data['url'] = item.css('a::attr(href)').extract_first()
        data['comments'] = int(item.css('span::text').re_first(r'(\d+)'))
        pprint(data)
        total.append(data)
    return total


if __name__ == '__main__':
    task = f'{domain}/archives/'
    result = crawl(task)
    lt.save_as_json(result,
                    name='salttiger',  # extensionless: looter appends .json (cf. Example #4 reading zhilian_jobs.json)
                    sort_by='comments',
                    order='desc')
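
Unlike the lxml-style cssselect calls in the other examples, this one uses parsel-style selectors (css, extract_first, re_first), so it presumably targets a looter version whose fetch returns a parsel Selector. A rough equivalence sketch between the two styles, written against an lxml element:

import re
from lxml import html


def parse_item(item: html.HtmlElement) -> dict:
    # parsel style would be: item.css('a::text').extract_first()
    name = item.cssselect('a')[0].text
    # parsel style would be: item.css('span::text').re_first(r'(\d+)')
    span_text = item.cssselect('span')[0].text or ''
    match = re.search(r'(\d+)', span_text)
    return {'name': name, 'comments': int(match.group(1)) if match else 0}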
Example #7
import asyncio
import looter as lt
from pprint import pprint

domain = 'https://www.javbus.pw'
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.cssselect('#waterfall .item')
    for item in items:
        data = dict()
        data['name'] = item.cssselect('img')[0].get('title')
        data['cover'] = item.cssselect('img')[0].get('src')
        data['link'] = item.cssselect('.movie-box')[0].get('href')
        data['bango'] = item.cssselect('date')[0].text
        data['date'] = item.cssselect('date')[1].text
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/page/{i}' for i in range(1, 90)]
    loop = asyncio.get_event_loop()
    coros = [crawl(task) for task in tasklist]
    # asyncio.wait() no longer accepts bare coroutines, so gather them instead
    loop.run_until_complete(asyncio.gather(*coros))
    lt.save_as_json(total, name='jav')
Example #8

import asyncio

import looter as lt
from pprint import pprint

# NOTE: the original snippet does not define domain or total; the Zhilian
# search host below is an assumption inferred from the URL pattern and the
# zhilian_jobs output name.
domain = 'http://sou.zhaopin.com'
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.cssselect('.newlist_list_content table.newlist')[1:]
    for item in items:
        data = dict()
        data['name'] = 'python'
        data['link'] = item.cssselect('a')[0].get('href')
        data['company'] = item.cssselect('a')[1].text
        salary = item.cssselect('td.zwyx')[0].text
        if salary in ['面议', '1000元以下']:  # 'negotiable' / 'under 1,000 yuan'
            data['salary_min'] = data['salary_max'] = 0
        else:
            # salary ranges look like '8000-12000'
            data['salary_min'] = int(salary.split('-')[0])
            data['salary_max'] = int(salary.split('-')[1])
        data['place'] = item.cssselect('td.gzdd')[0].text
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'{domain}/jobs/searchresult.ashx?jl=上海%2B苏州&kw=python&sm=0&p={i}'
        for i in range(1, 73)
    ]
    loop = asyncio.get_event_loop()
    coros = [crawl(task) for task in tasklist]
    # asyncio.wait() no longer accepts bare coroutines, so gather them instead
    loop.run_until_complete(asyncio.gather(*coros))
    lt.save_as_json(total, name='zhilian_jobs', sort_by='salary_max')
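
The salary parsing above assumes every non-special value looks like '8000-12000'. A slightly more defensive sketch of the same logic (the helper name is made up), treating anything that does not match as zero:

import re


def parse_salary(salary: str) -> tuple:
    # '8000-12000'-style ranges; '面议' ('negotiable') and the like fall through to (0, 0)
    match = re.fullmatch(r'(\d+)-(\d+)', salary.strip())
    return (int(match.group(1)), int(match.group(2))) if match else (0, 0)


print(parse_salary('8000-12000'))  # (8000, 12000)
print(parse_salary('面议'))  # (0, 0)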