import json
import os

import looter as lt


def test_save_as_json():
    # sort_by should order the records by the given key before writing
    data = [{'rank': 2, 'name': 'python'}, {'rank': 1, 'name': 'js'}, {'rank': 3, 'name': 'java'}]
    lt.save_as_json(data, sort_by='rank')
    with open('data.json', 'r') as f:
        ordered_data = json.loads(f.read())
    assert ordered_data[0]['rank'] == 1
    os.remove('data.json')
    # no_duplicate should drop repeated records
    dup_data = [{'a': 1}, {'a': 1}, {'b': 2}]
    lt.save_as_json(dup_data, no_duplicate=True)
    with open('data.json', 'r') as f:
        unique_data = json.loads(f.read())
    assert len(dup_data) > len(unique_data)
    os.remove('data.json')
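
# The test above pins down the contract it relies on: save_as_json writes
# <name>.json, can sort records by a key, and can drop duplicate dicts.
# A minimal sketch of that behavior (an illustration of the contract, not
# looter's actual implementation):
import json


def save_as_json_sketch(total, name='data', sort_by=None, no_duplicate=False, order='asc'):
    if sort_by:
        total = sorted(total, key=lambda d: d[sort_by], reverse=(order == 'desc'))
    if no_duplicate:
        seen, unique = set(), []
        for item in total:
            key = json.dumps(item, sort_keys=True)  # dicts aren't hashable, so serialize
            if key not in seen:
                seen.add(key)
                unique.append(item)
        total = unique
    with open(f'{name}.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(total, ensure_ascii=False))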
from concurrent import futures

import looter as lt

domain = 'http://www.xicidaili.com'
proxies = []


def crawl(url):
    tree = lt.fetch(url)
    items = tree.cssselect('table tr')[1:]  # skip the header row
    for item in items:
        schema = item.cssselect('td')[-5].text.lower()
        ip = item.cssselect('td')[1].text
        port = item.cssselect('td')[2].text
        proxy = f'{schema}://{ip}:{port}'
        print(proxy)
        proxies.append(proxy)


if __name__ == '__main__':
    tasklist = [f'{domain}/nn/{i}' for i in range(1, 100)]
    with futures.ThreadPoolExecutor(20) as executor:
        executor.map(crawl, tasklist)
    lt.save_as_json(proxies, name='proxies')
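
# Proxies scraped from free lists are often dead on arrival, so a quick
# liveness check before saving can help. A sketch using requests (the test
# URL and timeout here are arbitrary choices, not part of the original):
import requests


def is_alive(proxy, timeout=5):
    try:
        r = requests.get('http://httpbin.org/ip',
                         proxies={'http': proxy, 'https': proxy},
                         timeout=timeout)
        return r.ok
    except requests.RequestException:
        return False

# e.g. proxies = [p for p in proxies if is_alive(p)]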
import asyncio
from pprint import pprint

import looter as lt

domain = 'https://www.qiushibaike.com'
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.cssselect('.article')
    for item in items:
        data = dict()
        data['author'] = item.cssselect('h2')[0].text.strip()
        data['content'] = item.cssselect('.content span')[0].text.strip()
        data['vote'] = int(item.cssselect('.stats-vote .number')[0].text)
        data['comments'] = int(item.cssselect('.stats-comments .number')[0].text)
        data['url'] = domain + item.cssselect('a.contentHerf')[0].get('href')
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/hot/page/{i}/' for i in range(1, 14)]
    loop = asyncio.get_event_loop()
    result = [crawl(task) for task in tasklist]
    loop.run_until_complete(asyncio.wait(result))
    lt.save_as_json(total, name='qsbk', sort_by='vote')
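
# The get_event_loop / run_until_complete pairing above is the pre-3.7
# asyncio idiom; on Python 3.7+ the same fan-out reads more simply with
# asyncio.run and gather (a sketch reusing the crawl coroutine above):
async def main(tasklist):
    await asyncio.gather(*[crawl(task) for task in tasklist])

# asyncio.run(main([f'{domain}/hot/page/{i}/' for i in range(1, 14)]))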
import json
from concurrent import futures
from pprint import pprint

import looter as lt

total = []


def get_tasklist(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    tasklist = [job['link'] for job in data]
    return tasklist


def crawl(url):
    tree = lt.fetch(url)
    data = dict()
    cols = ['salary', 'place', 'date', 'nature', 'experience', 'degree', 'amount', 'category']
    for i, col in enumerate(cols):
        data[col] = tree.cssselect('ul.terminal-ul li')[i].cssselect('strong')[0].text
    data['salary'] = ''.join(data['salary'].split())  # strip internal whitespace
    del data['place']
    data['date'] = tree.cssselect('ul.terminal-ul li')[2].cssselect('strong #span4freshdate')[0].text
    data['category'] = tree.cssselect('ul.terminal-ul li')[7].cssselect('strong a')[0].text
    detail = tree.cssselect('.tab-inner-cont p')
    data['detail'] = ''.join([p.text for p in detail if p.text]).strip()
    pprint(data)
    total.append(data)


if __name__ == '__main__':
    tasklist = get_tasklist('zhilian_jobs.json')
    tasklist.reverse()
    with futures.ThreadPoolExecutor(40) as executor:
        executor.map(crawl, tasklist)
    lt.save_as_json(total, name='zhilian_details')
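
# One caveat with executor.map: exceptions raised inside crawl are only
# re-raised when the results are consumed, so a failing page can vanish
# silently. submit plus as_completed surfaces every failure (a sketch):
with futures.ThreadPoolExecutor(40) as executor:
    jobs = [executor.submit(crawl, url) for url in tasklist]
    for job in futures.as_completed(jobs):
        if job.exception():
            print('crawl failed:', job.exception())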
from concurrent import futures

import looter as lt

domain = 'https://github.com/tuna/blogroll/blob/master/README.md'
total_rank = []


def get_tasklist(url):
    tree = lt.fetch(url)
    links = tree.cssselect('table tbody tr td:nth-child(3) a')
    return [link.get('href') for link in links]


def crawl(url):
    rank = lt.alexa_rank(url)
    if rank:
        data = {}
        data['site'] = rank[0]
        data['reach'] = int(rank[1])
        data['popularity'] = int(rank[2])
        total_rank.append(data)


if __name__ == '__main__':
    tasklist = get_tasklist(domain)
    with futures.ThreadPoolExecutor(20) as executor:
        executor.map(crawl, tasklist)
    lt.save_as_json(total_rank, name='BlogRank', sort_by='popularity')
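
# lt.alexa_rank evidently yields a (site, reach, popularity) triple, or a
# falsy value when Alexa has no data — hence the `if rank` guard. Tuple
# unpacking makes the field mapping explicit (same behavior as crawl above):
def crawl_unpacked(url):
    rank = lt.alexa_rank(url)
    if rank:
        site, reach, popularity = rank
        total_rank.append({'site': site, 'reach': int(reach), 'popularity': int(popularity)})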
from pprint import pprint

import looter as lt

domain = 'https://salttiger.com'


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.car-monthlisting li')
    total = []
    for item in items:
        data = {}
        data['name'] = item.css('a::text').extract_first()
        data['url'] = item.css('a::attr(href)').extract_first()
        data['comments'] = int(item.css('span::text').re_first(r'(\d+)'))
        pprint(data)
        total.append(data)
    return total


if __name__ == '__main__':
    task = f'{domain}/archives/'
    result = crawl(task)
    # save_as_json appends the .json extension itself (see the test above),
    # so pass the bare name, as in the other examples
    lt.save_as_json(result, name='salttiger', sort_by='comments', order='desc')
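
# Unlike the lxml-style cssselect calls in the other examples, this script
# uses parsel-style selectors ('::text', '::attr(...)', re_first) — the API
# Scrapy uses. A standalone sketch of the same extraction with parsel,
# assuming lt.fetch returns a parsel.Selector here (the HTML is made up):
from parsel import Selector

html = '<li><a href="/b">Book</a> <span>(12)</span></li>'
sel = Selector(text=html)
name = sel.css('a::text').extract_first()                 # 'Book'
url = sel.css('a::attr(href)').extract_first()            # '/b'
comments = int(sel.css('span::text').re_first(r'(\d+)'))  # 12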
import asyncio
from pprint import pprint

import looter as lt

domain = 'https://www.javbus.pw'
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.cssselect('#waterfall .item')
    for item in items:
        data = dict()
        data['name'] = item.cssselect('img')[0].get('title')
        data['cover'] = item.cssselect('img')[0].get('src')
        data['link'] = item.cssselect('.movie-box')[0].get('href')
        data['bango'] = item.cssselect('date')[0].text
        data['date'] = item.cssselect('date')[1].text
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/page/{i}' for i in range(1, 90)]
    loop = asyncio.get_event_loop()
    result = [crawl(task) for task in tasklist]
    loop.run_until_complete(asyncio.wait(result))
    lt.save_as_json(total, name='jav')
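
# asyncio.wait above fires all 89 page fetches at once; a semaphore can
# bound the concurrency (a sketch — the limit of 10 is an arbitrary pick):
sem = asyncio.Semaphore(10)


async def crawl_bounded(url):
    async with sem:
        await crawl(url)

# then schedule crawl_bounded(task) instead of crawl(task) in tasklist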
import asyncio
from pprint import pprint

import looter as lt

# Host for the old Zhilian search pages — assumed here, since the original
# snippet omitted its module-level definitions
domain = 'http://sou.zhaopin.com'
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.cssselect('.newlist_list_content table.newlist')[1:]  # skip the header table
    for item in items:
        data = dict()
        data['name'] = 'python'
        data['link'] = item.cssselect('a')[0].get('href')
        data['company'] = item.cssselect('a')[1].text
        salary = item.cssselect('td.zwyx')[0].text
        if salary in ['面议', '1000元以下']:  # 'negotiable' or 'below 1000 yuan'
            data['salary_min'] = data['salary_max'] = 0
        else:
            data['salary_min'] = int(salary.split('-')[0])
            data['salary_max'] = int(salary.split('-')[1])
        data['place'] = item.cssselect('td.gzdd')[0].text
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'{domain}/jobs/searchresult.ashx?jl=上海%2B苏州&kw=python&sm=0&p={i}'
        for i in range(1, 73)
    ]
    loop = asyncio.get_event_loop()
    result = [crawl(task) for task in tasklist]
    loop.run_until_complete(asyncio.wait(result))
    lt.save_as_json(total, name='zhilian_jobs', sort_by='salary_max')
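
# The salary field is either a '8000-12000' style range or a special string
# ('面议' = negotiable, '1000元以下' = below 1000 yuan). The bare split above
# raises on any other format; a regex makes the parsing more tolerant of
# stray text (a sketch):
import re


def parse_salary(salary):
    match = re.match(r'(\d+)-(\d+)', salary or '')
    if match:
        return int(match.group(1)), int(match.group(2))
    return 0, 0  # '面议', '1000元以下', or anything unparseable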