from lightspider.baike import search
from lightspider import Spider

# Seed queries for the Baike search spider.
tasks = ['曹操', '曹操字孟德']

spider = Spider(
    base_url=search.base_url,
    save_format=search.save_format,
    save_path=r'D:\Data\NLP\corpus\baike_search',
)

if __name__ == '__main__':
    spider.run(tasks, search.parser)
:return: type:tuple:(解析得到的数据对象, 解析页面得到的新的tasks(要么为List,要么为None) """ html = etree.HTML(html) info = html.xpath('//div[@class="col-md-8"]')[0] words = [re.sub(r'\(\d+\)', '', item.xpath('string(.)')) for item in info.xpath('./b')[:-1]] mean = info.xpath('./a/text()')[0] return { 'mean': mean, 'words': words }, None # 编写生成tasks脚本 # tasks = [] # base_url = 'https://www.cilin.org/jyc/b_{}.html' # for i in range(1, 9996): # tasks.append(i) # # spider = Spider(base_url=base_url, style='json', save_path=r'D:\Data\NLP\corpus\jyc') tasks = [] base_url = 'https://www.cilin.org/jyc/b_{}.html' for i in range(1, 30): tasks.append(i) spider = Spider(base_url=base_url, style='json', save_path=r'D:\Data\NLP\corpus\test') if __name__ == '__main__': spider.run(tasks, handler)
'content': content, 'created_time': created_time, 'categories': categories, 'latest_time': latest_time, 'tags': tags, 'url': response.url }, None if __name__ == '__main__': base_url = r'http://www.lightsmile.cn{}' archives_url = r'http://www.lightsmile.cn/archives' archives_res = get_response(archives_url) archives_html = etree.HTML(archives_res.text) section = archives_html.xpath('//section[@class="archive"]')[0] archives = section.xpath('.//a') tasks = [] for archive in archives: tasks.append(archive.xpath('string(./@href)')) save_format = 'json' spider = Spider(base_url=base_url, save_format=save_format, save_path=r'D:\Data\NLP\corpus\my_blogs') spider.run(tasks, parser) # test_url = base_url.format(tasks[39]) # test_res = get_response(test_url) # test_result = parser(test_res) # print(test_result) # test_url = base_url.format(tasks[39]) # test_res = get_response(test_url) # test_result = parser(test_res)
from lightspider.baike import info
from lightspider import Spider

# Crawl Baidu Baike pages by numeric view id in [4000000, 5000000).
base_url = 'https://baike.baidu.com/view/{}.htm'
# list(range(...)) instead of [x for x in range(...)]: identical result,
# without the per-element comprehension overhead for a million ids.
tasks = list(range(4000000, 5000000))
save_format = 'json'
# E-mail notification sent by the spider when the task finishes.
notification = {"to": "*****@*****.**", "task_name": "baike_info"}

spider = Spider(
    base_url=base_url,
    save_format=save_format,
    save_path=r'D:\Data\NLP\corpus\baike_info_4000000_to_5000000',
)

if __name__ == '__main__':
    spider.run(tasks, info.parser, notification=notification)
from lightspider.baike import href
from lightspider import Spider

# Seed queries for the Baike href spider.
tasks = ['曹操', '曹操字孟德']
# E-mail notification sent by the spider when the task finishes.
notification = {"to": "*****@*****.**", "task_name": "baike_href"}

spider = Spider(
    base_url=href.base_url,
    save_format=href.save_format,
    save_path=r'D:\Data\NLP\corpus\baike_href',
)

if __name__ == '__main__':
    # Pass notification as a keyword for consistency with the other
    # baike scripts in this project (which use notification=notification).
    spider.run(tasks, href.parser, notification=notification)
# @Software: PyCharm
from lightspider import Spider, light
from lxml import etree


@light
def parser(response):
    """Parse a hero-category page.

    Returns ((category-title, [(name, href), ...]) as a dict, None) —
    the None means the page yields no follow-up tasks.
    """
    tree = etree.HTML(response.text)
    category = tree.xpath(
        'string(.//div[@class="list-group-item active-cat"])').strip()
    peoples = []
    for anchor in tree.xpath('.//div[@id="ipt-kb-affix-active-post"]/a'):
        name = anchor.xpath('string(.)').strip()
        href = anchor.xpath('string(./@href)')
        peoples.append((name, href))
    return {'category': category, 'peoples': peoples}, None


base_url = 'http://www.w3guo.com/wiki/hero/{}'
tasks = ['other', 'wu', 'wei', 'shu', 'jin']
save_format = 'json'
save_path = r'D:\Data\KG\three_kingdoms_people'

spider = Spider(base_url=base_url, save_format=save_format,
                save_path=save_path, interval=2)

if __name__ == '__main__':
    spider.run(tasks, parser)
from lightspider.baike import href
from lightspider import Spider

# Crawl Baidu Baike pages by numeric view id in [2000000, 3000000).
base_url = 'https://baike.baidu.com/view/{}.htm'
# list(range(...)) instead of [x for x in range(...)]: identical result,
# without the per-element comprehension overhead for a million ids.
tasks = list(range(2000000, 3000000))
save_format = 'json'
# E-mail notification sent by the spider when the task finishes.
notification = {"to": "*****@*****.**", "task_name": "baike_href"}

spider = Spider(
    base_url=base_url,
    save_format=save_format,
    save_path=r'D:\Data\NLP\corpus\baike_href_2000000_to_3000000',
)

if __name__ == '__main__':
    spider.run(tasks, href.parser, notification=notification)
words = [ re.sub(r'\(\d+\)', '', item.xpath('string(.)')) for item in info.xpath('./b')[:-1] ] mean = info.xpath('./a/text()')[0] return {'mean': mean, 'words': words}, None # 编写生成tasks脚本 # tasks = [] # base_url = 'https://www.cilin.org/jyc/b_{}.html' # for i in range(1, 9996): # tasks.append(i) # # spider = Spider(base_url=base_url, style='json', save_path=r'D:\Data\NLP\corpus\jyc') tasks = [] for i in range(1, 30): tasks.append(i) base_url = 'https://www.cilin.org/jyc/b_{}.html' save_format = 'json' spider = Spider(base_url=base_url, save_format=save_format, save_path=r'D:\Data\NLP\corpus\test', proxy=DEFAULT_PROXY) if __name__ == '__main__': spider.run(tasks, parser)