Beispiel #1
0
from lightspider.baike import search
from lightspider import Spider

# Encyclopedia search queries to crawl.
tasks = ['曹操', '曹操字孟德']

# Reuse the base URL and output format bundled with the baike.search module.
spider = Spider(
    base_url=search.base_url,
    save_format=search.save_format,
    save_path=r'D:\Data\NLP\corpus\baike_search',
)

if __name__ == '__main__':
    spider.run(tasks, search.parser)
Beispiel #2
0
    :return: type:tuple:(解析得到的数据对象, 解析页面得到的新的tasks(要么为List,要么为None)
    """
    html = etree.HTML(html)
    info = html.xpath('//div[@class="col-md-8"]')[0]
    words = [re.sub(r'\(\d+\)', '', item.xpath('string(.)')) for item in info.xpath('./b')[:-1]]
    mean = info.xpath('./a/text()')[0]
    return {
        'mean': mean,
        'words': words
    }, None

# Task-generation script for the full crawl (kept for reference):
# tasks = []
# base_url = 'https://www.cilin.org/jyc/b_{}.html'
# for i in range(1, 9996):
#     tasks.append(i)
#
# spider = Spider(base_url=base_url, style='json', save_path=r'D:\Data\NLP\corpus\jyc')


# Small test run over page ids 1..29; list(range(...)) replaces the
# manual append loop.
tasks = list(range(1, 30))
base_url = 'https://www.cilin.org/jyc/b_{}.html'

spider = Spider(base_url=base_url, style='json', save_path=r'D:\Data\NLP\corpus\test')


if __name__ == '__main__':
    spider.run(tasks, handler)
Beispiel #3
0
        'content': content,
        'created_time': created_time,
        'categories': categories,
        'latest_time': latest_time,
        'tags': tags,
        'url': response.url
    }, None


if __name__ == '__main__':
    base_url = r'http://www.lightsmile.cn{}'
    archives_url = r'http://www.lightsmile.cn/archives'
    # Fetch the archive index page and extract every article link.
    archives_res = get_response(archives_url)
    archives_html = etree.HTML(archives_res.text)
    section = archives_html.xpath('//section[@class="archive"]')[0]
    # One task per article href; comprehension replaces the append loop.
    tasks = [archive.xpath('string(./@href)')
             for archive in section.xpath('.//a')]
    save_format = 'json'

    spider = Spider(base_url=base_url, save_format=save_format, save_path=r'D:\Data\NLP\corpus\my_blogs')
    spider.run(tasks, parser)
Beispiel #4
0
from lightspider.baike import info
from lightspider import Spider

base_url = 'https://baike.baidu.com/view/{}.htm'
# Baike page ids 4,000,000..4,999,999; list(range(...)) avoids the
# needless identity comprehension.
tasks = list(range(4000000, 5000000))
save_format = 'json'

# E-mail notification sent when the crawl finishes.
notification = {"to": "*****@*****.**", "task_name": "baike_info"}

spider = Spider(base_url=base_url,
                save_format=save_format,
                save_path=r'D:\Data\NLP\corpus\baike_info_4000000_to_5000000')

if __name__ == '__main__':
    spider.run(tasks, info.parser, notification=notification)
Beispiel #5
0
from lightspider.baike import href
from lightspider import Spider

# Encyclopedia entries to crawl hyperlinks from.
tasks = ['曹操', '曹操字孟德']

# E-mail notification sent when the crawl finishes.
notification = {"to": "*****@*****.**", "task_name": "baike_href"}

spider = Spider(base_url=href.base_url,
                save_format=href.save_format,
                save_path=r'D:\Data\NLP\corpus\baike_href')

if __name__ == '__main__':
    # Pass notification by keyword for consistency with the other examples
    # that call spider.run(..., notification=notification).
    spider.run(tasks, href.parser, notification=notification)
Beispiel #6
0
# @Software: PyCharm

from lightspider import Spider, light
from lxml import etree


@light
def parser(response):
    """Parse a hero-category wiki page.

    Returns a ({'category': <page title>, 'peoples': [(name, href), ...]},
    None) tuple; the None means no follow-up tasks are produced.
    """
    tree = etree.HTML(response.text)
    category = tree.xpath(
        'string(.//div[@class="list-group-item active-cat"])').strip()
    peoples = []
    for link in tree.xpath('.//div[@id="ipt-kb-affix-active-post"]/a'):
        name = link.xpath('string(.)').strip()
        href = link.xpath('string(./@href)')
        peoples.append((name, href))
    return {'category': category, 'peoples': peoples}, None


base_url = 'http://www.w3guo.com/wiki/hero/{}'
# One task per hero-category slug on the wiki.
tasks = ['other', 'wu', 'wei', 'shu', 'jin']

save_format = 'json'
save_path = r'D:\Data\KG\three_kingdoms_people'

spider = Spider(
    base_url=base_url,
    save_format=save_format,
    save_path=save_path,
    # interval=2 presumably throttles request pacing — confirm against Spider.
    interval=2,
)

if __name__ == '__main__':
    spider.run(tasks, parser)
Beispiel #7
0
from lightspider.baike import href
from lightspider import Spider

base_url = 'https://baike.baidu.com/view/{}.htm'
# Baike page ids 2,000,000..2,999,999; list(range(...)) avoids the
# needless identity comprehension.
tasks = list(range(2000000, 3000000))
save_format = 'json'

# E-mail notification sent when the crawl finishes.
notification = {"to": "*****@*****.**", "task_name": "baike_href"}

spider = Spider(base_url=base_url,
                save_format=save_format,
                save_path=r'D:\Data\NLP\corpus\baike_href_2000000_to_3000000')

if __name__ == '__main__':
    spider.run(tasks, href.parser, notification=notification)
Beispiel #8
0
    words = [
        re.sub(r'\(\d+\)', '', item.xpath('string(.)'))
        for item in info.xpath('./b')[:-1]
    ]
    mean = info.xpath('./a/text()')[0]
    return {'mean': mean, 'words': words}, None


# Task-generation script for the full crawl (kept for reference):
# tasks = []
# base_url = 'https://www.cilin.org/jyc/b_{}.html'
# for i in range(1, 9996):
#     tasks.append(i)
#
# spider = Spider(base_url=base_url, style='json', save_path=r'D:\Data\NLP\corpus\jyc')

# Small test run over page ids 1..29; list(range(...)) replaces the
# manual append loop.
tasks = list(range(1, 30))

base_url = 'https://www.cilin.org/jyc/b_{}.html'
save_format = 'json'

spider = Spider(base_url=base_url,
                save_format=save_format,
                save_path=r'D:\Data\NLP\corpus\test',
                # DEFAULT_PROXY is defined elsewhere in the project.
                proxy=DEFAULT_PROXY)

if __name__ == '__main__':
    spider.run(tasks, parser)