# Example 1
    def parse(self, response):
        """Extract pagination and job-detail links from a Liepin campus listing page
        and push both into the redis queues via insert_into_job_spider."""
        # href of the "next page" control (7th anchor in the pager bar)
        next_url = get_first_element(
            response.xpath(".//div[@class='pagerbar']/a[7]/@href").extract())
        # hrefs of every job posting listed on this page
        details_urls = response.xpath(
            ".//ul[@class='super-jobs job-lists']/li/div/p[@class='job-name']/span/a/@href"
        ).extract()

        # Follow the next page only while this page still yielded postings
        # (queue type 5 = liepin start_url list)
        if next_url and details_urls:
            insert_into_job_spider(next_url.strip(), 5)
        # Queue every posting hosted on the campus domain (queue type 6 = request list)
        for detail_url in details_urls:
            if detail_url.startswith('https://campus.liepin.com'):
                insert_into_job_spider(detail_url.strip(), 6)
# Example 2
    def parse(self, response):
        """Harvest the next-page link and all job-detail links from a 51job
        search-result page, queueing both into redis."""
        # href of the "next page" pagination control
        next_url = get_first_element(
            response.xpath(".//li[@class='bk']/a/@href").extract())
        # hrefs of every job row in the result list
        details_urls = response\
            .xpath(".//div[@id='resultList']/div[@class='el']/p[@class='t1 ']/span/a/@href")\
            .extract()

        # Paginate only while the current page still produced postings
        # (queue type 3 = 51job start_url list)
        if next_url and details_urls:
            insert_into_job_spider(next_url, 3)
        # Queue each on-domain posting (queue type 4 = request list)
        for detail_url in details_urls:
            if detail_url.startswith('https://jobs.51job.com'):
                insert_into_job_spider(detail_url, 4)
# Example 3
    def parse(self, response):
        """Collect pagination and job-detail links (relative hrefs) and queue
        them in redis after prefixing the spider's own domain."""
        domain_prefix = "https://" + self.allowed_domains[0]
        # The last anchor in the pagination bar is the "next page" control
        next_url = get_last_element(
            response.xpath(
                ".//div[@class='pagination-bar']/a/@href").extract())
        # Relative hrefs of every job card on the page
        details_urls = response.xpath(
            ".//a[@class='job-name Fellip']/@href").extract()

        # 'javascript:;' marks a disabled "next" control on the final page;
        # also stop paginating once a page yields no postings
        # (queue type 7 = start_url list)
        if next_url and next_url != 'javascript:;' and details_urls:
            insert_into_job_spider((domain_prefix + next_url).strip(), 7)
        # Queue every detail page (queue type 8 = request list)
        for detail_url in details_urls:
            insert_into_job_spider((domain_prefix + detail_url).strip(), 8)
# Example 4
    def parse(self, response):
        """Queue the next results page and every job-detail link found on it."""
        # The anchor wrapping the "next page" label carries the href
        next_url = get_first_element(
            response.xpath(
                ".//span[@class='font12 pageNext']/parent::*/@href").extract())
        # Job links inside the search-result info column
        details_urls = response.xpath(
            "//div[@class='searchResultJobinfo fr']/p[1]/a/@href").extract()

        # Paginate only while this page still produced results
        # (queue type 1 = start_url list)
        if next_url and details_urls:
            insert_into_job_spider(
                "https://" + self.allowed_domains[0] + next_url, 1)
        # Queue each detail page; some hrefs are protocol-relative (//host/...),
        # so give those an https scheme (queue type 2 = request list)
        for detail_url in details_urls:
            if detail_url.startswith('http'):
                insert_into_job_spider(detail_url, 2)
            else:
                insert_into_job_spider("https:" + detail_url, 2)
# Example 5
from JobSpider.spiders.zhilian_spider import ZhilianSpider
from JobSpider.spiders.wuyou_spider import WuyouSpider
from JobSpider.spiders.liepin_spider import LiepinSpider
from JobSpider.spiders.chinahr_spider import ChinahrSpider

# Init: seed the per-site redis start_url lists with each spider's start_urls.
# Progress is written both to stdout and to the log.
print("======Init spider start")
logging.info("======Init spider start")

# zhilian — queue type 1 maps to redis list job_spider_start_urls_zl
print('======Init insert start_url into redis[job_spider_start_urls_zl] start')
logging.info(
    '======Init insert start_url into redis[job_spider_start_urls_zl] start')

for url in ZhilianSpider.start_urls:
    insert_into_job_spider(url, 1)

print('======Init insert start_url into redis[job_spider_start_urls_zl] end')
logging.info(
    '======Init insert start_url into redis[job_spider_start_urls_zl] end')

# wuyou — queue type 3 maps to redis list job_spider_start_urls_wy
print('======Init insert start_url into redis[job_spider_start_urls_wy] start')
logging.info(
    '======Init insert start_url into redis[job_spider_start_urls_wy] start')

for url in WuyouSpider.start_urls:
    insert_into_job_spider(url, 3)

print('======Init insert start_url into redis[job_spider_start_urls_wy] end')
logging.info(