Example #1
    def parse(self, response):
        for item in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
            # extract() returns a list of strings; extract_first() returns one
            jobClass = item.xpath('text()').extract()
            jobUrl = item.xpath("@href").extract_first()

            oneItem = FirstItem()
            oneItem["jobClass"] = jobClass
            oneItem["jobUrl"] = jobUrl

            # jobUrl looks like https://www.lagou.com/zhaopin/Java/
            # and paginates as https://www.lagou.com/zhaopin/Java/2/?filterOption=3
            # Crawl the first 30 result pages for this job class.
            for i in range(30):
                jobUrl2 = jobUrl + str(i + 1)
                # A bare try/except around `yield` cannot catch download
                # errors; Scrapy reports those to the request's errback.
                yield scrapy.Request(url=jobUrl2,
                                     cookies=self.cookie,
                                     meta={"jobClass": jobClass},
                                     callback=self.parse_url)
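The try/except dropped from the loop above never fires for network failures, because the download happens later in Scrapy's engine, not at the `yield`. A minimal sketch of the errback pattern that does catch such failures (the spider name, URL scheme, and `handle_error` helper are illustrative, not from the original):

import scrapy


class JobSpiderSketch(scrapy.Spider):
    name = "jobs_errback_demo"
    start_urls = ["https://www.lagou.com/zhaopin/Java/"]

    def parse(self, response):
        for page in range(1, 31):
            # errback is invoked on DNS errors, timeouts, and non-2xx codes
            yield scrapy.Request(url=response.urljoin(str(page)),
                                 callback=self.parse_url,
                                 errback=self.handle_error)

    def parse_url(self, response):
        pass

    def handle_error(self, failure):
        # failure is a twisted.python.failure.Failure
        self.logger.error("request failed: %s", failure.request.url)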
Example #2
    def parse_url(self, response):
        jobClass = response.meta["jobClass"]

        for sel2 in response.xpath('//ul[@class="item_con_list"]/li'):
            jobName = sel2.xpath('div/div/div/a/h3/text()').extract()
            jobPlace = sel2.xpath('div/div/div/a/span/em/text()').extract()
            jobMoney = sel2.xpath('div/div/div/div/span/text()').extract()
            # The positional indexes below pick one text node out of the
            # several returned per element; they break if the layout changes.
            jobNeed = sel2.xpath('div/div/div/div/text()').extract()
            jobNeed = jobNeed[2].strip()
            jobCompany = sel2.xpath('div/div/div/a/text()').extract()
            jobCompany = jobCompany[3].strip()

            jobType = sel2.xpath('div/div/div/text()').extract()
            jobType = jobType[7].strip()

            jobSpesk = sel2.xpath(
                'div[@class="list_item_bot"]/div/text()').extract()
            jobSpesk = jobSpesk[-1].strip()

            oneItem = FirstItem()
            oneItem["jobName"] = jobName
            oneItem["jobPlace"] = jobPlace
            oneItem["jobMoney"] = jobMoney
            oneItem["jobNeed"] = jobNeed
            oneItem["jobCompany"] = jobCompany
            oneItem["jobType"] = jobType
            oneItem["jobSpesk"] = jobSpesk
            yield oneItem
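Every example stores its fields in FirstItem, whose definition the listing never shows. A minimal sketch of what items.py would have to declare for the two examples above to run; the field set is inferred from the assignments, not taken from the original project, and later examples reference further fields (spider_name, docName, title, targetUrl, jname, and so on) that would be declared the same way:

import scrapy


class FirstItem(scrapy.Item):
    # Fields inferred from the assignments in Examples #1 and #2
    jobClass = scrapy.Field()
    jobUrl = scrapy.Field()
    jobName = scrapy.Field()
    jobPlace = scrapy.Field()
    jobMoney = scrapy.Field()
    jobNeed = scrapy.Field()
    jobCompany = scrapy.Field()
    jobType = scrapy.Field()
    jobSpesk = scrapy.Field()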
Example #3
    def parse(self, response):
        for item in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
            jobClass = item.xpath('string()').extract()  # extract the text content
            jobUrl = item.xpath("@href").extract_first()  # extract the href link

            oneItem = FirstItem()  # instantiate an item and store the scraped data
            oneItem["jobClass"] = jobClass
            oneItem["jobUrl"] = jobUrl
            yield oneItem  # hand the item to the pipeline
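Example #1 selects with text() while this one uses string(); the difference only shows when the <a> element has child tags. A standalone sketch with Scrapy's Selector makes it visible (the sample HTML is made up for illustration):

from scrapy.selector import Selector

# text() returns only the element's direct text nodes; string()
# concatenates all descendant text into a single string.
sel = Selector(text='<a>Java <em>backend</em> jobs</a>')
print(sel.xpath('//a/text()').extract())   # ['Java ', ' jobs']
print(sel.xpath('string(//a)').extract())  # ['Java backend jobs']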
Example #4
    def __init__(self):
        self.logger.info('log from jidian.py')
        self.config = ConfigParser()
        # os.getcwd() has no trailing slash, so concatenating './spiders/...'
        # built a broken path like '/cwd./spiders/...'; os.path.join fixes it.
        self.config.read(os.path.join(os.getcwd(), 'spiders/config/config.ini'))
        self.items = FirstItem()
        self.items['spider_name'] = 'jidian'
        try:
            with open(os.path.join(os.getcwd(),
                                   'spiders/url_deduplication/jidian.txt'), 'r') as f:
                self.url_list = f.read().splitlines()
        except OSError:
            # No deduplication file yet: start with an empty URL list
            self.url_list = []
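The __init__ above loads URLs recorded by earlier runs so the spider can skip them. A pathlib-based sketch of the same idea; the directory layout is assumed from the snippet, and load_seen_urls is a hypothetical helper, not part of the original project:

from pathlib import Path


def load_seen_urls(spider_name: str) -> set:
    """Return URLs recorded by a previous run, or an empty set."""
    dedup_file = Path.cwd() / 'spiders' / 'url_deduplication' / f'{spider_name}.txt'
    if dedup_file.exists():
        return set(dedup_file.read_text().splitlines())
    return set()


seen = load_seen_urls('jidian')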
Example #5
    def parse(self, response):
        for item in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
            jobClass = item.xpath('text()').extract()
            jobUrl = item.xpath("@href").extract_first()

            # Note: oneItem is built but never yielded or passed along;
            # Example #1 shows how to carry jobClass through meta instead.
            oneItem = FirstItem()
            oneItem["jobClass"] = jobClass
            oneItem["jobUrl"] = jobUrl
            yield scrapy.Request(url=jobUrl,
                                 cookies=self.cookie,
                                 callback=self.parse_url)
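scrapy.Request needs an absolute URL, so this example breaks if @href is relative. response.follow resolves relative links against response.url and even accepts the <a> selector directly; a sketch of the same loop under that API (same page structure assumed):

def parse(self, response):
    for link in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
        # response.follow resolves relative hrefs and reads @href
        # from an <a> selector on its own.
        yield response.follow(link, callback=self.parse_url)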
Example #6
    def parse_url(self, response):
        # Debug marker confirming the callback actually fired
        print("entered parse_url")
        title = response.meta["title"]
        print(title)

        for sel2 in response.xpath('//a[@class="Author logSend"]'):
            docName = sel2.xpath("text()").extract()

            oneItem = FirstItem()
            oneItem["docName"] = docName

            # The item is only printed here; add `yield oneItem`
            # to actually send it through the pipeline.
            print(oneItem["docName"])
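Print-debugging works but gets lost in Scrapy's console output; the spider's built-in logger is the idiomatic alternative. A short sketch (spider name and messages are illustrative):

import scrapy


class DocSpiderSketch(scrapy.Spider):
    name = "doc_debug_demo"

    def parse_url(self, response):
        # self.logger routes through Scrapy's logging config, so messages
        # carry the spider name and respect the LOG_LEVEL setting.
        self.logger.debug("entered parse_url for %s", response.url)
        self.logger.info("title from meta: %s", response.meta.get("title"))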
Example #7
    def parse(self, response):
        for item in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
            jobClass = item.xpath('text()').extract()
            jobUrl = item.xpath("@href").extract_first()

            oneItem = FirstItem()
            oneItem["jobClass"] = jobClass
            oneItem["jobUrl"] = jobUrl
            # Same 30-page pagination as Example #1; as there, a try/except
            # around `yield` cannot catch download errors, so it is dropped.
            for i in range(30):
                jobUrl2 = jobUrl + str(i + 1)
                yield scrapy.Request(url=jobUrl2,
                                     cookies=self.cookie,
                                     callback=self.parse_url)
Example #8
    def parse_url(self, response):

        for sel in response.xpath('//*[@id="s_position_list"]/ul/li'):
            jname = sel.xpath('div/div/div/a/h3/text()').extract_first()
            jmoney = sel.xpath('div/div/div/div/span/text()').extract_first()

            # Positional indexes select one text node among several
            # and are brittle against layout changes.
            jcompany = sel.xpath('div/div/div/a/text()').extract()
            jcompany = jcompany[3].strip()

            jneed = sel.xpath('div/div/div/div/text()').extract()
            jneed = jneed[2].strip()

            jaddress = sel.xpath(
                'div/div/div/a/span/em/text()').extract_first()

            fi = FirstItem()
            fi['jname'] = jname
            fi['jmoney'] = jmoney
            fi['jneed'] = jneed
            fi['jcompany'] = jcompany
            fi['jaddress'] = jaddress

            yield fi
Example #9
    def __init__(self):
        self.turn_page = True
        self.logger.info('log from gonggongziyuan_13.py')
        self.config = ConfigParser()
        # os.path.join avoids the broken '/cwd./spiders/...' concatenation
        self.config.read(os.path.join(os.getcwd(), 'spiders/config/config.ini'))
        self.items = FirstItem()
        self.items['spider_name'] = 'gonggongziyuan_13'
        try:
            with open(os.path.join(
                    os.getcwd(),
                    'spiders/url_deduplication/gonggongziyuan_13.txt'), 'r') as f:
                self.url_list = f.read().splitlines()
        except OSError:
            self.url_list = []
        # Request parameters (presumably POST form data) for the
        # site's paginated listing endpoint
        self.data = {
            'currentPage': '1',
            'area': '000',
            'industriesTypeCode': '',
            'scrollValue': '865',
            'tenderProjectCode': '',
            'bulletinName': '',
            'secondArea': ''
        }
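The self.data dict built above looks like form data for a paginated POST endpoint, though the snippet does not show where it is sent. A hedged sketch of how such a dict is typically submitted with scrapy.FormRequest; the URL is made up and the spider name is illustrative:

import scrapy


class TenderSpiderSketch(scrapy.Spider):
    name = "formdata_demo"

    def start_requests(self):
        # Hypothetical endpoint; formdata values must all be strings.
        data = {'currentPage': '1', 'area': '000'}
        yield scrapy.FormRequest(url="https://example.com/search",
                                 formdata=data,
                                 callback=self.parse)

    def parse(self, response):
        self.logger.info("got %d bytes", len(response.body))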
Example #10
    def __init__(self):
        self.turn_page = True
        self.logger.info('log from guojiaji_3.py')
        self.config = ConfigParser()
        # os.path.join avoids the broken '/cwd./spiders/...' concatenation
        self.config.read(os.path.join(os.getcwd(), 'spiders/config/config.ini'))
        self.items = FirstItem()
        self.items['spider_name'] = 'guojiaji_3'
        try:
            with open(os.path.join(
                    os.getcwd(),
                    'spiders/url_deduplication/guojiaji_3.txt'), 'r') as f:
                self.url_list = f.read().splitlines()
        except OSError:
            self.url_list = []
        # Request parameters for the "load more news" endpoint;
        # 'next' is presumably the page counter.
        self.data = {
            'next': '1',
            'table': 'news',
            'classid': '2,3,4,5,6',
            'action': 'getmorenews',
            'limit': '15',
            'small_length': '120'
        }
Example #11
    def parse(self, response):
        # Debug marker confirming parse() was reached
        print("entered parse")

        # An earlier attempt iterated the <dt> links instead:
        # for item in response.xpath("//div/dl/dt/a"):
        #     title = item.xpath("text()").extract()
        #     targetUrl = item.xpath("@href").extract()
        #
        #     oneItem = FirstItem()
        #     oneItem["title"] = title
        #     oneItem["targetUrl"] = targetUrl
        #     print(oneItem)

        for item in response.xpath("//div/dl/dd/a"):
            title = item.xpath("text()").extract()
            targetUrl = item.xpath("@href").extract_first()

            oneItem = FirstItem()
            oneItem["title"] = title
            oneItem["targetUrl"] = targetUrl
            # Note: the extracted targetUrl is ignored here; every request
            # goes to the same hard-coded listing page.
            yield scrapy.Request(url="https://wenku.baidu.com/list/71",
                                 meta={"title": title},
                                 callback=self.parse_url)
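To follow each extracted link instead of the hard-coded page, the relative href would need resolving first. A sketch using response.urljoin; the surrounding spider class is implied, not shown in the original:

def parse(self, response):
    for item in response.xpath("//div/dl/dd/a"):
        title = item.xpath("text()").extract_first()
        target_url = item.xpath("@href").extract_first()
        if target_url:
            # urljoin resolves relative hrefs against response.url
            yield scrapy.Request(url=response.urljoin(target_url),
                                 meta={"title": title},
                                 callback=self.parse_url)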