def parse(self, response):
    """Extract job-category links from the menu box and schedule one
    listing-page request per page (1..30) for each category.

    The category name travels to ``parse_url`` via request ``meta``.
    Listing URLs follow the pattern ``.../zhaopin/<Category>/<page>``.
    """
    for link in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
        jobClass = link.xpath('text()').extract()
        jobUrl = link.xpath("@href").extract_first()

        # Pages are 1-based; the original hard-coded a 30-page crawl depth.
        for page in range(1, 31):
            try:
                yield scrapy.Request(
                    url=jobUrl + str(page),
                    cookies=self.cookie,
                    meta={"jobClass": jobClass},
                    callback=self.parse_url,
                )
            except (TypeError, ValueError):
                # FIX: was a bare `except: pass`, which also swallows
                # KeyboardInterrupt/SystemExit. TypeError covers a missing
                # href (jobUrl is None); ValueError covers URLs rejected
                # by scrapy.Request. Skip the bad page, keep crawling.
                pass
def parse_url(self, response):
    """Parse one listing page: yield a FirstItem per job posting <li>.

    Positional indices into the extracted node lists ([2], [3], [7], [-1])
    reflect the fixed markup of the listing page; they are preserved as-is.
    """
    jobClass = response.meta["jobClass"]
    for li in response.xpath('//ul[@class="item_con_list"]/li'):
        job = FirstItem()
        job["jobName"] = li.xpath('div/div/div/a/h3/text()').extract()
        job["jobPlace"] = li.xpath('div/div/div/a/span/em/text()').extract()
        job["jobMoney"] = li.xpath('div/div/div/div/span/text()').extract()
        job["jobNeed"] = li.xpath('div/div/div/div/text()').extract()[2].strip()
        job["jobCompany"] = li.xpath('div/div/div/a/text()').extract()[3].strip()
        job["jobType"] = li.xpath('div/div/div/text()').extract()[7].strip()
        job["jobSpesk"] = li.xpath(
            'div[@class="list_item_bot"]/div/text()').extract()[-1].strip()
        yield job
def parse(self, response):
    """Yield one FirstItem (category text + link href) per menu anchor."""
    anchors = response.xpath('//div[@class="menu_box"]/div/dl/dd/a')
    for anchor in anchors:
        entry = FirstItem()
        entry["jobClass"] = anchor.xpath('string()').extract()   # text content
        entry["jobUrl"] = anchor.xpath("@href").extract_first()  # link target
        yield entry
def __init__(self):
    """Initialise the spider: load config, prepare the item template and
    the previously-seen-URL list used for deduplication."""
    self.logger.info('jidian.py的日志')
    self.config = ConfigParser()
    # FIX: `os.getcwd() + './spiders/...'` produced a broken path like
    # '/cwd./spiders/config/config.ini'; build the path properly instead.
    self.config.read(os.path.join(os.getcwd(), 'spiders', 'config', 'config.ini'))
    self.items = FirstItem()
    self.items['spider_name'] = 'jidian'
    dedup_path = os.path.join(
        os.getcwd(), 'spiders', 'url_deduplication', '{}.txt'.format('jidian'))
    try:
        with open(dedup_path, 'r') as f:
            self.url_list = f.read().splitlines()
    except OSError:
        # FIX: narrowed from `except Exception` (unused `e`). A missing
        # dedup file just means no crawl history yet — start empty.
        self.url_list = []
def parse(self, response):
    """Follow every job-category link found in the menu box."""
    menu_links = response.xpath('//div[@class="menu_box"]/div/dl/dd/a')
    for link in menu_links:
        category = link.xpath('text()').extract()
        href = link.xpath("@href").extract_first()
        # Item is populated here exactly as before (it is not yielded;
        # the category/link pair only feeds the follow-up request).
        entry = FirstItem()
        entry["jobClass"] = category
        entry["jobUrl"] = href
        yield scrapy.Request(url=href, cookies=self.cookie,
                             callback=self.parse_url)
def parse_url(self, response):
    """Extract author names from the page and yield one FirstItem each.

    The page title arrives via request ``meta`` (set by the scheduling
    callback); it is read here so a missing key fails loudly, as before.
    """
    title = response.meta["title"]
    for author_link in response.xpath('//a[@class="Author logSend"]'):
        oneItem = FirstItem()
        oneItem["docName"] = author_link.xpath("text()").extract()
        # FIX: the item was previously built and printed (debug scaffolding)
        # but never yielded, so the item pipeline received nothing.
        yield oneItem
def parse(self, response):
    """Schedule listing-page requests (pages 1..30) for every job category
    link found in the menu box."""
    for link in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
        jobClass = link.xpath('text()').extract()
        jobUrl = link.xpath("@href").extract_first()
        oneItem = FirstItem()
        oneItem["jobClass"] = jobClass
        oneItem["jobUrl"] = jobUrl
        for page in range(1, 31):
            try:
                yield scrapy.Request(url=jobUrl + str(page),
                                     cookies=self.cookie,
                                     callback=self.parse_url)
            except (TypeError, ValueError):
                # FIX: was a bare `except: pass`, which even swallows
                # KeyboardInterrupt. TypeError covers jobUrl being None;
                # ValueError covers URLs rejected by scrapy.Request.
                pass
def parse_url(self, response):
    """Yield one FirstItem per <li> in the position-list container.

    Indices [2] and [3] into the extracted node lists match the page's
    fixed markup and are kept exactly as in the original.
    """
    for li in response.xpath('//*[@id="s_position_list"]/ul/li'):
        record = FirstItem()
        record['jname'] = li.xpath('div/div/div/a/h3/text()').extract_first()
        record['jmoney'] = li.xpath('div/div/div/div/span/text()').extract_first()
        record['jneed'] = li.xpath('div/div/div/div/text()').extract()[2].strip()
        record['jcompany'] = li.xpath('div/div/div/a/text()').extract()[3].strip()
        record['jaddress'] = li.xpath('div/div/div/a/span/em/text()').extract_first()
        yield record
def __init__(self):
    """Initialise the spider: paging flag, config, item template,
    seen-URL list, and the POST form data for the listing endpoint."""
    self.turn_page = True
    self.logger.info('gonggongziyuan_13.py的日志')
    self.config = ConfigParser()
    # FIX: `os.getcwd() + './spiders/...'` produced a broken path like
    # '/cwd./spiders/config/config.ini'; join the components properly.
    self.config.read(os.path.join(os.getcwd(), 'spiders', 'config', 'config.ini'))
    self.items = FirstItem()
    self.items['spider_name'] = 'gonggongziyuan_13'
    dedup_path = os.path.join(
        os.getcwd(), 'spiders', 'url_deduplication',
        '{}.txt'.format('gonggongziyuan_13'))
    try:
        with open(dedup_path, 'r') as f:
            self.url_list = f.read().splitlines()
    except OSError:
        # FIX: narrowed from `except Exception` (unused `e`). A missing
        # dedup file just means no crawl history yet — start empty.
        self.url_list = []
    # Form payload for the paginated tender-bulletin search request.
    self.data = {
        'currentPage': '1',
        'area': '000',
        'industriesTypeCode': '',
        'scrollValue': '865',
        'tenderProjectCode': '',
        'bulletinName': '',
        'secondArea': ''
    }
def __init__(self):
    """Initialise the spider: paging flag, config, item template,
    seen-URL list, and the POST form data for the news-list endpoint."""
    self.turn_page = True
    self.logger.info('guojiaji_3.py的日志')
    self.config = ConfigParser()
    # FIX: `os.getcwd() + './spiders/...'` produced a broken path like
    # '/cwd./spiders/config/config.ini'; join the components properly.
    self.config.read(os.path.join(os.getcwd(), 'spiders', 'config', 'config.ini'))
    self.items = FirstItem()
    self.items['spider_name'] = 'guojiaji_3'
    dedup_path = os.path.join(
        os.getcwd(), 'spiders', 'url_deduplication',
        '{}.txt'.format('guojiaji_3'))
    try:
        with open(dedup_path, 'r') as f:
            self.url_list = f.read().splitlines()
    except OSError:
        # FIX: narrowed from `except Exception` (unused `e`). A missing
        # dedup file just means no crawl history yet — start empty.
        self.url_list = []
    # Form payload for the "get more news" AJAX endpoint.
    self.data = {
        'next': '1',
        'table': 'news',
        'classid': '2,3,4,5,6',
        'action': 'getmorenews',
        'limit': '15',
        'small_length': '120'
    }
def parse(self, response):
    """Extract title/href from every <dd> anchor and follow a listing page.

    NOTE(review): the follow-up request URL is hard-coded to
    https://wenku.baidu.com/list/71 and ignores the extracted targetUrl
    (the commented-out original joined targetUrl onto the site root) —
    looks like leftover debugging; confirm intent before changing it.
    """
    # FIX: removed the 'xxxx...' debug print and a dead commented-out
    # duplicate loop over //div/dl/dt/a.
    for anchor in response.xpath("//div/dl/dd/a"):
        title = anchor.xpath("text()").extract()
        targetUrl = anchor.xpath("@href").extract_first()
        oneItem = FirstItem()
        oneItem["title"] = title
        oneItem["targetUrl"] = targetUrl
        yield scrapy.Request(url="https://wenku.baidu.com/list/71",
                             meta={"title": title},
                             callback=self.parse_url)