def parse(self, response):
    """Parse a liepin.com page.

    On job detail pages (job.liepin.com) scrapes one LiepinJobItem; on every
    page, follows all liepin links found (skipping the mobile site).

    Args:
        response: the Scrapy response for the fetched page.

    Yields:
        LiepinJobItem: one item per non-expired job detail page.
        Request: follow-up requests for other liepin URLs on the page.
    """
    if "job.liepin.com" in response.url:
        soup = BeautifulSoup(response.body, 'lxml')
        # A 'title-info over' div marks the posting as closed -- skip those.
        if not soup.find('div', 'title-info over'):
            item = LiepinJobItem()
            item['name'] = soup.find('div', class_='title-info').find('h1').get_text().strip()
            # First whitespace-separated token of the main title line is the salary.
            item['salary'] = soup.find('p', class_='job-main-title').get_text().strip().split()[0].strip()
            item['jobid'] = Util.pathnumber_job(response.url)
            item['companyid'] = Util.pathnumber_company(
                soup.find('div', class_='right-post-top').find('a')['href'])
            item['location'] = soup.find('p', class_='basic-infor').find('span').get_text().strip()

            # Requirement badges (experience, degree, ...), space-separated.
            # join() instead of += avoids quadratic string building; the
            # trailing space matches the original output exactly.
            resume_div = soup.find('div', class_='resume clearfix')
            if resume_div:
                item['require'] = ''.join(
                    span.get_text() + ' ' for span in resume_div.find_all('span'))
            else:
                item['require'] = ''

            # Job tags, comma-separated (keeps the original trailing comma).
            tag_div = soup.find('div', class_='tag-list clearfix')
            tag_spans = tag_div.find_all('span') if tag_div else []
            item['tag'] = ''.join(tag.get_text().strip() + ',' for tag in tag_spans)

            # NOTE(review): this stores the bs4 Tag object itself, not its
            # text -- presumably the pipeline serializes it; confirm before
            # changing to .get_text().
            item['jobdes'] = soup.find('div', class_='content content-word')

            # The second 'job-main main-message ' block (trailing space is in
            # the site's markup) holds the extra <li> facts, when present.
            job_main_divs = soup.find_all('div', class_='job-main main-message ')
            lis = job_main_divs[1].find_all('li') if len(job_main_divs) > 1 else []

            # Map a marker character in the <li> label span to the item field,
            # e.g. u'所' matches u'所属部门' (department). Each marker fills a
            # distinct field, so match order does not matter.
            field_by_marker = {
                u'所': 'partment',
                u'专': 'major',
                u'汇': 'report',
                u'下': 'sub',
                u'性': 'sex',
            }
            for field in field_by_marker.values():
                item[field] = ''
            for li in lis:
                span = li.find('span')
                if span is None:
                    # Guard: malformed <li> without a label span (the old
                    # code would raise AttributeError here).
                    continue
                label = span.get_text().strip()
                for marker, field in field_by_marker.items():
                    if marker in label:
                        # Drop the 5-character label prefix (e.g. u'所属部门：')
                        # from the first whitespace token of the <li> text.
                        item[field] = li.get_text().split()[0].strip()[5:].strip()
            yield item

    # Crawl outward: follow every liepin link, skipping the mobile site
    # ("m." hosts) and normalizing scheme-less hrefs.
    urls = response.selector.xpath("//a/@href[contains(.,'liepin')]").extract()
    for url in urls:
        if "m." not in url:
            if "http" in url:
                yield Request(url, callback=self.parse)
            else:
                yield Request("http://" + url, callback=self.parse)