def parse(self, response):
    """Parse a liepin job page into a LiepinJobItem, then follow liepin links.

    Yields:
        A ``LiepinJobItem`` for non-expired postings on job.liepin.com,
        plus a follow-up ``Request`` for every desktop (non-``m.``)
        liepin URL found on the page.
    """
    if "job.liepin.com" in response.url:
        soup = BeautifulSoup(response.body, 'lxml')
        # A 'title-info over' div marks an expired/closed posting -- presumably;
        # only live postings are scraped. TODO confirm against the site markup.
        if not soup.find('div', 'title-info over'):
            liepinItem = LiepinJobItem()
            liepinItem['name'] = soup.find('div', class_='title-info').find('h1').get_text().strip()
            liepinItem['salary'] = soup.find('p', class_='job-main-title').get_text().strip().split()[0].strip()
            liepinItem['jobid'] = Util.pathnumber_job(response.url)
            liepinItem['companyid'] = Util.pathnumber_company(
                soup.find('div', class_='right-post-top').find('a')['href'])
            liepinItem['location'] = soup.find('p', class_='basic-infor').find('span').get_text().strip()

            # Requirement spans, space-separated with a trailing space
            # (kept identical to the original concatenation format).
            resumespan = soup.find('div', class_='resume clearfix')
            spans = resumespan.find_all('span') if resumespan else []
            liepinItem['require'] = ''.join(span.get_text() + " " for span in spans)

            # Benefit tags, comma-separated with a trailing comma.
            divtags = soup.find('div', class_='tag-list clearfix')
            tags = divtags.find_all('span') if divtags else []
            liepinItem['tag'] = ''.join(tag.get_text().strip() + ',' for tag in tags)

            # Stored as a bs4 Tag (not text), matching downstream expectations.
            liepinItem['jobdes'] = soup.find('div', class_='content content-word')

            # The second 'job-main main-message ' div (when present) holds the
            # extra-attribute <li> rows parsed below.
            jobmaindiv = soup.find_all('div', class_='job-main main-message ')
            lis = jobmaindiv[1].find_all('li') if len(jobmaindiv) > 1 else []

            # Marker character appearing in the <span> label -> item field.
            # Each label is a 5-character Chinese prefix; the value follows it.
            marker_fields = [
                (u'所', 'partment'),
                (u'专', 'major'),
                (u'汇', 'report'),
                (u'下', 'sub'),
                (u'性', 'sex'),
            ]
            for _, field in marker_fields:
                liepinItem[field] = ''
            for li in lis:
                label = li.find('span').get_text().strip()
                value = li.get_text().split()[0].strip()[5:].strip()
                # Non-exclusive checks, as in the original: a label matching
                # several markers fills several fields.
                for marker, field in marker_fields:
                    if marker in label:
                        liepinItem[field] = value
            yield liepinItem

    # Follow every desktop liepin link on the page, normalising bare hosts.
    urls = response.selector.xpath("//a/@href[contains(.,'liepin')]").extract()
    for url in urls:
        if "m." not in url:
            if "http" in url:
                yield Request(url, callback=self.parse)
            else:
                yield Request("http://" + url, callback=self.parse)
def parse(self, response):
    """Parse a liepin company page into a LiepinCompanyItem, then follow links.

    Three layouts exist on company.liepin.com (big logo, normal logo,
    banner-only); vip.liepin.com pages carry no scrapable profile and
    yield no item. Incomplete records are flagged in the 'imperfect' set.

    Yields:
        A ``LiepinCompanyItem`` for company.liepin.com pages whose id
        exceeds 100000, plus follow-up ``Request``s for liepin links.
    """

    def set_empty(item, fields):
        # Fill the fields this page layout cannot provide with "".
        for field in fields:
            item[field] = ""

    liepinItem = LiepinCompanyItem()
    if Util.match_company(response.url):
        if "company.liepin.com" in response.url:
            normalLogo = response.selector.xpath("//img[@class='normalELogo']")
            bigLogo = response.selector.xpath("//img[@class='bigELogo']")
            bannerLogo = response.selector.xpath("//div[@class='banner']")
            soup = BeautifulSoup(response.body, 'lxml')
            if bigLogo:
                liepinItem['logo'] = soup.find('img', class_='bigELogo')['src']
                liepinItem['name'] = soup.find('div', class_='company-name').find('h1').get_text().strip()
                set_empty(liepinItem, ('size', 'property', 'field', 'finance',
                                       'website', 'location', 'intro'))
                liepinItem['address'] = soup.find(
                    'p', attrs={'data-selector': 'company-address'}).get_text().strip()
                tags = soup.find('section', class_="company-info").find(
                    'ul', class_='company-tags clearfix').find_all("li")
                # Leading ',' kept from the original format. BUG FIX: the
                # original used tag_text[::-1], which reversed the whole
                # string character-by-character; [:-1] (drop the trailing
                # comma) was clearly intended.
                tag_text = ',' + ''.join(tag.get_text().strip() + "," for tag in tags)
                liepinItem['tag'] = tag_text[:-1]
                # Marker character in the entry text -> item field; the first
                # 3 characters are the Chinese label, the rest the value.
                marker_fields = [
                    (u"领", 'field'),
                    (u"地", 'location'),
                    (u"官", 'website'),
                    (u"融", 'finance'),
                ]
                aboutuls = soup.find('ul', class_='about-list')
                # Guard: the original iterated None and crashed when the
                # about-list was missing.
                for entry in (aboutuls or []):
                    text = entry.get_text().strip()
                    for marker, field in marker_fields:
                        if marker in text:
                            liepinItem[field] = text[3:]
                liepinItem['companyid'] = Util.pathnumber_company(response.url)
                sadd("imperfect", liepinItem['companyid'])
            elif normalLogo:
                # Hoisted: the original re-ran this find chain five times.
                intro_sec = soup.find('section', class_="introduction")
                liepinItem['logo'] = intro_sec.find('img', class_='normalELogo')['src']
                liepinItem['name'] = intro_sec.find('div', class_='einfo').find('h2').get_text().strip()
                menu_lis = intro_sec.find('div', class_='e-menu').find(
                    'ul', class_='clearfix').find_all('li')
                liepinItem['size'] = menu_lis[0].get_text().strip()
                liepinItem['property'] = menu_lis[1].get_text().strip()
                liepinItem['location'] = menu_lis[2].get_text().strip()
                tag_list = intro_sec.find('div', class_='tag-list clearfix')
                spans = tag_list.find_all('span', class_='tag') if tag_list else []
                # Comma-joined; BUG FIX: [:-1] drops the trailing comma where
                # the original's [::-1] reversed the string by mistake.
                tag_text = ''.join(tag.get_text().strip() + "," for tag in spans)
                liepinItem['tag'] = tag_text[:-1]
                set_empty(liepinItem, ('field', 'finance', 'website', 'intro', 'address'))
                introdiv = soup.find('div', class_='intro-main')
                if introdiv:
                    # Stored as a bs4 Tag (not text), as in the original.
                    liepinItem['intro'] = introdiv
                addr = soup.find('p', class_='company-address')
                if addr:
                    liepinItem['address'] = addr.get_text().strip()
                liepinItem['companyid'] = Util.pathnumber_company(response.url)
            elif bannerLogo:
                # Banner-only pages expose nothing scrapable.
                set_empty(liepinItem, ('logo', 'name', 'size', 'property',
                                       'location', 'tag', 'field', 'finance',
                                       'website', 'intro', 'address'))
                liepinItem['companyid'] = Util.pathnumber_company(response.url)
                sadd("imperfect", liepinItem['companyid'])
            # Ids <= 100000 are skipped -- presumably template/legacy pages;
            # TODO confirm the threshold's meaning.
            if int(liepinItem['companyid']) > 100000:
                yield liepinItem
        elif "vip.liepin.com" in response.url:
            # VIP pages are recorded as empty, imperfect entries keyed by
            # the vip id; no item is yielded for them.
            set_empty(liepinItem, ('logo', 'name', 'size', 'property',
                                   'location', 'tag', 'field', 'finance',
                                   'website', 'intro', 'address'))
            liepinItem['companyid'] = Util.pathnumber_vip(response.url)
            sadd("imperfect", liepinItem['companyid'])

    # Follow every desktop liepin link on the page, normalising bare hosts.
    urls = response.selector.xpath("//a/@href[contains(.,'liepin')]").extract()
    for url in urls:
        if "m." not in url:
            if "http" in url:
                yield Request(url, callback=self.parse)
            else:
                yield Request("http://" + url, callback=self.parse)