# NOTE: the module-level imports are not part of this excerpt; the spiders
# below assume roughly (Python 2):
#   import collections, json, urllib
#   import scrapy
#   from datetime import datetime
#   from random import choice
#   from scrapy.http import FormRequest
#   from scrapy.exceptions import CloseSpider
#   from sqlalchemy import exc
#   from w3lib.html import remove_tags
# plus the project's own hash_dn helper and the spiders' clean_tag /
# convert_pdate methods.


def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")

    # Empty response: retry, and give up once the failure threshold is hit
    # (a factored-out sketch of this guard follows this method).
    if not response.body:
        self.error_count += 1
        if self.error_count >= self.error_threshold:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url.encode('utf-8')))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url, proxy=proxy))
            # dont_filter so Scrapy's dupefilter does not silently drop the
            # retry of an already-seen URL.
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info(
                '[ JobPageRetry ] {url}'.format(url=response.url))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    # Archive the raw HTML before parsing.
    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))

    try:
        ret = {}
        ret['company'] = response.xpath(
            '//div[@class="company-head m-t-10"]/text()').extract_first()
        ret['pos'] = response.xpath(
            '//div[@id="jobinfo"]/div/div[@class="title"]/text()'
        ).extract_first()
        ret['desc'] = self.clean_tag(response.xpath(
            '//div[@class="jobinfo-desc m-t-15"]').extract_first())
        # The qualification table is indexed repeatedly; select it once.
        qualifi = response.xpath(
            './/table[@class="qualifi-table"]/tr/td/text()')
        ret['nation'] = qualifi[0].extract()
        ret['sex'] = qualifi[2].extract()
        ret['age'] = ' '.join(qualifi[3].extract().strip().split())
        ret['sal'] = ' '.join(qualifi[4].extract().strip().split())
        ret['edu'] = qualifi[5].extract()
        ret['exp'] = qualifi[6].extract()
        ret['loc'] = response.xpath(
            '//div[@class="place"]/text()').extract_first().strip()
        ret['amnt'] = qualifi[7].extract()
        ret['etype'] = qualifi[8].extract()
        ret['benef'] = self.clean_tag(response.xpath(
            '//div[@class="col-md-9 company-image-box"]/div'
            '/div[@class="col-md-12 padding-no m-t-15"]'
            '/div[@class="jobinfo-desc"]')[0].extract())
        ret['pdate'] = response.xpath(
            '//div[@class="jobinfo-update"]/text()').extract_first().split(
                ':')[1].strip()

        # Stop once postings from 2017 start showing up.
        if ret['pdate'].split('/')[-1] == "2017":
            self.killed = 1
            self.logger.info("[ JobEndReached ] 2017 reached")
            raise CloseSpider("2017 reached")

        for key in ret.keys():
            if ret[key]:
                ret[key] = ret[key].strip().encode('utf-8')

        # Hash of description + company identifies a posting across crawls.
        _hash = hash_dn(ret['desc'], ret['company'])

        # Log the result to MySQL; a duplicate hash raises IntegrityError.
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp=ret['exp'],
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
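# A minimal sketch of the empty-response retry guard that every parse_detail
# in this module repeats, factored into one helper. error_threshold,
# use_proxy, and proxies are the spider attributes used above; the helper
# itself is hypothetical, not part of the repo.
from random import choice

import scrapy


def retry_or_give_up(spider, response):
    """Return a retry Request while under the failure threshold, else None."""
    spider.error_count += 1
    if spider.error_count >= spider.error_threshold:
        return None  # give up; the caller logs the failure and moves on
    meta = {}
    if spider.use_proxy:
        meta['proxy'] = choice(spider.proxies)  # rotate to a random proxy
    # dont_filter is required: Scrapy's dupefilter has already seen this URL
    # and would otherwise silently drop the retried request.
    return scrapy.Request(response.url, callback=spider.parse_detail,
                          meta=meta, dont_filter=True)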
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")

    ### Handle the case when the response from the web server is empty:
    # retry the request; after 5 failures in a row, log the error and move on.
    if not response.body:
        self.error_count += 1
        if self.error_count >= self.error_threshold:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url.encode('utf-8')))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url.encode('utf-8'), proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info('[ JobPageRetry ] {url}'.format(
                url=response.url.encode('utf-8')))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0
    ###

    ### Write the HTML archive.
    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))
    ###

    try:
        ### Parse the job information.
        contents = response.xpath(
            './/div[@class="wrapper-preview-list"]'
            '/div[contains(@class,"row tr")]/div[contains(@class,"col-sm")]')
        content_str = [
            self.clean_tag(content.xpath('./div/div')[1].extract())
            for content in contents[:10]
        ]
        pos, company = [
            x.strip() for x in response.xpath(
                '//h1[@class="title-section c4 xs-mt5"]/text()'
            ).extract_first().split(',', 1)
        ]
        ret = {}
        ret['company'] = company
        ret['pos'] = pos
        ret['etype'] = content_str[1]
        ret['indus'] = content_str[2]
        ret['amnt'] = content_str[3]
        ret['sal'] = content_str[4]
        ret['exp'] = content_str[5]
        ret['sex'] = content_str[6]
        ret['edu'] = content_str[7]
        ret['loc'] = content_str[8]
        ret['desc'] = '|'.join(
            [x.strip() for x in contents[11].xpath('./text()').extract()])
        ret['pdate'] = self.convert_pdate(response.xpath(
            '//span[@itemprop="datePosted"]/text()').extract_first())
        if ret['pdate'].split()[0].split('-')[0] == "2017":
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1
            raise CloseSpider("2017 reached")
        for key in ret.keys():
            if ret[key]:
                ret[key] = ret[key].strip().encode('utf-8')
        ###

        # Create a hash for tracking jobs across crawls.
        _hash = hash_dn(ret['desc'], ret['company'])

        ### Stop the spider after encountering an already-crawled record 3
        # times in a row, to keep it from stopping over just a few old
        # records that can reappear because of new job updates.
        #if self.sqllogger.check_existing(_hash, self.web_id):
        #    if self.repeat_count >= self.repeat_threshold:
        #        self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold")
        #        self.killed = 1
        #        raise CloseSpider("Crawled record reached")
        #    else:
        #        self.repeat_count += 1
        #        self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count)
        #        yield None
        #        return
        #self.repeat_count = 0
        ###

        ### Log the result to MySQL.
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp=ret['exp'],
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            ### Detect an old record by catching the error MySQL throws when
            # the primary key (the hash) repeats; its error code is 1062.
            ### Stop the spider only after meeting a crawled record 3 times
            # IN A ROW, so a few stale records caused by job updates don't
            # kill it (a standalone sketch of this rule follows this method).
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        ###
        self.repeat_count = 0
        ###
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
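# The IntegrityError branch above implements the stop rule the comments
# describe: MySQL raises error 1062 on a duplicate primary key (the record
# hash), and the spider only dies after repeat_threshold duplicates in a
# row. A hypothetical sketch of that rule in isolation, using the same
# repeat_count / repeat_threshold / killed attributes:
from scrapy.exceptions import CloseSpider


def handle_duplicate(spider, error):
    """Classify an IntegrityError; True means 'old record, skip and go on'."""
    if error.orig.args[0] != 1062:
        raise error  # not a duplicate-key error, let it propagate
    if spider.repeat_count >= spider.repeat_threshold:
        spider.killed = 1
        raise CloseSpider("Crawled record reached")
    # Duplicate seen, but still under the threshold. Callers reset
    # repeat_count to 0 after a successful insert, so the threshold only
    # trips on consecutive duplicates.
    spider.repeat_count += 1
    return True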
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")
    if not response.body:
        self.error_count += 1
        if self.error_count >= self.error_threshold:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url.encode('utf-8')))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url.encode('utf-8'), proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info('[ JobPageRetry ] {url}'.format(
                url=response.url.encode('utf-8')))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))

    try:
        ret = {}
        head = {}
        # Detail sections are tables matched by their Thai headings.
        row = response.xpath(
            './/table[@width="610px"]/tr/td[@valign="top"]'
            '/table[@style=" word-break: normal; word-wrap: break-word; "]')
        topic = [
            i.xpath('./tr/td/span/text()').extract_first() for i in row
        ]
        head['desc'] = u'\u0e23\u0e32\u0e22\u0e25\u0e30\u0e40\u0e2d\u0e35\u0e22\u0e14\u0e02\u0e2d\u0e07\u0e07\u0e32\u0e19'  # "job details"
        head['loc'] = u'\u0e2a\u0e16\u0e32\u0e19\u0e17\u0e35\u0e48\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e07\u0e32\u0e19'  # "work location"
        head['amnt'] = u'\u0e2d\u0e31\u0e15\u0e23\u0e32'  # "number of openings"
        head['qual'] = u'\u0e04\u0e38\u0e13\u0e2a\u0e21\u0e1a\u0e31\u0e15\u0e34\u0e1c\u0e39\u0e49\u0e2a\u0e21\u0e31\u0e04\u0e23'  # "applicant qualifications"
        ret['pos'] = response.xpath(
            '//span[@class="head5 blue"]/text()').extract_first().encode(
                'utf8').split(' : ')[-1]
        ret['company'] = response.xpath(
            './/a[@class="searchjob"]/span/text()').extract_first()
        ret['pdate'] = response.xpath(
            './/table/tr/td[@align="right"]/span/text()').extract_first()
        ret['sal'] = ''
        ret['benef'] = ''
        ret['desc'] = ''
        ret['loc'] = ''
        try:
            # The salary cell sits in the third section table.
            ret['sal'] = row[2].xpath('./tr')[1].xpath(
                './td/span/text()')[1].extract()
        except Exception:
            pass
        try:
            ret['benef'] = remove_tags(response.xpath(
                './/table[@style="margin-top:10px;"]/tr/td'
                '/span[@class="head1 blue"]').extract_first())
        except TypeError:
            pass
        for key in head.keys():
            try:
                idx = topic.index(head[key])
            except ValueError:
                continue
            ret[key] = '|'.join([
                i for i in [
                    remove_tags(i)
                    for i in row[idx].xpath('./tr')[1:].extract()
                ] if i
            ])
        for key in ret.keys():
            if ret[key]:
                try:
                    ret[key] = ' '.join(
                        ret[key].strip().split()).encode('utf-8')
                except UnicodeDecodeError:
                    ret[key] = ' '.join(ret[key].strip().split())
        # 2560 BE == 2017 CE on this site's Buddhist-era dates.
        if ret['pdate'].split()[-1] == "2560":
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1  # so later callbacks short-circuit (assumed intent)
            raise CloseSpider("2017 reached")
        _hash = hash_dn(ret['desc'], ret['company'])
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp='',
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        # Drop empty fields before yielding the item.
        for key in ret.keys():
            if not ret[key]:
                del ret[key]
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")
    if not response.body:
        self.error_count += 1
        if self.error_count >= self.error_threshold:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url.encode('utf-8')))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url.encode('utf-8'), proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info('[ JobPageRetry ] {url}'.format(
                url=response.url.encode('utf-8')))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))

    try:
        ret = {}
        ret['company'] = response.xpath(
            './/h1[@itemprop="hiringOrganization"]/a/span/text()'
        ).extract_first()
        ret['pos'] = response.xpath(
            './/div[@class="job-detail-top col-xs-12"]/h2/a/text()'
        ).extract_first()
        # The summary block repeats one selector; grab its spans once.
        spans = response.xpath(
            './/div[@class="job-detail border-b col-xs-12"]'
            '/div[@class="col-xs-12"]/span').extract()
        ret['etype'] = self.clean_tag(spans[0])
        ret['loc'] = self.clean_tag(spans[1])
        ret['sal'] = self.clean_tag(spans[2])
        ret['hour'] = self.clean_tag(spans[4])
        ret['desc'] = '|'.join([
            i.strip() for i in response.xpath(
                './/div[@itemprop="responsibilities"]/text()').extract()
        ])
        ret['qual'] = '|'.join([
            i for i in [
                self.clean_tag(i).strip() for i in response.xpath(
                    './/div[@itemprop="skills"]').extract_first().split('\n')
            ] if i
        ])
        ret['benef'] = '|'.join([
            i for i in [
                self.clean_tag(i).strip() for i in response.xpath(
                    './/div[@itemprop="incentives"]').extract_first().replace(
                        '<li>', '\n').split('\n')
            ] if i
        ])
        ret['pdate'] = self.convert_pdate(response.xpath(
            './/div[@itemprop="datePosted"]/text()').extract_first())
        if ret['pdate'].split()[0].split('-')[0] == "2017":
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1
            raise CloseSpider("2017 reached")
        for key in ret.keys():
            if ret[key]:
                ret[key] = ret[key].strip().encode('utf-8')
        _hash = hash_dn(ret['desc'], ret['company'])
        # Log the result to MySQL.
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp='',
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
def parse_detail(self, response):
    if not response.body:
        self.error_count += 1
        if self.error_count >= 5:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.headers,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url, proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info(
                '[ JobPageRetry ] {url}'.format(url=response.url))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))

    try:
        def convert_utf8(data):
            # Recursively encode every unicode string in a JSON-like
            # structure to UTF-8 bytes (Python 2 only; a Python 3 version is
            # sketched after this method).
            if isinstance(data, basestring):
                return data.encode('utf-8')
            elif isinstance(data, collections.Mapping):
                return dict(map(convert_utf8, data.iteritems()))
            elif isinstance(data, collections.Iterable):
                return type(data)(map(convert_utf8, data))
            else:
                return data

        ret = {}
        # The page body is a JSON API payload, not HTML.
        js = json.loads(remove_tags(response.xpath('/*').extract_first()))
        js = convert_utf8(js)
        jobid = js['data']['attributes']['jobid']
        companyid = js['data']['relationships']['company']['data']['id']
        cdttm = self.cdttm[(companyid, str(jobid))]
        if datetime.strptime(cdttm, '%Y-%m-%d %H:%M:%S').year == 2017:
            self.logger.info("[ JobEndReached ] 2017 reached")
            raise CloseSpider("2017 reached")
        ret['pdate'] = cdttm.encode('utf-8')
        attrs = js['data']['attributes']
        if 'position' in attrs and attrs['position'] is not None:
            ret['pos'] = attrs['position']
        if 'position_th' in attrs and attrs['position_th'] is not None:
            ret['posth'] = attrs['position_th']
        if 'qualifications' in attrs and attrs['qualifications'] is not None:
            ret['qual'] = list(attrs['qualifications'])
        if 'description' in attrs and attrs['description'] is not None:
            ret['desc'] = attrs['description']
        if 'responsibilities' in attrs and attrs['responsibilities'] is not None:
            ret['resp'] = list(attrs['responsibilities'])
        ret['skill_req'] = [x['attributes'] for x in js['included']
                            if x['type'] == 'skillsRequired']
        ret['skill_pref'] = [x['attributes'] for x in js['included']
                             if x['type'] == 'skillsPreferred']
        ret['exp_pref'] = [x['attributes'] for x in js['included']
                           if x['type'] == 'experiencePreferred']
        ret['exp_req'] = [x['attributes'] for x in js['included']
                          if x['type'] == 'experienceRequired']
        ret['edu'] = [x['attributes'] for x in js['included']
                      if x['type'] == 'educations']
        ret['company'] = [x['attributes']['name'] for x in js['included']
                          if x['type'] == "company"][0]
        location = []
        try:
            province = [x for x in js['included']
                        if x['type'] == 'province'][0]['attributes']['name']
            location.append(province)
            ret['province'] = province
        except Exception:
            pass
        try:
            district = [x for x in js['included']
                        if x['type'] == 'district'][0]['attributes']['name']
            location.append(district)
            ret['district'] = district
        except Exception:
            pass
        ret['location'] = ','.join(location)
        # Fall back to something hashable when the posting has no description.
        if ('desc' not in ret) and ('resp' not in ret):
            ret['desc'] = ret['company'] + str(ret['skill_req'])
        elif ('desc' not in ret) and ('resp' in ret):
            ret['desc'] = ret['resp']
        # for key in ret.keys():
        #     val = ret[key]
        #     if val and isinstance(val, list):
        #         ret[key] = unicode(','.join(ret[key]), encoding='utf-8').strip().encode('utf-8')
        #     else:
        #         ret[key] = unicode(ret[key], encoding='utf-8').strip().encode('utf-8')
        _hash = hash_dn(ret['desc'], ret['company'])
        # Log the result to MySQL.
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position='%s' % ret['pos'],
                employer='%s' % ret['company'],
                exp='%s' % ret['exp_req'],
                salary='',
                location=ret['location'],  # already a joined string
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.headers,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.headers,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
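# convert_utf8 above is Python 2 only (basestring, dict.iteritems,
# collections.Mapping). A sketch of the same recursive walk for Python 3,
# where str is already text, so the useful direction inverts: decode any
# stray bytes instead of encoding text. Hypothetical helper, not in the repo.
from collections.abc import Mapping


def convert_to_text(data):
    """Recursively decode bytes to str inside dicts, lists, and tuples."""
    if isinstance(data, bytes):
        return data.decode('utf-8')
    if isinstance(data, Mapping):
        return {convert_to_text(k): convert_to_text(v)
                for k, v in data.items()}
    if isinstance(data, (list, tuple)):
        return type(data)(convert_to_text(x) for x in data)
    return data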
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")
    if not response.body:
        self.error_count += 1
        if self.error_count >= self.error_threshold:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url.encode('utf-8')))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url.encode('utf-8'), proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info('[ JobPageRetry ] {url}'.format(
                url=response.url.encode('utf-8')))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))

    try:
        ret = {}
        head = {}
        # Section rows and their <b><u> headings, matched by Thai title.
        row = response.xpath(
            '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/p'
            '|//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/ul'
        )[1:]
        topic = response.xpath(
            '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]'
            '//b/u/text()').extract()
        head['amnt'] = u'\u0e2d\u0e31\u0e15\u0e23\u0e32'  # "number of openings"
        head['sal'] = u'\u0e40\u0e07\u0e34\u0e19\u0e40\u0e14\u0e37\u0e2d\u0e19'  # "salary"
        head['benef'] = u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e34\u0e01\u0e32\u0e23'  # "benefits"
        head['req'] = u'\u0e04\u0e38\u0e13\u0e2a\u0e21\u0e1a\u0e31\u0e15\u0e34\u0e1c\u0e39\u0e49\u0e2a\u0e21\u0e31\u0e04\u0e23'  # "applicant qualifications"
        head['loc_det'] = u'\u0e2a\u0e16\u0e32\u0e19\u0e17\u0e35\u0e48\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e07\u0e32\u0e19'  # "work location"
        head['loc'] = u'\u0e08\u0e31\u0e07\u0e2b\u0e27\u0e31\u0e14'  # "province"
        ret['pos'], ret['desc'] = [
            self.clean_tag(x) for x in response.xpath(
                '//div[@class="w3-theme-l4"]/div').extract()
        ]
        ret['pdate'] = self.cdate[response.url]
        ret['company'] = self.comnm[response.url]
        del self.cdate[response.url]
        del self.comnm[response.url]
        ret['loc'] = ''
        ret['sal'] = ''
        for key in head.keys():
            try:
                idx = topic.index(head[key])
            except ValueError:
                continue
            ret[key] = '|'.join([
                i for i in [
                    remove_tags(i)
                    for i in row[idx].xpath('./text()|./li').extract()
                ] if i
            ])
        # 2560 BE == 2017 CE on this site's Buddhist-era dates.
        if ret['pdate'].split()[-1] == "2560":
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1
            raise CloseSpider("2017 reached")
        for key in ret.keys():
            if ret[key]:
                ret[key] = ' '.join(
                    ret[key].strip().split()).encode('utf-8')
        _hash = hash_dn(ret['desc'], ret['company'])
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp='',
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        # Drop empty fields before yielding the item.
        for key in ret.keys():
            if not ret[key]:
                del ret[key]
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")
    if not response.body:
        self.error_count += 1
        if self.error_count >= self.error_threshold:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url, proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info(
                '[ JobPageRetry ] {url}'.format(url=response.url))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))

    try:
        contents = response.xpath(
            '//form[@method="GET"]/table[@id="AutoNumber1"]/tr/td'
            '/font[@size="2"]/text()').extract()
        ret = {}
        # repr() + unquote rebuilds TIS-620 bytes from a unicode string that
        # was decoded one byte per character (see the sketch after this
        # method for the direct equivalent).
        ret['company'] = urllib.unquote(
            repr(self.comnm[response.url])[2:-1].replace('\\x', '%')
        ).decode('tis-620')
        ret['pos'] = response.xpath(
            '//form[@method="GET"]/b/font/text()').extract_first()
        ret['sal'] = contents[1]
        ret['amnt'] = contents[2]
        ret['desc'] = response.xpath(
            '//form[@method="GET"]/table[@id="AutoNumber1"]/tr/td'
            '/font[@style="font-size: 11pt"]/text()').extract_first()
        ret['loc'] = response.xpath(
            '//form[@method="GET"]/table[@id="AutoNumber1"]/tr/td'
            '/font[@size="2"]/span/text()').extract_first()
        ret['pdate'] = urllib.unquote(
            repr(self.cdttm[response.url])[2:-1].replace('\\x', '%')
        ).decode('tis-620')
        del self.cdttm[response.url]
        # 2560 BE == 2017 CE on this site's Buddhist-era dates.
        if ret['pdate'].split()[-1] == "2560":
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1  # so later callbacks short-circuit (assumed intent)
            raise CloseSpider("2017 reached")
        for key in ret.keys():
            if ret[key]:
                ret[key] = ret[key].strip().encode('utf-8')
        _hash = hash_dn(ret['desc'], ret['company'])
        # Log the result to MySQL.
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp='',
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
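# The repr()/unquote dance above rebuilds TIS-620 bytes from a unicode
# string that was decoded one byte per character. Assuming every code point
# really is below U+0100, the same round trip can be written directly; a
# sketch (the helper name is hypothetical):
def redecode_tis620(text):
    """Re-decode a byte-per-character unicode string as TIS-620 (Thai)."""
    return text.encode('latin-1').decode('tis-620')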
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")
    if not response.body:
        self.error_count += 1
        if self.error_count >= self.error_threshold:
            self.logger.error(
                '[ JobPageRequestException ] {url} {form}'.format(
                    url=response.url, form=response.meta['formdata']))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        # Re-issue the POST with the form data carried in meta (a sketch of
        # this request/meta round trip follows this method).
        fd = response.meta['formdata']
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} {form} with proxy {proxy}'.format(
                    url=response.url, form=fd, proxy=proxy))
            yield FormRequest(response.url, formdata=fd,
                              callback=self.parse_detail,
                              meta={'proxy': proxy, 'formdata': fd},
                              dont_filter=True)
            return
        else:
            self.logger.info('[ JobPageRetry ] {url} {form}'.format(
                url=response.url, form=fd))
            yield FormRequest(response.url, formdata=fd,
                              callback=self.parse_detail,
                              meta={'formdata': fd}, dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url} {form}'.format(
            url=response.url, form=response.meta['formdata']))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url} {form}'.format(
            url=response.url, form=response.meta['formdata']))

    try:
        ret = {}
        ret['pos'] = response.xpath(
            '//div[attribute::class="logoCompany"]/div/text()'
        ).extract_first()
        # 'pos2' repeated the same selector in the original; kept as-is.
        ret['pos2'] = response.xpath(
            '//div[attribute::class="logoCompany"]/div/text()'
        ).extract_first()
        ret['company'] = self.clean_tag(response.xpath(
            './/div[contains(@class,"companyName")]').extract_first())
        ret['desc'] = self.clean_tag(response.xpath(
            './/div[@id="jobDescription"]/table/tr/td').extract_first())
        ret['req'] = self.clean_tag(response.xpath(
            './/div[@id="qualification"]').extract_first())
        contents = [
            self.clean_tag(i) for i in response.xpath(
                './/div[@id="basic_require"]/div[not(@class="seperator")]'
            ).extract()
        ]
        ret['etype'] = contents[0].split(':')[-1]
        ret['amnt'] = contents[1].split(':')[-1]
        ret['sex'] = contents[2].split(':')[-1]
        ret['sal'] = contents[3].split(':')[-1]
        ret['exp'] = contents[4].split(':')[-1]
        ret['loc'] = contents[5].split(':')[-1]
        ret['edu'] = '|'.join(contents[6].split(':')[1:])
        # "@clas" in the original selector looked like a typo for the
        # standard "class" attribute (assumed fix).
        ret['pdate'] = response.xpath(
            './/div[@class="dateAndShare"]/p/text()').extract_first()
        for key in ret.keys():
            if ret[key]:
                ret[key] = ret[key].strip().encode('utf-8')
        # 2560 BE == 2017 CE on this site's Buddhist-era dates.
        if ret['pdate'].split('/')[-1] == "2560":
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1
            raise CloseSpider("2017 reached")
        _hash = hash_dn(ret['desc'], ret['company'])
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp=ret['exp'],
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {form} {html} {e}'.format(
                url=response.url.encode('utf-8'),
                form=response.meta['formdata'],
                html=html_path,
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
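# Retrying the POST above only works because the original form data rides
# along in meta, so the callback can re-issue the same request. A minimal
# sketch of that round trip (the builder name is hypothetical):
from scrapy.http import FormRequest


def build_detail_request(url, formdata, callback, proxy=None):
    """Build a POST whose callback can rebuild it from response.meta."""
    meta = {'formdata': formdata}
    if proxy is not None:
        meta['proxy'] = proxy
    return FormRequest(url, formdata=formdata, callback=callback,
                       meta=meta, dont_filter=True)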
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")
    if not response.body:
        self.error_count += 1
        if self.error_count >= 5:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url))
            yield None  # give up on this page after repeated empty responses
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url, proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info(
                '[ JobPageRetry ] {url}'.format(url=response.url))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(url=response.url))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url))

    try:
        ret = {}
        ret['pos'] = response.xpath(
            '//th[@class="qualification_positionname align_left"]/a/text()'
        ).extract_first()
        ret['company'] = response.xpath(
            '//table[@class="table_qualification"]/tr[@class="header"]'
            '/th[@class="corner"]/div/div/div/text()').extract_first()
        ret['pdate'] = response.xpath(
            '//th[@class="qualification_postdate align_right"]/text()'
        ).extract_first()
        #contents = response.xpath('//table[@class="table_qualification"]/tr[not(@class)]/td[@class="line_left line_right"]')[1].xpath('./ul')
        #ret['desc'] = '|'.join(contents[0].xpath('./li//text()').extract())
        #ret['req'] = '|'.join(contents[1].xpath('./li//text()').extract())
        ret['desc'] = self.clean_tag(response.xpath(
            '//table[@class="table_qualification"]/tr[not(@class)]'
            '/td[@class="line_left line_right"]')[1].extract())
        ret['loc'] = self.location[response.url]
        del self.location[response.url]
        for key in ret.keys():
            if ret[key]:
                try:
                    ret[key] = ret[key].strip().encode('utf-8')
                except Exception:
                    ret[key] = ret[key].strip()
        if ret['pdate'].split()[-1] == "17":  # two-digit Gregorian year
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1  # so later callbacks short-circuit (assumed intent)
            raise CloseSpider("2017 reached")
        _hash = hash_dn(ret['desc'], ret['company'])
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,
                position=ret['pos'],
                employer=ret['company'],
                exp='',
                salary='',
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
def parse_detail(self, response):
    if self.killed:
        raise CloseSpider("Spider already died.")
    if not response.body:
        self.error_count += 1
        if self.error_count >= 5:
            self.logger.error('[ JobPageRequestException ] {url}'.format(
                url=response.url.encode('utf-8')))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path='',  # nothing was archived for an empty response
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message="Empty request's response")
            yield None
            return
        if self.use_proxy:
            proxy = choice(self.proxies)
            self.logger.info(
                '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                    url=response.url.encode('utf-8'), proxy=proxy))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 meta={'proxy': proxy}, dont_filter=True)
            return
        else:
            self.logger.info('[ JobPageRetry ] {url}'.format(
                url=response.url.encode('utf-8')))
            yield scrapy.Request(response.url, callback=self.parse_detail,
                                 dont_filter=True)
            return
    self.error_count = 0

    html_path = ''  # so the error handlers below never see an unbound name
    try:
        html_path = self.html_path.format(
            dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(html_path, 'w') as f:
            f.write(response.text.encode('utf-8'))
        self.logger.info('[ HTMLArchived ] {url}'.format(
            url=response.url.encode('utf-8')))
    except Exception as e:
        self.logger.error('[ HTMLArchiveException ] {url}'.format(
            url=response.url.encode('utf-8')))

    try:
        ret = {}
        ret['company'] = response.xpath(
            './/div[@id="content-frame"]/div[@id="content-frame-2col-1"]'
            '/h3/text()').extract_first()
        # All detail fields live in one table; grab its rows once instead of
        # repeating the long selector per field.
        rows = response.xpath(
            './/div[@id="content-frame"]/div[@id="content-frame-2col-1"]/div'
        )[1].xpath('./table/tr')
        ret['pos'] = rows[0].xpath('./td/strong/font/text()').extract_first()
        ret['amnt'] = rows[1].xpath('./td/text()')[1].extract()
        ret['sal'] = rows[2].xpath('./td/text()')[1].extract()
        ret['desc'] = '|'.join(
            [i.strip() for i in rows[3].xpath('./td/text()').extract()][1:])
        ret['qual'] = '|'.join(
            [i.strip() for i in rows[4].xpath('./td/text()').extract()][1:])
        ret['benef'] = '|'.join(
            [i.strip() for i in rows[5].xpath('./td/text()').extract()][1:])
        ret['loc'] = '|'.join(
            [i.strip() for i in rows[7].xpath('./td/text()').extract()][1:])
        ret['loc_det'] = '|'.join(
            [i.strip() for i in rows[6].xpath('./td/text()').extract()][1:])
        ret['pdate'] = rows[10].xpath('./td/font/text()').extract_first()
        for key in ret.keys():
            if ret[key]:
                ret[key] = ret[key].strip().encode('utf-8')
        if ret['pdate'].split()[0].split('-')[0] == "2017":
            self.logger.info("[ JobEndReached ] 2017 reached")
            self.killed = 1  # so later callbacks short-circuit (assumed intent)
            raise CloseSpider("2017 reached")
        _hash = hash_dn(ret['desc'], ret['company'])
        try:
            self.sqllogger.log_crawled_page(
                hash_code=_hash,  # was recomputed inline in the original
                position=ret['pos'],
                employer=ret['company'],
                exp='',
                salary=ret['sal'],
                location=ret['loc'],
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                post_time=ret['pdate'],
                job_status='SUCCESS',
                error_message='')
            self.logger.info('[ RDSLogged ] {url}'.format(
                url=response.url.encode('utf-8')))
        except exc.IntegrityError as e:
            # 1062 = duplicate key: the record was crawled before.
            if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                self.logger.info(
                    "[ JobEndReached ] crawled record reached exceeding threshold")
                self.killed = 1
                raise CloseSpider("Crawled record reached")
            elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                self.repeat_count += 1
                self.logger.info(
                    "[ JobRepeat ] crawled record found within threshold #%d"
                    % self.repeat_count)
                yield None
                return
            else:
                raise e
        self.repeat_count = 0
        yield ret
    except CloseSpider as e:
        raise CloseSpider(e.message)
    except Exception as e:
        self.logger.error(
            '[ JobDetailException ] {url} {html_path} {e}'.format(
                url=response.url.encode('utf-8'),
                html_path=html_path.encode('utf-8'),
                e=e))
        self.sqllogger.log_error_page(
            hash_code=hash_dn(response.url.encode('utf-8'),
                              datetime.now().strftime('%Y%m%d%H%M%S')),
            web_id=self.web_id,
            url=response.url.encode('utf-8'),
            meta=response.meta,
            html_path=html_path,
            crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            job_status='FAILED',
            error_message=e)
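# The "2017 reached" cutoffs above compare against whatever year string each
# site prints: Gregorian ("2017", or the two-digit "17") or Thai Buddhist
# era ("2560"), which runs 543 years ahead of the Common Era
# (2560 BE == 2017 CE). A sketch of one shared normalizer these checks could
# use (the helper name is hypothetical):
def to_gregorian_year(year_text):
    """Map a year token from a posting date to a Gregorian year int."""
    year = int(year_text)
    if year < 100:    # two-digit Gregorian year, e.g. "17" -> 2017
        year += 2000
    if year >= 2400:  # Buddhist era, e.g. 2560 -> 2017
        year -= 543
    return year


# Every variant's cutoff then reduces to the same test:
assert to_gregorian_year("2560") == to_gregorian_year("2017") == 2017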