Code example #1
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
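            # Empty response from the server: retry the request; after error_threshold
            # consecutive failures, log the error and skip this page.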
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path='',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url, proxy=proxy))
                yield scrapy.Request(response.url,
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info(
                    '[ JobPageRetry ] {url}'.format(url=response.url))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return
        self.error_count = 0

        try:
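            # Write the raw HTML archive for this page.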
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))

        try:
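            # Parse the job fields from the detail page.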
            ret = {}

            ret['company'] = response.xpath(
                '//div[@class="company-head m-t-10"]/text()').extract_first()
            ret['pos'] = response.xpath(
                '//div[@id="jobinfo"]/div/div[@class="title"]/text()'
            ).extract_first()
            ret['desc'] = self.clean_tag(
                response.xpath(
                    '//div[@class="jobinfo-desc m-t-15"]').extract_first())
            ret['nation'] = response.xpath(
                './/table[@class="qualifi-table"]/tr/td/text()')[0].extract()
            ret['sex'] = response.xpath(
                './/table[@class="qualifi-table"]/tr/td/text()')[2].extract()
            ret['age'] = ' '.join(
                response.xpath('.//table[@class="qualifi-table"]/tr/td/text()')
                [3].extract().strip().split())
            ret['sal'] = ' '.join(
                response.xpath('.//table[@class="qualifi-table"]/tr/td/text()')
                [4].extract().strip().split())
            ret['edu'] = response.xpath(
                './/table[@class="qualifi-table"]/tr/td/text()')[5].extract()
            ret['exp'] = response.xpath(
                './/table[@class="qualifi-table"]/tr/td/text()')[6].extract()
            ret['loc'] = response.xpath(
                '//div[@class="place"]/text()').extract_first().strip()
            ret['amnt'] = response.xpath(
                './/table[@class="qualifi-table"]/tr/td/text()')[7].extract()
            ret['etype'] = response.xpath(
                './/table[@class="qualifi-table"]/tr/td/text()')[8].extract()
            ret['benef'] = self.clean_tag(
                response.xpath(
                    '//div[@class="col-md-9 company-image-box"]/div/div[@class="col-md-12 padding-no m-t-15"]/div[@class="jobinfo-desc"]'
                )[0].extract())
            ret['pdate'] = response.xpath(
                '//div[@class="jobinfo-update"]/text()').extract_first().split(
                    ':')[1].strip()

            if ret['pdate'].split('/')[-1] == "2017":
                self.killed += 1
                self.logger.info("[ JobEndReached ] 2017 reached")
                raise CloseSpider("2017 reached")

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ret[key].strip().encode('utf-8')

            _hash = hash_dn(ret['desc'], ret['company'])

            try:
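                # Log the parsed record to MySQL; a duplicate key (MySQL error 1062)
                # means this record was already crawled.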
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp=ret['exp'],
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
Code example #2
File: jobant.py  Project: tonny62/Text_Classification
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        ### handle the case when response from web server is empty
        # retry requesting, after 5 failures in a row, log error then continue.
        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path='',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url.encode('utf-8'), proxy=proxy))
                yield scrapy.Request(response.url.encode('utf-8'),
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info(
                    '[ JobPageRetry ] {url}'.format(
                        url=response.url.encode('utf-8')))
                yield scrapy.Request(response.url.encode('utf-8'),
                                     callback=self.parse_detail)
                return
        self.error_count = 0
        ###

        ### writing html archive
        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))
        ###

        try:
            ### parsing information
            contents = response.xpath(
                './/div[@class="wrapper-preview-list"]/div[contains(@class,"row tr")]/div[contains(@class,"col-sm")]'
            )
            content_str = [
                self.clean_tag(content.xpath('./div/div')[1].extract())
                for content in contents[:10]
            ]

            pos, company = [
                x.strip() for x in response.xpath(
                    '//h1[@class="title-section c4 xs-mt5"]/text()').
                extract_first().split(',', 1)
            ]

            ret = {}

            ret['company'] = company
            ret['pos'] = pos
            ret['etype'] = content_str[1]
            ret['indus'] = content_str[2]
            ret['amnt'] = content_str[3]
            ret['sal'] = content_str[4]
            ret['exp'] = content_str[5]
            ret['sex'] = content_str[6]
            ret['edu'] = content_str[7]
            ret['loc'] = content_str[8]
            ret['desc'] = '|'.join(
                [x.strip() for x in contents[11].xpath('./text()').extract()])
            ret['pdate'] = self.convert_pdate(
                response.xpath(
                    '//span[@itemprop="datePosted"]/text()').extract_first())

            if ret['pdate'].split()[0].split('-')[0] == "2017":
                self.logger.info("[ JobEndReached ] 2017 reached")
                self.killed = 1
                raise CloseSpider("2017 reached")

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ret[key].strip().encode('utf-8')
            ###

            # create hash for tracking jobs
            _hash = hash_dn(ret['desc'], ret['company'])

            ### Stop spider after encountering crawled record 3 times in a row.
            # to prevent spider stopping just from getting a few old records
            # that may happen because of new job updates
            #if self.sqllogger.check_existing(_hash, self.web_id):
            #    if self.repeat_count >= self.repeat_threshold:
            #        self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold")
            #        self.killed = 1
            #        raise CloseSpider("Crawled record reached")
            #    else:
            #        self.repeat_count += 1
            #        self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count)
            #        yield None
            #        return
            #self.repeat_count = 0
            ###

            ### log result to MySQL
            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp=ret['exp'],
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                ### check encountering old record by catching error that mysql will throw
                # if old record is met. (primary key(hash) is repeating)
                # The error code for such error is 1062
                ### Stop spider after encountering crawled record 3 times IN A ROW.
                # to prevent spider stopping just from getting a few old records
                # that may happen because of new job updates
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
                ###
            self.repeat_count = 0
            ###

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
Code example #3
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path='',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url.encode('utf-8'), proxy=proxy))
                yield scrapy.Request(response.url.encode('utf-8'),
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(
                    url=response.url.encode('utf-8')))
                yield scrapy.Request(response.url.encode('utf-8'),
                                     callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))

        try:

            ret = {}
            head = {}
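            # head maps output field keys to the Thai section headings on the page
            # (roughly: job details, work location, number of positions, applicant qualifications).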

            row = response.xpath(
                './/table[@width="610px"]/tr/td[@valign="top"]/table[@style=" word-break: normal; word-wrap: break-word; "]'
            )
            topic = [
                i.xpath('./tr/td/span/text()').extract_first() for i in row
            ]

            head['desc'] = u'\u0e23\u0e32\u0e22\u0e25\u0e30\u0e40\u0e2d\u0e35\u0e22\u0e14\u0e02\u0e2d\u0e07\u0e07\u0e32\u0e19'
            head['loc'] = u'\u0e2a\u0e16\u0e32\u0e19\u0e17\u0e35\u0e48\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e07\u0e32\u0e19'
            head['amnt'] = u'\u0e2d\u0e31\u0e15\u0e23\u0e32'
            head['qual'] = u'\u0e04\u0e38\u0e13\u0e2a\u0e21\u0e1a\u0e31\u0e15\u0e34\u0e1c\u0e39\u0e49\u0e2a\u0e21\u0e31\u0e04\u0e23'

            ret['pos'] = response.xpath(
                '//span[@class="head5 blue"]/text()').extract_first().encode(
                    'utf8').split(' : ')[-1]
            ret['company'] = response.xpath(
                './/a[@class="searchjob"]/span/text()').extract_first()
            ret['pdate'] = response.xpath(
                './/table/tr/td[@align="right"]/span/text()').extract_first()
            ret['sal'] = ''
            ret['benef'] = ''
            ret['desc'] = ''
            ret['loc'] = ''
            try:
                ret['sal'] = response.xpath(
                    './/table[@width="610px"]/tr/td[@valign="top"]/table[@style=" word-break: normal; word-wrap: break-word; "]'
                )[2].xpath('./tr')[1].xpath('./td/span/text()')[1].extract()
            except:
                pass
            try:
                ret['benef'] = remove_tags(
                    response.xpath(
                        './/table[@style="margin-top:10px;"]/tr/td/span[@class="head1 blue"]'
                    ).extract_first())
            except TypeError:
                pass

            for key in head.keys():
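                # Locate the section whose heading matches this field and join its
                # non-empty rows (tags removed) with '|'.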
                try:
                    idx = topic.index(head[key])
                except ValueError:
                    continue
                ret[key] = '|'.join([
                    i for i in [
                        remove_tags(i)
                        for i in row[idx].xpath('./tr')[1:].extract()
                    ] if i
                ])

            for key in ret.keys():
                if ret[key]:
                    try:
                        ret[key] = ' '.join(
                            ret[key].strip().split()).encode('utf-8')
                    except UnicodeDecodeError:
                        ret[key] = ' '.join(ret[key].strip().split())

            if ret['pdate'].split()[-1] == "2560":
                self.logger.info("[ JobEndReached ] 2017 reached")
                raise CloseSpider("2017 reached")

            _hash = hash_dn(ret['desc'], ret['company'])

            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp='',
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            for key in ret.keys():
                if not ret[key]:
                    del ret[key]

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
Code example #4
File: jobbkk.py  Project: tonny62/Text_Classification
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code    = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id       = self.web_id,
                    url          = response.url.encode('utf-8'),
                    meta         = response.meta,
                    html_path    = '',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status   = 'FAILED',
                    error_message= "Empty request's response"
                )
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info('[ JobPageRetry ] {url} with proxy {proxy}'.format(url=response.url.encode('utf-8'), proxy=proxy))
                yield scrapy.Request(response.url, callback=self.parse_detail , meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(url=response.url.encode('utf-8')))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(url=response.url.encode('utf-8')))

        try:
            ret = {}

            ret['company'] = response.xpath('.//h1[@itemprop="hiringOrganization"]/a/span/text()').extract_first()
            ret['pos']     = response.xpath('.//div[@class="job-detail-top col-xs-12"]/h2/a/text()').extract_first()
            ret['etype']   = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[0])
            ret['loc']     = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[1])
            ret['sal']     = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[2])
            ret['hour']    = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[4])
            ret['desc']    = '|'.join([i.strip() for i in response.xpath('.//div[@itemprop="responsibilities"]/text()').extract()])
            ret['qual']    = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="skills"]').extract_first().split('\n')] if i])
            ret['benef']   = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="incentives"]').extract_first().replace('<li>','\n').split('\n')] if i])
            ret['pdate']   = self.convert_pdate(response.xpath('.//div[@itemprop="datePosted"]/text()').extract_first())

            if ret['pdate'].split()[0].split('-')[0] == "2017":
                self.logger.info("[ JobEndReached ] 2017 reached")
                self.killed = 1
                raise CloseSpider("2017 reached")

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ret[key].strip().encode('utf-8')

            _hash = hash_dn(ret['desc'],ret['company'])

            #log result to MySQL
            try:
                self.sqllogger.log_crawled_page(
                    hash_code    = _hash,
                    position     = ret['pos'],
                    employer     = ret['company'],
                    exp          = '',
                    salary       = ret['sal'],
                    location     = ret['loc'],
                    web_id       = self.web_id,
                    url          = response.url.encode('utf-8'),
                    meta         = response.meta,
                    html_path    = html_path,
                    crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time    = ret['pdate'],
                    job_status   = 'SUCCESS',
                    error_message= ''
                )
                self.logger.info('[ RDSLogged ] {url}'.format(url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold")
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error('[ JobDetailException ] {url} {html_path} {e}'.format(url=response.url.encode('utf-8'),html_path=html_path.encode('utf-8'),e=e))
            self.sqllogger.log_error_page(
                hash_code    = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id       = self.web_id,
                url          = response.url.encode('utf-8'),
                meta         = response.meta,
                html_path    = html_path,
                crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status   = 'FAILED',
                error_message= e
            )
Code example #5
    def parse_detail(self, response):

        if not response.body:
            self.error_count += 1

            if self.error_count == 5:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.headers,
                    html_path='',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url, proxy=proxy))
                yield scrapy.Request(response.url,
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(url=response.url))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))

        try:

            def convert_utf8(data):
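                # Recursively encode every string in the decoded JSON structure
                # to UTF-8 (Python 2 str/unicode, dicts, and other iterables).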
                if isinstance(data, basestring):
                    return data.encode('utf-8')
                elif isinstance(data, collections.Mapping):
                    return dict(map(convert_utf8, data.iteritems()))
                elif isinstance(data, collections.Iterable):
                    return type(data)(map(convert_utf8, data))
                else:
                    return data

            ret = {}
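            # The response body is JSON; strip the surrounding markup and decode it.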
            js = json.loads(remove_tags(response.xpath('/*').extract_first()))
            js = convert_utf8(js)
            jobid = js['data']['attributes']['jobid']
            companyid = js['data']['relationships']['company']['data']['id']
            cdttm = self.cdttm[(companyid, str(jobid))]
            if datetime.strptime(cdttm, '%Y-%m-%d %H:%M:%S').year == 2017:
                self.logger.info("[ JobEndReached ] 2017 reached")
                raise CloseSpider("2017 reached")

            ret['pdate'] = cdttm.encode('utf-8')

            if js['data']['attributes'].has_key('position') and js['data'][
                    'attributes']['position'] is not None:
                ret['pos'] = js['data']['attributes']['position']
            if js['data']['attributes'].has_key('position_th') and js['data'][
                    'attributes']['position_th'] is not None:
                ret['posth'] = js['data']['attributes']['position_th']
            if js['data']['attributes'].has_key('qualifications') and js[
                    'data']['attributes']['qualifications'] is not None:
                ret['qual'] = [
                    x for x in js['data']['attributes']['qualifications']
                ]
            if js['data']['attributes'].has_key('description') and js['data'][
                    'attributes']['description'] is not None:
                ret['desc'] = js['data']['attributes']['description']
            if js['data']['attributes'].has_key('responsibilities') and js[
                    'data']['attributes']['responsibilities'] is not None:
                ret['resp'] = [
                    x for x in js['data']['attributes']['responsibilities']
                ]

            ret['skill_req'] = [
                x['attributes'] for x in js['included']
                if x['type'] == 'skillsRequired'
            ]
            ret['skill_pref'] = [
                x['attributes'] for x in js['included']
                if x['type'] == 'skillsPreferred'
            ]
            ret['exp_pref'] = [
                x['attributes'] for x in js['included']
                if x['type'] == 'experiencePreferred'
            ]
            ret['exp_req'] = [
                x['attributes'] for x in js['included']
                if x['type'] == 'experienceRequired'
            ]
            ret['edu'] = [
                x['attributes'] for x in js['included']
                if x['type'] == 'educations'
            ][:]
            ret['company'] = [
                x['attributes']['name'] for x in js['included']
                if x['type'] == "company"
            ][0]

            location = []
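            # Collect province and district names (when present) into one location string.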
            try:
                province = [
                    x for x in js['included'] if x['type'] == 'province'
                ][0]['attributes']['name']
                location.append(province)
                ret['province'] = province
            except:
                pass
            try:
                district = [
                    x for x in js['included'] if x['type'] == 'district'
                ][0]['attributes']['name']
                location.append(district)
                ret['district'] = district
            except:
                pass

            ret['location'] = ','.join(location)

            if (not 'desc' in ret) and (not 'resp' in ret):
                ret['desc'] = ret['company'] + str(ret['skill_req'])
            elif (not 'desc' in ret) and ('resp' in ret):
                ret['desc'] = ret['resp']

#            for key in ret.keys():
#                val = ret[key]
#                if val and isinstance(val, list):
#                    ret[key] = unicode(','.join(ret[key]),encoding='utf-8').strip().encode('utf-8')
#                else:
#                    ret[key] = unicode(ret[key],encoding='utf-8').strip().encode('utf-8')

            _hash = hash_dn(ret['desc'], ret['company'])

            #log result to MySQL
            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position='%s' % ret['pos'],
                    employer='%s' % ret['company'],
                    exp='%s' % ret['exp_req'],
                    salary='',
                    location=ret['location'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.headers,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.headers,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
Code example #6
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path='',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url.encode('utf-8'), proxy=proxy))
                yield scrapy.Request(response.url,
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(
                    url=response.url.encode('utf-8')))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))

        try:
            ret = {}

            head = {}

            row = response.xpath(
                '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/p|//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/ul'
            )[1:]
            topic = response.xpath(
                '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]//b/u/text()'
            ).extract()
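            # head maps output keys to the Thai headings on the page (roughly: number of
            # positions, salary, benefits, applicant qualifications, work location, province).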
            head['amnt'] = u'\u0e2d\u0e31\u0e15\u0e23\u0e32'
            head['sal'] = u'\u0e40\u0e07\u0e34\u0e19\u0e40\u0e14\u0e37\u0e2d\u0e19'
            head['benef'] = u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e34\u0e01\u0e32\u0e23'
            head['req'] = u'\u0e04\u0e38\u0e13\u0e2a\u0e21\u0e1a\u0e31\u0e15\u0e34\u0e1c\u0e39\u0e49\u0e2a\u0e21\u0e31\u0e04\u0e23'
            head['loc_det'] = u'\u0e2a\u0e16\u0e32\u0e19\u0e17\u0e35\u0e48\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e07\u0e32\u0e19'
            head['loc'] = u'\u0e08\u0e31\u0e07\u0e2b\u0e27\u0e31\u0e14'

            ret['pos'], ret['desc'] = [
                self.clean_tag(x) for x in response.xpath(
                    '//div[@class="w3-theme-l4"]/div').extract()
            ]
            ret['pdate'] = self.cdate[response.url]
            ret['company'] = self.comnm[response.url]
            del self.cdate[response.url]
            del self.comnm[response.url]
            ret['loc'] = ''
            ret['sal'] = ''

            for key in head.keys():
                try:
                    idx = topic.index(head[key])
                except ValueError:
                    continue
                ret[key] = '|'.join([
                    i for i in [
                        remove_tags(i)
                        for i in row[idx].xpath('./text()|./li').extract()
                    ] if i
                ])

            if ret['pdate'].split()[-1] == "2560":
                self.killed += 1
                raise CloseSpider("2017 reached")

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ' '.join(
                        ret[key].strip().split()).encode('utf-8')

            _hash = hash_dn(ret['desc'], ret['company'])

            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp='',
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            for key in ret.keys():
                if not ret[key]:
                    del ret[key]

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
Code example #7
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path='',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url, proxy=proxy))
                yield scrapy.Request(response.url.encode('utf-8'),
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info(
                    '[ JobPageRetry ] {url}'.format(url=response.url))
                yield scrapy.Request(response.url.encode('utf-8'),
                                     callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))

        try:
            contents = response.xpath(
                '//form[@method="GET"]/table[@id="AutoNumber1"]/tr/td/font[@size="2"]/text()'
            ).extract()
            ret = {}
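            # The company name and post date were captured as TIS-620 byte strings;
            # rebuild them via percent-escapes and decode back to text.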
            ret['company'] = urllib.unquote(
                repr(self.comnm[response.url])[2:-1].replace(
                    '\\x', '%')).decode('tis-620')
            ret['pos'] = response.xpath(
                '//form[@method="GET"]/b/font/text()').extract_first()
            ret['sal'] = contents[1]
            ret['amnt'] = contents[2]
            ret['desc'] = response.xpath(
                '//form[@method="GET"]/table[@id="AutoNumber1"]/tr/td/font[@style="font-size: 11pt"]/text()'
            ).extract_first()
            ret['loc'] = response.xpath(
                '//form[@method="GET"]/table[@id="AutoNumber1"]/tr/td/font[@size="2"]/span/text()'
            ).extract_first()
            ret['pdate'] = urllib.unquote(
                repr(self.cdttm[response.url])[2:-1].replace(
                    '\\x', '%')).decode('tis-620')
            del self.cdttm[response.url]

            if ret['pdate'].split()[-1] == "2560":
                self.logger.info("[ JobEndReached ] 2017 reached")
                raise CloseSpider("2017 reached")

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ret[key].strip().encode('utf-8')

            _hash = hash_dn(ret['desc'], ret['company'])

            #log result to MySQL
            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp='',
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
Code example #8
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error(
                    '[ JobPageRequestException ] {url} {form}'.format(
                        url=response.url, form=response.meta['formdata']))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path='',  # assumed placeholder: no HTML archived for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} {form} with proxy {proxy}'.format(
                        url=response.url,
                        form=response.meta['formdata'],
                        proxy=proxy))
                yield FormRequest(response.url,
                                  formdata=response.meta['formdata'],
                                  callback=self.parse_detail,
                                  meta={
                                      'proxy': proxy,
                                      'formdata': response.meta['formdata']
                                  })
                return
            else:
                self.logger.info('[ JobPageRetry ] {url} {form}'.format(
                    url=response.url, form=response.meta['formdata']))
                yield FormRequest(response.url,
                                  formdata=response.meta['formdata'],
                                  callback=self.parse_detail,
                                  meta={'formdata': response.meta['formdata']})
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url} {form}'.format(
                    url=response.url, form=response.meta['formdata']))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url} {form}'.format(
                url=response.url, form=response.meta['formdata']))

        try:

            ret = {}

            ret['pos'] = response.xpath(
                '//div[attribute::class="logoCompany"]/div/text()'
            ).extract_first()
            ret['pos2'] = response.xpath(
                '//div[attribute::class="logoCompany"]/div/text()'
            ).extract_first()
            ret['company'] = self.clean_tag(
                response.xpath(
                    './/div[contains(@class,"companyName")]').extract_first())
            ret['desc'] = self.clean_tag(
                response.xpath('.//div[@id="jobDescription"]/table/tr/td').
                extract_first())
            ret['req'] = self.clean_tag(
                response.xpath('.//div[@id="qualification"]').extract_first())

            contents = [
                self.clean_tag(i) for i in response.xpath(
                    './/div[@id="basic_require"]/div[not(@class="seperator")]'
                ).extract()
            ]
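            # contents now holds the "label : value" strings from the basic_require
            # block; the fields below keep only the value part after the colon.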

            ret['etype'] = contents[0].split(':')[-1]
            ret['amnt'] = contents[1].split(':')[-1]
            ret['sex'] = contents[2].split(':')[-1]
            ret['sal'] = contents[3].split(':')[-1]
            ret['exp'] = contents[4].split(':')[-1]
            ret['loc'] = contents[5].split(':')[-1]
            ret['edu'] = '|'.join(contents[6].split(':')[1:])
            ret['pdate'] = response.xpath(
                './/div[@class="dateAndShare"]/p/text()').extract_first()

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ret[key].strip().encode('utf-8')

            if ret['pdate'].split('/')[-1] == "2560":
                self.logger.info("[ JobEndReached ] 2017 reached")
                self.killed = 1
                raise CloseSpider("2017 reached")

            _hash = hash_dn(ret['desc'], ret['company'])

            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp=ret['exp'],
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
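                # MySQL error 1062 is a duplicate-key violation, i.e. this posting was
                # crawled before; once the number of consecutive duplicates exceeds
                # repeat_threshold the crawl has caught up with stored records and stops.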
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {form} {html} {e}'.format(
                    url=response.url.encode('utf-8'),
                    form=response.meta['formdata'],
                    html=html_path,
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
Code example #9
0
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
            self.error_count += 1

            if self.error_count == 5:
                self.logger.error('[ JobPageRequestException ] {url}'.format(url=response.url))
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info('[ JobPageRetry ] {url} with proxy {proxy}'.format(url=response.url, proxy=proxy))
                yield scrapy.Request(response.url, callback=self.parse_detail, meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(url=response.url))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return

        self.error_count = 0

        try:
            html_path = self.html_path.format(dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(url=response.url))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(url=response.url))

        try:
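            # Position, company, and post date are read from <th> cells of the
            # qualification table; the description is the second matching
            # "line_left line_right" cell in the rows without a class attribute.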
            ret = {}
            ret['pos']       = response.xpath('//th[@class="qualification_positionname align_left"]/a/text()').extract_first()
            ret['company']   = response.xpath('//table[@class="table_qualification"]/tr[@class="header"]/th[@class="corner"]/div/div/div/text()').extract_first()
            ret['pdate']     = response.xpath('//th[@class="qualification_postdate align_right"]/text()').extract_first()
            #contents         = response.xpath('//table[@class="table_qualification"]/tr[not(@class)]/td[@class="line_left line_right"]')[1].xpath('./ul')
            #ret['desc']      = '|'.join(contents[0].xpath('./li//text()').extract())
            #ret['req']       = '|'.join(contents[1].xpath('./li//text()').extract())
            ret['desc']      = self.clean_tag(response.xpath('//table[@class="table_qualification"]/tr[not(@class)]/td[@class="line_left line_right"]')[1].extract())
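            # The location was captured on the listing page and stashed in
            # self.location keyed by detail URL; it is consumed and removed here.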
            ret['loc']       = self.location[response.url]
            
            del self.location[response.url]

            for key in ret.keys():
                if ret[key]:
                    try:
                        ret[key] = ret[key].strip().encode('utf-8')
                    except:
                        ret[key] = ret[key].strip()

            # The last token of the post date is a two-digit year; "17" marks the
            # 2017 cut-off.
            if ret['pdate'].split()[-1] == "17":
                self.logger.info("[ JobEndReached ] 2017 reached")
                raise CloseSpider("2017 reached")

            _hash = hash_dn(ret['desc'],ret['company'])

            try:
                self.sqllogger.log_crawled_page(
                    hash_code    = _hash,
                    position     = ret['pos'],
                    employer     = ret['company'],
                    exp          = '',
                    salary       = '',
                    location     = ret['loc'],
                    web_id       = self.web_id,
                    url          = response.url.encode('utf-8'),
                    meta         = response.meta,
                    html_path    = html_path,
                    crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time    = ret['pdate'],
                    job_status   = 'SUCCESS',
                    error_message= ''
                )
                self.logger.info('[ RDSLogged ] {url}'.format(url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold")
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret
            
        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error('[ JobDetailException ] {url} {html_path} {e}'.format(url=response.url.encode('utf-8'),html_path=html_path.encode('utf-8'),e=e))
            self.sqllogger.log_error_page(
                hash_code     = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id        = self.web_id,
                url           = response.url.encode('utf-8'),
                meta          = response.meta,
                html_path     = html_path,
                crawl_time    = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status    = 'FAILED',
                error_message = e
            )
Code example #10
0
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        if not response.body:
            self.error_count += 1

            if self.error_count == 5:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path='',  # nothing has been archived yet for an empty response
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url.encode('utf-8'), proxy=proxy))
                yield scrapy.Request(response.url,
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(
                    url=response.url.encode('utf-8')))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))

        try:
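            # Apart from the company name (an <h3>), every field sits in a fixed row
            # of the table inside div#content-frame-2col-1, so rows are addressed by
            # positional index.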
            ret = {}

            ret['company'] = response.xpath(
                './/div[@id="content-frame"]/div[@id="content-frame-2col-1"]/h3/text()'
            ).extract_first()
            rows = response.xpath(
                './/div[@id="content-frame"]/div[@id="content-frame-2col-1"]/div'
            )[1].xpath('./table/tr')

            ret['pos'] = rows[0].xpath('./td/strong/font/text()').extract_first()
            ret['amnt'] = rows[1].xpath('./td/text()')[1].extract()
            ret['sal'] = rows[2].xpath('./td/text()')[1].extract()
            ret['desc'] = '|'.join(
                [i.strip() for i in rows[3].xpath('./td/text()').extract()][1:])
            ret['qual'] = '|'.join(
                [i.strip() for i in rows[4].xpath('./td/text()').extract()][1:])
            ret['benef'] = '|'.join(
                [i.strip() for i in rows[5].xpath('./td/text()').extract()][1:])
            ret['loc'] = '|'.join(
                [i.strip() for i in rows[7].xpath('./td/text()').extract()][1:])
            ret['loc_det'] = '|'.join(
                [i.strip() for i in rows[6].xpath('./td/text()').extract()][1:])
            ret['pdate'] = rows[10].xpath('./td/font/text()').extract_first()

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ret[key].strip().encode('utf-8')

            if ret['pdate'].split()[0].split('-')[0] == "2017":
                self.logger.info("[ JobEndReached ] 2017 reached")
                raise CloseSpider("2017 reached")

            _hash = hash_dn(ret['desc'], ret['company'])

            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp='',
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)