def parse_review(self, response):
    hxs = Selector(response)
    asin = response.meta['asin']
    title = FmtSQLCharater(first_item(hxs.xpath('//title/text()').extract()))
    title = title.replace(u'Amazon.com: Customer Reviews: ', '')
    rlist = hxs.xpath("//div[@id='cm_cr-review_list']/div[@class='a-section review']")
    for div in rlist:
        r = Review()
        r['product_id'] = asin
        r['product_name'] = title
        r['review_id'] = first_item(div.xpath('@id').extract())
        votes = FmtSQLCharater(first_item(div.xpath('div[1]/span/text()').extract()))
        match = re.search(u'(.+) people found this helpful', votes, re.I)
        if match:
            # the "N people found this helpful" text exposes a single count,
            # so use it for both totals
            r['total_feedback_num'] = match.group(1)
            r['total_helpful_num'] = match.group(1)
        #
        r['full_star'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
        r['title'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[2]/text()").extract()))
        r['cust_name'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
        r['creation_date'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[4]/text()").extract()))
        #r['creation_date'] = r['creation_date'].replace(u'于 ', '').replace(u'年', '/').replace(u'月', '/').replace(u'日', '/')
        r['body'] = first_item(div.xpath("div[5]/span").extract())
        yield r
    # next page
    if len(rlist) == 10:
        page = response.meta['page'] + 1
        log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
        yield Request(
            url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', str(page)),
            callback=self.parse_review,
            headers=self.headers,
            meta={'page': page, 'asin': asin}
        )
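# Hedged sketch: first_item and FmtSQLCharater are called throughout these parsers
# but are not defined in this file. The definitions below are assumptions about
# their behavior (take the first extracted value or '', and escape text before it
# is embedded in a SQL statement), not the project's actual implementations.
def first_item(values):
    """Return the first element of an extracted list, or '' when the list is empty."""
    return values[0] if values else ''


def FmtSQLCharater(text):
    """Escape single quotes so the value can sit inside a SQL string literal (assumed)."""
    return text.replace("'", "''") if text else ''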
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//ul[@class="article-list"]/li/article/div[@class="entry-content"]/header/h4/a')
        # next page
        if positions and len(positions) > 0:
            page = response.meta['page'] + 1
            yield self._list_request_by_pg(page)
        #
        for item in positions:
            link_url = first_item(item.xpath('@href').extract())
            yield Request(
                url=link_url,
                meta={'sector': response.meta['sector'], 'timeout': 5000},
                callback=self.parse_info,
                dont_filter=True,
                #errback=self._requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
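# Hedged sketch of the _list_request_by_pg pagination helper that the list parsers
# above rely on. The <?page?> placeholder in self.list_url, the self.sector attribute,
# and the meta layout are assumptions modeled on the other URL templates in this file,
# not the project's actual helper; different spiders call it with different arguments.
from scrapy.http import Request


def _list_request_by_pg(self, page):
    """Build the request for the next page of the job list (sketch)."""
    return Request(
        url=self.list_url.replace('<?page?>', str(page)),  # assumed URL template
        callback=self.parse_list,
        headers=self.headers,
        meta={'page': page, 'sector': self.sector},  # assumed meta keys
        dont_filter=True,
    )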
def parse_comment(self, response):
    data = response.body
    shop_id = response.meta['shopid']
    hxs = Selector(None, data)
    pj = hxs.xpath('//div[@class="pjbox"]/ul[@class="pj_ul"]')
    for item in pj:
        c = WubaComment()
        c['shop_id'] = shop_id
        c['user_name'] = first_item(item.xpath('li/span[@class="dp_t"]/text()').extract())
        c['c_time'] = first_item(item.xpath('li/label/i/text()').extract())
        c['content'] = first_item(item.xpath('li[2]/text()').extract())
        c['content'] = c['content'].strip(' ')
        addto = first_item(item.xpath('li/p[@class="addto"]/text()').extract())
        c['content'] += addto
        yield c
def parse_list(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    #
    """
    root = response.meta['root']
    leaf = response.meta['leaf']
    age = response.meta['age']
    star = response.meta['star']
    """
    page = response.meta['page']
    #
    hxs = Selector(None, data)
    #
    plist_a = hxs.xpath("//div[@id='resultsCol']/div[@id='centerMinus']/div[@id='atfResults']/ul[@id='s-results-list-atf']/li")
    plist_b = hxs.xpath("//div[@id='btfResults']/ul/li")
    plist = plist_a + plist_b
    #log.msg(u'类别[图书->少儿->%s->%s->%s->%s]页码[%d]总数=%d,开始请求详情...' % (root['name'], leaf['name'], age['name'], star['name'], page, len(plist)))
    #log.msg(u'Start Request:%s' % plist)
    """
    if len(plist) == 0:
        if data.find(u'verify'):
            log.msg(u'verify ban')
    """
    for item in plist:
        asin = first_item(item.xpath('@data-asin').extract())
        log.msg('Request ASIN Detail Page:' + str(asin))
        #
        """
        c = Category()
        c['product_id'] = asin
        c['category_path'] = 'n:658390051,n:!658391051,n:658409051,n:%d,n:%d,p_72:%d,p_n_age_range:%d' % (root['id'], leaf['id'], star['id'], age['id'])
        c['path_name'] = ' 图书 : 少儿 : %s : %s : %s : %s' % (root['name'], leaf['name'], star['name'], age['name'])
        yield c
        """
        # request the detail page
        detailUrl = self.info_url.replace('<?asin?>', asin)
        log.msg('DetailUrl:' + detailUrl)
        yield Request(
            url=detailUrl,
            callback=self.parse_info,
            headers=self.headers,
            meta={'asin': asin, 'proxy': 'http://192.168.1.130:8888'},
            #meta={'root': root, 'leaf': leaf, 'age': age, 'star': star, 'asin': asin}
        )
        """
        # request the reviews
        yield Request(
            url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', '1'),
            callback=self.parse_review,
            headers=self.headers,
            meta={'page': 1, 'asin': asin}
        )
        """
def parse_info_tel(self, response):
    data = response.body
    hxs = Selector(None, data)
    telnum = first_item(hxs.xpath("//ul[@class='contact_area']/li/span[@class='phone']/text()").extract())
    telnum = telnum.replace('\r\n', '').replace(' ', '')
    wb = response.meta['wb']
    wb['telnum'] = telnum
    yield wb
def parse_desc(self, response):
    data = response.body
    data = data.decode('GBK', 'ignore')
    data = data[9:-1]  # strip the JSONP callback wrapper around the JSON payload
    try:
        js = json.loads(data)
    except:
        log.msg(u'Failed to parse description for book [%s]: response is not JSON. url=%s' % (response.meta['b']['product_id'], response.url), level=log.INFO)
        return
    b = response.meta['b']
    hxs = Selector(None, js['content'])
    b['product_features'] = first_item(hxs.xpath("//div[@id='detail-tag-id-1']/div[2]/div[@class='book-detail-content']").extract())
    b['abstract'] = first_item(hxs.xpath("//div[@id='detail-tag-id-2']/div[2]/div[@class='book-detail-content']").extract())
    b['recommendation'] = b['abstract']
    b['content'] = first_item(hxs.xpath("//div[@id='detail-tag-id-3']/div[2]/div[@class='book-detail-content']").extract())
    b['brief_introduction'] = b['content']
    b['authorintro'] = first_item(hxs.xpath("//div[@id='detail-tag-id-4']/div[2]/div[@class='book-detail-content']").extract())
    b['extract'] = first_item(hxs.xpath("//div[@id='detail-tag-id-5']/div[2]/div[@class='book-detail-content']").extract())
    b['catalog'] = first_item(hxs.xpath("//div[@id='detail-tag-id-6']/div[2]/div[@class='book-detail-content']").extract())
    b['more_information'] = first_item(hxs.xpath("//div[@id='detail-tag-id-8']/div[2]/div[@class='book-detail-content']").extract())
    #
    b['abstract'] = FmtSQLCharater(b['abstract'])
    b['catalog'] = FmtSQLCharater(b['catalog'])
    b['recommendation'] = FmtSQLCharater(b['recommendation'])
    b['content'] = FmtSQLCharater(b['content'])
    b['brief_introduction'] = FmtSQLCharater(b['brief_introduction'])
    b['authorintro'] = FmtSQLCharater(b['authorintro'])
    b['extract'] = FmtSQLCharater(b['extract'])
    b['more_information'] = FmtSQLCharater(b['more_information'])
    log.msg(u'Requesting price info for product [%s]...' % b['product_id'])
    yield Request(
        url=self.price_url.replace('<?sku?>', b['product_id']),
        callback=self.parse_price,
        headers=self.headers,
        meta={'b': b}
    )
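# Hedged sketch: the fixed slice data[9:-1] above assumes the description endpoint
# always wraps its JSON in a 9-character callback prefix (something like "showdesc(",
# which is an assumption, not confirmed by this code). A regex-based unwrap is less
# brittle if the callback name ever changes.
import json
import re


def strip_jsonp(payload):
    """Return the parsed JSON body of a JSONP response such as callback({...});"""
    match = re.match(r'^\s*\w+\((.*)\)\s*;?\s*$', payload, re.S)
    return json.loads(match.group(1) if match else payload)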
def parse_list(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    category = response.meta['category']
    age = response.meta['age']
    hxs = Selector(None, data)
    #
    plist = hxs.xpath("//li/div/div[@class='gl-i-wrap j-sku-item']")
    log.msg(u'Category [%s] age [%s] page [%d]: %d items, requesting details...' % (category['name'], age['name'], response.meta['page'], len(plist)))
    for item in plist:
        sku = first_item(item.xpath('@data-sku').extract())
        '''
        # request the detail page
        yield Request(
            url=self.info_url.replace('<?sku?>', sku),
            callback=self.parse_info,
            headers=dict({'Host': 'item.jd.com', 'Upgrade-Insecure-Requests': '1'}, **self.headers),
            meta={'category': category, 'age': age, 'sku': sku}
        )
        '''
        # request the reviews
        yield Request(
            url=self.review_url.replace('<?sku?>', sku).replace('<?page?>', '1'),
            callback=self.parse_review,
            headers=self.headers,
            meta={'page': 1, 'sku': sku}
        )
    # next page
    if len(plist) == 60:
        page = response.meta['page'] + 1
        log.msg(u'Requesting category [%s] age [%s] page %d' % (category['name'], age['name'], page))
        yield Request(
            url=self.list_url.replace('<?page?>', str(page)).replace('<?cat?>', str(category['id'])).replace('<?age?>', age['id']),
            callback=self.parse_list,
            headers=dict({'Host': 'list.jd.com'}, **self.headers),
            meta={'page': page, 'category': category, 'age': age}
        )
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//div[@class="main-job"]/div')
        # job category
        f = response.meta['f']
        # next page
        if positions and len(positions) > 0:
            page = response.meta['page'] + 1
            yield self._list_request_by_pg(f, page)
        #
        for item in positions:
            logourl = first_item(item.xpath('div[@class="mobile-comp-logo"]/div[@class="comp-logo-frame"]/img/@src').extract())
            linkid = first_item(item.xpath('div[@class="job organic"]/@id').extract())
            title = first_item(item.xpath('div[@class="job organic"]/h2[@itemprop="title"]/a/@title').extract())
            link_url = first_item(item.xpath('div[@class="job organic"]/h2[@itemprop="title"]/a/@href').extract())
            link_url = link_url.replace('\r', '').replace('\n', '').replace('\t', '')
            location = first_item(item.xpath('div[@class="job organic"]/div[@class="main-content"]/h3/span[@itemprop="jobLocation"]/span[@class="location"]/span/text()').extract())
            postdate = first_item(item.xpath('div[@class="job organic"]/div[@class="main-content"]/div[@class="time"]/@content').extract())
            yield Request(
                url=link_url,
                meta={'f': f, 'logourl': logourl, 'linkid': linkid, 'title': title, 'location': location, 'postdate': postdate, 'timeout': 10000},
                callback=self.parse_info,
                dont_filter=True,
                errback=self._requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//div[@class="ns_job_wrapper"]/div[@class="ns_lt ns_jobdetails"]/a[@class="ns_joblink"]')
        # next page
        if positions and len(positions) > 0:
            page = response.meta['page'] + 1
            yield self._list_request_by_pg(page)
        #
        for item in positions:
            link_url = first_item(item.xpath('@href').extract())
            yield Request(
                url=link_url,
                meta={'timeout': 5000},
                callback=self.parse_info,
                dont_filter=True,
                #errback=self._requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//h4[@class="position-title "]/a[@class="position-title-link"]')
        # next page
        if positions and len(positions) > 0:
            pg = response.meta['pg'] + 1
            yield self._list_request_by_pg(response.meta['id'], response.meta['name'], pg)
        #
        for item in positions:
            link_url = first_item(item.xpath('@href').extract())
            yield Request(
                url=link_url,
                meta={'name': response.meta['name'], 'timeout': 5000},
                callback=self.parse_info,
                dont_filter=True,
                errback=self.requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # parse the page
        title = first_item(hxs.xpath('//h1[@itemprop="title"]/text()').extract())
        salary = first_item(hxs.xpath('//span[@itemprop="baseSalary"]/text()').extract())
        location = first_item(hxs.xpath('//span[@itemprop="address"]/text()').extract())
        jobtype = first_item(hxs.xpath('//span[@itemprop="employmentType"]/text()').extract())
        companyname = first_item(hxs.xpath('//span[@itemprop="name"]/text()').extract())
        postdate = first_item(hxs.xpath('//span[@itemprop="datePosted"]/text()').extract())
        jobdesc = first_item(hxs.xpath('//section[@class="description"]/div[@class="well"]').extract())
        logourl = first_item(hxs.xpath('//section[@class="brandInfo"]/div[@class="well"]/h2/img/@src').extract())
        if logourl != '':
            logourl = self.create_url(logourl)
        #
        match = re.search(r'<label>Contact:</label>\s*(.+)</li>', data, re.I | re.M)
        contact = match.group(1) if match else ''
        #
        match = re.search(r'<label>Address:</label>\s*(.+)</li>', data, re.I | re.M)
        address = match.group(1) if match else ''
        #
        match = re.search(r'<label>Phone:</label>\s*(.+)</li>', data, re.I | re.M)
        phone = match.group(1) if match else ''
        #
        match = re.search(r'<label>Email:</label>\s*(.+)</li>', data, re.I | re.M)
        email = match.group(1) if match else ''
        #
        match = re.search(r'<label>Website:</label>\s*<a href="(.+)" ', data, re.I | re.M)
        website = match.group(1) if match else ''
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        address = FmtSQLCharater(address)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        match = re.search(r'\.id(.+)\?', response.url, re.I | re.M)
        if match:
            job['LinkID'] = str(int(match.group(1)))
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = response.meta['sector']
        job['JobDesc'] = FmtSQLCharater(jobdesc)
        job['Salary'] = salary
        if jobtype.find('Full time') >= 0:
            job['JobType'] = 1
        else:
            job['JobType'] = 0
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = datetime.strptime(postdate, '%d %b %y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobAddress'] = address
        job['Mobile'] = phone
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['CompanyAddress'] = address
        company['WebSite'] = website
        company['CompanyLogoUrl'] = logourl
        company['AreaName'] = job['CityName']
        company['Mobile'] = phone
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    category = response.meta['category']
    age = response.meta['age']
    data = data.decode('GBK', 'ignore')
    hxs = Selector(None, data)
    #
    p = hxs.xpath("//div[@id='product-intro']")
    preview = p.xpath("div[@id='preview']")
    item = p.xpath("div[@class='m-item-inner']/div[@id='itemInfo']")
    b = Book()
    b['product_id'] = response.meta['sku']
    b['product_name'] = first_item(item.xpath("div[@id='name']/h1/text()").extract())
    b['category_id'] = category['id']
    b['cat_age'] = age['name']
    b['age'] = first_item(item.xpath("div[@id='name']/h1/strong/text()").extract())
    author = item.xpath("div[@id='name']/div[@id='p-author']")
    b['publish_author_name'] = first_item(author.xpath('string(.)').extract())
    b['publish_author_name'] = b['publish_author_name'].replace('\t', '').replace('\r', '').replace('\n', '')
    b['publish_author_name'] = b['publish_author_name'].strip()
    # images
    images = preview.xpath("div[@id='spec-list']/div[@class='spec-items']/ul/li/img/@src").extract()
    b['images'] = '#'.join(images)
    b['images_big'] = b['images'].replace('/n5/', '/n1/')
    #
    detail = hxs.xpath("//div[@id='product-detail-1']/div[@class='p-parameter']/ul[@class='p-parameter-list']")
    b['publish_publisher'] = first_item(detail.xpath(u"li[contains(text(), '出版社')]/@title").extract())
    b['publish_standard_id'] = first_item(detail.xpath(u"li[contains(text(), 'ISBN')]/@title").extract())
    b['publish_version_num'] = first_item(detail.xpath(u"li[contains(text(), '版次')]/@title").extract())
    b['publish_binding'] = first_item(detail.xpath(u"li[contains(text(), '包装')]/@title").extract())
    b['publish_product_size'] = first_item(detail.xpath(u"li[contains(text(), '开本')]/@title").extract())
    b['publish_publish_date'] = first_item(detail.xpath(u"li[contains(text(), '出版时间')]/@title").extract())
    b['publish_paper_quality'] = first_item(detail.xpath(u"li[contains(text(), '用纸')]/@title").extract())
    b['publish_print_copy'] = first_item(detail.xpath(u"li[contains(text(), '印次')]/@title").extract())
    b['publish_number_of_pages'] = first_item(detail.xpath(u"li[contains(text(), '套装数量')]/@title").extract())
    b['publish_subtitle_language'] = first_item(detail.xpath(u"li[contains(text(), '正文语种')]/@title").extract())
    log.msg(u'Requesting description for product [%s]...' % response.meta['sku'])
    yield Request(
        url=self.desc_url.replace('<?sku?>', response.meta['sku']),
        callback=self.parse_desc,
        headers=self.headers,
        meta={'b': b}
    )
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # parse the page
        title = first_item(hxs.xpath('//h1[@class="entry-title mt_title1"]/text()').extract())
        companyname = first_item(hxs.xpath('//span[@class="entry-author"]/text()').extract())
        companyname = companyname.rstrip(' - ')
        # fields this page does not expose; default them so the item assignments below work
        salary = ''
        address = ''
        phone = ''
        website = ''
        logourl = ''
        #
        match = re.search(r'^<td.+>Location</td>\s+<td.+>(.+)</td>$', data, re.I | re.M)
        if match:
            location = match.group(1)
            if location.find(', ') > 0:
                location = location.split(',')[0]
        else:
            location = ''
        #
        match = re.search(r'^<td.+>Posted</td>\s+<td.+>(.+)</td>$', data, re.I | re.M)
        postdate = match.group(1) if match else ''
        #
        jobdesc = first_item(hxs.xpath('//div[@class="user-page mt_content1"]/div[@class="mt_content1"]').extract())
        linkid = first_item(hxs.xpath('//input[@id="uid"]/@value').extract())
        #
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = linkid
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = response.meta['sector']
        job['JobDesc'] = FmtSQLCharater(jobdesc)
        job['Salary'] = salary
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = datetime.strptime(postdate, '%d %b %y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobAddress'] = address
        job['Mobile'] = phone
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['CompanyAddress'] = address
        company['WebSite'] = website
        company['CompanyLogoUrl'] = logourl
        company['AreaName'] = job['CityName']
        company['Mobile'] = phone
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    #
    """
    root = response.meta['root']
    leaf = response.meta['leaf']
    age = response.meta['age']
    star = response.meta['star']
    """
    asin = response.meta['asin']
    #
    hxs = Selector(None, data)
    #
    container = hxs.xpath("//div[@class='a-container']")
    right = container.xpath("div[@id='rightCol']")
    left = container.xpath("div[@id='leftCol']")
    center = container.xpath("div[@id='centerCol']")
    #
    log.msg('Book--')
    b = Book()
    b['product_id'] = asin
    b['product_name'] = FmtSQLCharater(first_item(center.xpath("div[@id='booksTitle']/div/h1[@id='title']/span[@id='productTitle']/text()").extract()))
    b['subname'] = b['product_name']
    b['publish_paper_quality'] = FmtSQLCharater(first_item(center.xpath("div[@id='booksTitle']/div/h1[@id='title']/span[2]/text()").extract()))
    author = center.xpath("div[@id='booksTitle']/div[@id='byline']")
    log.msg('author html:' + str(author.extract()))
    b['publish_author_name'] = FmtSQLCharater(first_item(author.xpath('string(.)').extract()))
    b['publish_author_name'] = b['publish_author_name'].replace('\n', '').replace('\t', '').replace(' ', '')
    b['abstract'] = FmtSQLCharater(first_item(hxs.xpath("//div[@id='bookDescription_feature_div']/noscript/text()").extract()))
    images = left.xpath("div[@id='booksImageBlock_feature_div']/div[@id='imageBlockOuter']/div[@id='imageBlockThumbs']/span/div/img/@src").extract()
    bigImages = map(lambda x: x.replace('_AC_SY60_CR,0,0,60,60_', '_SY498_BO1,204,203,200_').replace('_AC_SX60_CR,0,0,60,60_', '_SX443_BO1,204,203,200_'), images)
    b['images'] = '#'.join(images)
    b['images_big'] = '#'.join(bigImages)
    #
    buybox = right.xpath("div[@id='buybox_feature_div']/div[@id='combinedBuyBox']/form[@id='addToCart']/div[@id='buybox']/div/div[@class='a-box-inner']/div")
    b['sale_price'] = FmtSQLCharater(first_item(buybox.xpath("//*[@id='a-autoid-5-announce']/span[2]/span").extract()))
    b['discount'] = FmtSQLCharater(first_item(buybox.xpath("div[@id='buyNewSection']/div/div[@id='soldByThirdParty']/span[2]/text()").extract()))
    b['original_price'] = FmtSQLCharater(first_item(buybox.xpath("//*[@id='a-autoid-4-announce']/span[2]").extract()))
    b['sale_price'] = b['sale_price'].replace(u'¥', '')
    b['discount'] = b['discount'].replace(' (', '').replace(u'折) ', '')
    b['original_price'] = b['original_price'].replace(u'¥', '')
    # basic product details
    bullets = hxs.xpath("//div[@id='productDetails']/table/tr/td[@class='bucket']/div[@class='content']/ul/li")
    for li in bullets:
        log.msg('Book-base-info')
        if li.xpath(u"b[contains(text(), 'Publisher')]"):
            publisher = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
            # e.g. 未来出版社; 第1版 (2011年11月1日)
            match = re.search(u'(.+); 第(.+)版 \((.+)\)', publisher, re.I | re.M)
            if match:
                b['publish_publisher'] = match.group(1)
                b['publish_version_num'] = match.group(2)
                b['publish_publish_date'] = match.group(3)
        elif li.xpath(u"b[contains(text(), 'Series')]"):
            b['product_name'] = FmtSQLCharater(first_item(li.xpath("a/text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Paperback')]"):
            b['publish_paper_quality'] = u'Paperback'
            b['publish_number_of_pages'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Hardcover')]"):
            b['publish_paper_quality'] = u'Hardcover'
            b['publish_number_of_pages'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), '纸板书')]"):
            b['publish_paper_quality'] = u'纸板书'
            b['publish_number_of_pages'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Age Range')]"):
            b['age'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Language')]"):
            b['publish_subtitle_language'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), '开本')]"):
            b['publish_product_size'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'ISBN-13')]"):
            b['publish_standard_id'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        #elif li.xpath(u"b[contains(text(), '条形码')]"):
        #    b['publish_barcode'] = first_item(li.xpath("text()").extract()).lstrip()
        elif li.xpath(u"b[contains(text(), 'Product Dimensions')]"):
            b['publish_product_size2'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).replace('\n', '').strip())
        elif li.xpath(u"b[contains(text(), 'Shipping Weight')]"):
            b['publish_product_weight'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).replace('\n', '').strip())
        #elif li.xpath(u"b[contains(text(), '品牌')]"):
        #    b['brand'] = first_item(li.xpath("text()").extract()).lstrip()
    # product description embedded in an inline script block
    begin = data.find('var iframeContent =')
    end = data.find('obj.onloadCallback = onloadCallback;')
    if begin != -1 and end != -1:
        desc = data[begin + 21:end - 10]
        desc = urllib2.unquote(desc)
        hxs = Selector(None, desc)
        b['recommendation'] = first_item(hxs.xpath(u"//div[@class='content']/h3[contains(text(), '编辑推荐')]/following-sibling::div[1]/text()").extract())
        b['catalog'] = first_item(hxs.xpath(u"//div[@class='content']/h3[contains(text(), '目录')]/following-sibling::div[1]/text()").extract())
        b['more_information'] = first_item(hxs.xpath(u"//div[@class='content']/h3[contains(text(), '文摘')]/following-sibling::div[1]/text()").extract())
    #
    yield b
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # start parsing
        match = re.search(r"^var foldr = '(.+)';", data, re.I | re.M)
        linkid = match.group(1) if match else ''
        if linkid == '':
            log.msg(u'No job ID found on page, dropping it. %s' % response.url, log.ERROR)
            return
        else:
            log.msg(u'Found job, ID=[%s]' % linkid)
        #
        title = first_item(hxs.xpath('//div[@class="ns_jd_headingbig hl"]/h1/strong/text()').extract())
        title = title.rstrip(' ')
        logourl = first_item(hxs.xpath('//div[@class="ns_jd_comp_logo"]/img/@src').extract())
        companyname = first_item(hxs.xpath('//span[@class="ns_comp_name"]/text()').extract())
        # Locations
        match = re.search(r'<strong>Locations</strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        location = match.group(1) if match else ''
        # Experience
        match = re.search(r'<strong>Experience </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        experience = match.group(1) if match else ''
        # Keywords / Skills
        match = re.search(r'<strong>Keywords / Skills </strong></h2></div>\s+<div class="ns_jobsum_txt"\s.+>(.+)\s</div>', data, re.I | re.M)
        skills = match.group(1) if match else ''
        # Education
        match = re.search(r'<strong>Education </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        education = match.group(1) if match else ''
        # Function
        match = re.search(r'<strong>Function </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            function = match.group(1)
            function = function.replace(' • ', '*')
            function = function.replace('<br />', '')
        else:
            function = ''
        # Role
        match = re.search(r'<strong>Role </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            role = match.group(1)
            role = role.replace(' • ', '*')
            role = role.replace('<br />', '')
        else:
            role = ''
        # Industry
        match = re.search(r'<strong>Industry </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            industry = match.group(1)
            industry = industry.replace(' • ', '')
            industry = industry.replace('<br />', ';')
        else:
            industry = ''
        # Summary
        match = re.search(r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)</div>', data, re.I | re.M)
        if match:
            summary = match.group(1)
        else:
            # the summary may wrap onto a second line
            match = re.search(r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+\s+.+)</div>', data, re.I | re.M)
            summary = match.group(1) if match else ''
        #
        match = re.search(r'<strong>Posted On </strong></h2></div>\s+<div class="ns_jobsum_txt">\s(.+)\s</div>\t', data, re.I | re.M)
        postdate = match.group(1) if match else ''
        #
        desc = hxs.xpath('//div[@class="ns_jobdesc hl"]').extract()
        jobdesc = desc[0] if desc else ''
        comdesc = desc[1] if desc and len(desc) > 1 else ''
        #
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = linkid
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = function
        job['JobDesc'] = FmtSQLCharater(summary + '<p>' + jobdesc)
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date: strip ordinal suffixes before parsing
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = postdate.replace('st', '')
            postdate = postdate.replace('nd', '')
            postdate = postdate.replace('rd', '')
            postdate = postdate.replace('th', '')
            postdate = datetime.strptime(postdate, '%d %b %Y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobComputerSkill'] = skills
        job['Exercise'] = experience
        job['Eduacation'] = education
        job['JobFunction'] = role
        job['Industry'] = industry
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['Industry'] = industry
        company['CompanyLogoUrl'] = logourl
        company['CompanyDesc'] = FmtSQLCharater(comdesc)
        company['AreaName'] = job['CityName']
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
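# Hedged sketch: the chained .replace('st'/'nd'/'rd'/'th', '') calls above strip the
# ordinal suffix from posted-on dates before strptime. A regex anchored to a digit
# avoids ever touching month names; this is an alternative sketch, not the spider's
# actual code, and assumes dates shaped like "3rd Aug 2015".
import re
from datetime import datetime


def parse_posted_on(text):
    """Parse a posted-on string such as '3rd Aug 2015' into a datetime."""
    cleaned = re.sub(r'(?<=\d)(st|nd|rd|th)', '', text.strip())
    return datetime.strptime(cleaned, '%d %b %Y')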
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # parse the page
        # company banner
        company_banner = first_item(hxs.xpath('//img[@id="company_banner"]/@data-original').extract())
        # company logo
        company_logo = first_item(hxs.xpath('//img[@id="company_logo"]/@data-original').extract())
        # position title
        position_title = first_item(hxs.xpath('//h1[@id="position_title"]/text()').extract())
        position_title = FmtSQLCharater(position_title)
        # company name
        company_name = first_item(hxs.xpath('//h2[@id="company_name"]/a/text()').extract())
        if company_name == '':
            company_name = first_item(hxs.xpath('//h2[@id="company_name"]/text()').extract())
        company_name = company_name.replace('\n', '')
        company_name = company_name.replace('\t', '')
        company_name = company_name.strip(' ')
        company_name = FmtSQLCharater(company_name)
        if company_name == '':
            log.msg(u'Company name is empty, url=%s' % response.url)
            return
        # company source URL
        company_url = first_item(hxs.xpath('//h2[@id="company_name"]/a/@href').extract())
        # salary
        salary = first_item(hxs.xpath('//div[@id="salary"]/p/a/text()').extract())
        # experience
        experience = first_item(hxs.xpath('//div[@id="experience"]/p[@id="years_of_experience"]/span[@id="years_of_experience"]/text()').extract())
        experience = experience.replace('\n', '').replace('\t', '')
        # location
        location = first_item(hxs.xpath('//div[@id="location"]/p/span[@id="single_work_location"]/text()').extract())
        location = location.replace('\n', '').replace('\t', '')
        # job description (may include responsibilities and requirements)
        job_desc = first_item(hxs.xpath('//div[@id="job_description"]').extract())
        # company info
        company_registration_number = first_item(hxs.xpath('//span[@id="company_registration_number"]/text()').extract())
        company_industry = first_item(hxs.xpath('//p[@id="company_industry"]/text()').extract())
        company_website = first_item(hxs.xpath('//a[@id="company_website"]/text()').extract())
        company_contact = first_item(hxs.xpath('//p[@id="company_contact"]/text()').extract())
        company_size = first_item(hxs.xpath('//p[@id="company_size"]/text()').extract())
        work_environment_working_hours = first_item(hxs.xpath('//p[@id="work_environment_working_hours"]/text()').extract())
        work_environment_dress_code = first_item(hxs.xpath('//p[@id="work_environment_dress_code"]/text()').extract())
        work_environment_benefits = first_item(hxs.xpath('//p[@id="work_environment_benefits"]/text()').extract())
        work_environment_spoken_language = first_item(hxs.xpath('//p[@id="work_environment_spoken_language"]/text()').extract())
        # gallery
        gallery = ''
        thumbs = hxs.xpath('//ul[@class="gallery-thumb"]/li')
        for item in thumbs:
            gallery += first_item(item.xpath('img/@data-original').extract()) + ';'
        # company overview
        company_overview_all = first_item(hxs.xpath('//div[@id="company_overview_all"]').extract())
        # work location (map coordinates)
        match = re.search(r'&center=(.*?)&', data, re.I | re.M)
        if match:
            gps_location = match.group(1)
            lat = gps_location.split(',')[0]
            lng = gps_location.split(',')[1]
        else:
            lat = '0.0'
            lng = '0.0'
        #
        address = first_item(hxs.xpath('//p[@id="address"]/text()').extract())
        address = FmtSQLCharater(address)
        # Advertised: 23-June-2015
        posting_date = first_item(hxs.xpath('//p[@id="posting_date"]/text()').extract())
        posting_date = posting_date.replace('Advertised:', '')
        posting_date = posting_date.replace(' ', '')
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        # e.g. http://jobs.jobstreet.com/sg/jobs/4712859?fr=J
        job['LinkID'] = response.url[34:-5]
        job['JobTitle'] = position_title
        job['Company'] = company_name
        job['Industry'] = company_industry
        job['JobName'] = response.meta['name']
        job['JobDesc'] = FmtSQLCharater(job_desc)
        job['Salary'] = salary
        job['Exercise'] = experience
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['SSWelfare'] = work_environment_benefits
        job['Number'] = 'one person'
        # normalize the post date
        PostDate = datetime.strptime(posting_date, '%d-%B-%Y')
        job['PublishTime'] = PostDate
        job['RefreshTime'] = PostDate
        if location != '' and len(location.split('-')) > 1:
            job['CityName'] = location.split('-')[0].replace(' ', '')
            job['WorkArea'] = location.split('-')[1].replace(' ', '')
        else:
            job['CityName'] = location
            job['WorkArea'] = job['CityName']
        job['ForeignLanguage'] = work_environment_spoken_language
        job['JobWorkTime'] = work_environment_working_hours
        job['GisLongitude'] = lng
        job['GisLatitude'] = lat
        job['JobAddress'] = address
        job['Mobile'] = company_contact
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = company_name
        company['Industry'] = company_industry
        company['CompanyScale'] = company_size
        company['CompanyAddress'] = address
        company['CompanyUrl'] = company_url
        company['WebSite'] = company_website
        company['CompanyLogoUrl'] = company_logo
        company['AreaName'] = job['CityName']
        company['CompanyDesc'] = FmtSQLCharater(company_overview_all)
        company['Mobile'] = company_contact
        company['GisLongitude'] = lng
        company['GisLatitude'] = lat
        company['OtherInfo'] = company_banner + '#' + gallery
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(None, data)
        # start parsing; most fields were passed along from the list page via meta
        linkid = response.meta['linkid']
        title = response.meta['title']
        logourl = response.meta['logourl']
        location = response.meta['location']
        function = response.meta['f']
        postdate = response.meta['postdate']
        #
        companyname = first_item(hxs.xpath('//div[@class="additional_info"]/span[@class="company"]/a/text()').extract())
        companyname = companyname.strip(' ')
        if companyname == '':
            log.msg(u'This job comes from another site (%s) and cannot be crawled.' % response.url, level=log.ERROR)
            return
        #
        desc = first_item(hxs.xpath('//div[@class="p-description"]').extract())
        # strip the wrapping <div class="p-description"> tags
        if desc.startswith('<div class="p-description">'):
            desc = desc[len('<div class="p-description">'):]
        if desc.endswith('</div>'):
            desc = desc[:-len('</div>')]
        desc = desc.replace('\t', '')
        #
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = linkid
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = function
        job['JobDesc'] = FmtSQLCharater(desc)
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = datetime.strptime(postdate, '%Y-%m-%d')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['CompanyLogoUrl'] = logourl
        company['AreaName'] = job['CityName']
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)