def parse_review(self, response):
    hxs = Selector(response)
    asin = response.meta['asin']
    title = FmtSQLCharater(first_item(hxs.xpath('//title/text()').extract()))
    title = title.replace(u'Amazon.com: Customer Reviews: ', '')
    rlist = hxs.xpath("//div[@id='cm_cr-review_list']/div[@class='a-section review']")
    for div in rlist:
        r = Review()
        r['product_id'] = asin
        r['product_name'] = title
        r['review_id'] = first_item(div.xpath('@id').extract())
        votes = FmtSQLCharater(first_item(div.xpath('div[1]/span/text()').extract()))
        match = re.search(u'(.+) people found this helpful', votes, re.I)
        if match:
            # the "N people found this helpful" text exposes a single count,
            # so use it for both totals
            r['total_feedback_num'] = match.group(1)
            r['total_helpful_num'] = match.group(1)
        #
        r['full_star'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
        r['title'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[2]/text()").extract()))
        r['cust_name'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
        r['creation_date'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[4]/text()").extract()))
        #r['creation_date'] = r['creation_date'].replace(u'于 ', '').replace(u'年', '/').replace(u'月', '/').replace(u'日', '/')
        r['body'] = first_item(div.xpath("div[5]/span").extract())
        yield r
    # next page
    if len(rlist) == 10:
        page = response.meta['page'] + 1
        log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
        yield Request(
            url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', str(page)),
            callback=self.parse_review,
            headers=self.headers,
            meta={'page': page, 'asin': asin}
        )
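# Hedged sketch: first_item and FmtSQLCharater are called throughout these parsers
# but are not defined in this file. The definitions below are assumptions about
# their behavior (take the first extracted value or '', and escape text before it
# is embedded in a SQL statement), not the project's actual implementations.
def first_item(values):
    """Return the first element of an extracted list, or '' when the list is empty."""
    return values[0] if values else ''


def FmtSQLCharater(text):
    """Escape single quotes so the value can sit inside a SQL string literal (assumed)."""
    return text.replace("'", "''") if text else ''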
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//ul[@class="article-list"]/li/article/div[@class="entry-content"]/header/h4/a')
        # next page
        if positions and len(positions) > 0:
            page = response.meta['page'] + 1
            yield self._list_request_by_pg(page)
        #
        for item in positions:
            link_url = first_item(item.xpath('@href').extract())
            yield Request(
                url=link_url,
                meta={'sector': response.meta['sector'], 'timeout': 5000},
                callback=self.parse_info,
                dont_filter=True,
                #errback=self._requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
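# Hedged sketch of the _list_request_by_pg pagination helper that the list parsers
# above rely on. The <?page?> placeholder in self.list_url, the self.sector attribute,
# and the meta layout are assumptions modeled on the other URL templates in this file,
# not the project's actual helper; different spiders call it with different arguments.
from scrapy.http import Request


def _list_request_by_pg(self, page):
    """Build the request for the next page of the job list (sketch)."""
    return Request(
        url=self.list_url.replace('<?page?>', str(page)),  # assumed URL template
        callback=self.parse_list,
        headers=self.headers,
        meta={'page': page, 'sector': self.sector},  # assumed meta keys
        dont_filter=True,
    )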
def parse_comment(self, response):
    data = response.body
    shop_id = response.meta['shopid']
    hxs = Selector(None, data)
    pj = hxs.xpath('//div[@class="pjbox"]/ul[@class="pj_ul"]')
    for item in pj:
        c = WubaComment()
        c['shop_id'] = shop_id
        c['user_name'] = first_item(item.xpath('li/span[@class="dp_t"]/text()').extract())
        c['c_time'] = first_item(item.xpath('li/label/i/text()').extract())
        c['content'] = first_item(item.xpath('li[2]/text()').extract())
        c['content'] = c['content'].strip(' ')
        addto = first_item(item.xpath('li/p[@class="addto"]/text()').extract())
        c['content'] += addto
        yield c
def parse_list(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    #
    """
    root = response.meta['root']
    leaf = response.meta['leaf']
    age = response.meta['age']
    star = response.meta['star']
    """
    page = response.meta['page']
    #
    hxs = Selector(None, data)
    #
    plist_a = hxs.xpath("//div[@id='resultsCol']/div[@id='centerMinus']/div[@id='atfResults']/ul[@id='s-results-list-atf']/li")
    plist_b = hxs.xpath("//div[@id='btfResults']/ul/li")
    plist = plist_a + plist_b
    #log.msg(u'类别[图书->少儿->%s->%s->%s->%s]页码[%d]总数=%d,开始请求详情...' % (root['name'], leaf['name'], age['name'], star['name'], page, len(plist)))
    #log.msg(u'Start Request:%s' % plist)
    """
    if len(plist) == 0:
        if data.find(u'verify'):
            log.msg(u'verify ban')
    """
    for item in plist:
        asin = first_item(item.xpath('@data-asin').extract())
        log.msg('Request ASIN Detail Page:' + str(asin))
        #
        """
        c = Category()
        c['product_id'] = asin
        c['category_path'] = 'n:658390051,n:!658391051,n:658409051,n:%d,n:%d,p_72:%d,p_n_age_range:%d' % (root['id'], leaf['id'], star['id'], age['id'])
        c['path_name'] = ' 图书 : 少儿 : %s : %s : %s : %s' % (root['name'], leaf['name'], star['name'], age['name'])
        yield c
        """
        # request the detail page
        detailUrl = self.info_url.replace('<?asin?>', asin)
        log.msg('DetailUrl:' + detailUrl)
        yield Request(
            url=detailUrl,
            callback=self.parse_info,
            headers=self.headers,
            meta={'asin': asin, 'proxy': 'http://192.168.1.130:8888'},
            #meta={'root': root, 'leaf': leaf, 'age': age, 'star': star, 'asin': asin}
        )
        """
        # request the reviews
        yield Request(
            url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', '1'),
            callback=self.parse_review,
            headers=self.headers,
            meta={'page': 1, 'asin': asin}
        )
        """
def parse_info_tel(self, response):
    data = response.body
    hxs = Selector(None, data)
    telnum = first_item(hxs.xpath("//ul[@class='contact_area']/li/span[@class='phone']/text()").extract())
    telnum = telnum.replace('\r\n', '').replace(' ', '')
    wb = response.meta['wb']
    wb['telnum'] = telnum
    yield wb
def parse_desc(self, response):
    data = response.body
    data = data.decode('GBK', 'ignore')
    data = data[9:-1]  # strip the JSONP callback wrapper around the JSON payload
    try:
        js = json.loads(data)
    except:
        log.msg(u'Failed to parse description for book [%s]: response is not JSON. url=%s' % (response.meta['b']['product_id'], response.url), level=log.INFO)
        return
    b = response.meta['b']
    hxs = Selector(None, js['content'])
    b['product_features'] = first_item(hxs.xpath("//div[@id='detail-tag-id-1']/div[2]/div[@class='book-detail-content']").extract())
    b['abstract'] = first_item(hxs.xpath("//div[@id='detail-tag-id-2']/div[2]/div[@class='book-detail-content']").extract())
    b['recommendation'] = b['abstract']
    b['content'] = first_item(hxs.xpath("//div[@id='detail-tag-id-3']/div[2]/div[@class='book-detail-content']").extract())
    b['brief_introduction'] = b['content']
    b['authorintro'] = first_item(hxs.xpath("//div[@id='detail-tag-id-4']/div[2]/div[@class='book-detail-content']").extract())
    b['extract'] = first_item(hxs.xpath("//div[@id='detail-tag-id-5']/div[2]/div[@class='book-detail-content']").extract())
    b['catalog'] = first_item(hxs.xpath("//div[@id='detail-tag-id-6']/div[2]/div[@class='book-detail-content']").extract())
    b['more_information'] = first_item(hxs.xpath("//div[@id='detail-tag-id-8']/div[2]/div[@class='book-detail-content']").extract())
    #
    b['abstract'] = FmtSQLCharater(b['abstract'])
    b['catalog'] = FmtSQLCharater(b['catalog'])
    b['recommendation'] = FmtSQLCharater(b['recommendation'])
    b['content'] = FmtSQLCharater(b['content'])
    b['brief_introduction'] = FmtSQLCharater(b['brief_introduction'])
    b['authorintro'] = FmtSQLCharater(b['authorintro'])
    b['extract'] = FmtSQLCharater(b['extract'])
    b['more_information'] = FmtSQLCharater(b['more_information'])
    log.msg(u'Requesting price info for product [%s]...' % b['product_id'])
    yield Request(
        url=self.price_url.replace('<?sku?>', b['product_id']),
        callback=self.parse_price,
        headers=self.headers,
        meta={'b': b}
    )
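# Hedged sketch: the fixed slice data[9:-1] above assumes the description endpoint
# always wraps its JSON in a 9-character callback prefix (something like "showdesc(",
# which is an assumption, not confirmed by this code). A regex-based unwrap is less
# brittle if the callback name ever changes.
import json
import re


def strip_jsonp(payload):
    """Return the parsed JSON body of a JSONP response such as callback({...});"""
    match = re.match(r'^\s*\w+\((.*)\)\s*;?\s*$', payload, re.S)
    return json.loads(match.group(1) if match else payload)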
def parse_list(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    category = response.meta['category']
    age = response.meta['age']
    hxs = Selector(None, data)
    #
    plist = hxs.xpath("//li/div/div[@class='gl-i-wrap j-sku-item']")
    log.msg(u'Category [%s] age [%s] page [%d]: %d items, requesting details...' % (category['name'], age['name'], response.meta['page'], len(plist)))
    for item in plist:
        sku = first_item(item.xpath('@data-sku').extract())
        '''
        # request the detail page
        yield Request(
            url=self.info_url.replace('<?sku?>', sku),
            callback=self.parse_info,
            headers=dict({'Host': 'item.jd.com', 'Upgrade-Insecure-Requests': '1'}, **self.headers),
            meta={'category': category, 'age': age, 'sku': sku}
        )
        '''
        # request the reviews
        yield Request(
            url=self.review_url.replace('<?sku?>', sku).replace('<?page?>', '1'),
            callback=self.parse_review,
            headers=self.headers,
            meta={'page': 1, 'sku': sku}
        )
    # next page
    if len(plist) == 60:
        page = response.meta['page'] + 1
        log.msg(u'Requesting category [%s] age [%s] page %d' % (category['name'], age['name'], page))
        yield Request(
            url=self.list_url.replace('<?page?>', str(page)).replace('<?cat?>', str(category['id'])).replace('<?age?>', age['id']),
            callback=self.parse_list,
            headers=dict({'Host': 'list.jd.com'}, **self.headers),
            meta={'page': page, 'category': category, 'age': age}
        )
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//div[@class="main-job"]/div')
        # job category
        f = response.meta['f']
        # next page
        if positions and len(positions) > 0:
            page = response.meta['page'] + 1
            yield self._list_request_by_pg(f, page)
        #
        for item in positions:
            logourl = first_item(item.xpath('div[@class="mobile-comp-logo"]/div[@class="comp-logo-frame"]/img/@src').extract())
            linkid = first_item(item.xpath('div[@class="job organic"]/@id').extract())
            title = first_item(item.xpath('div[@class="job organic"]/h2[@itemprop="title"]/a/@title').extract())
            link_url = first_item(item.xpath('div[@class="job organic"]/h2[@itemprop="title"]/a/@href').extract())
            link_url = link_url.replace('\r', '').replace('\n', '').replace('\t', '')
            location = first_item(item.xpath('div[@class="job organic"]/div[@class="main-content"]/h3/span[@itemprop="jobLocation"]/span[@class="location"]/span/text()').extract())
            postdate = first_item(item.xpath('div[@class="job organic"]/div[@class="main-content"]/div[@class="time"]/@content').extract())
            yield Request(
                url=link_url,
                meta={'f': f, 'logourl': logourl, 'linkid': linkid, 'title': title, 'location': location, 'postdate': postdate, 'timeout': 10000},
                callback=self.parse_info,
                dont_filter=True,
                errback=self._requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//div[@class="ns_job_wrapper"]/div[@class="ns_lt ns_jobdetails"]/a[@class="ns_joblink"]')
        # next page
        if positions and len(positions) > 0:
            page = response.meta['page'] + 1
            yield self._list_request_by_pg(page)
        #
        for item in positions:
            link_url = first_item(item.xpath('@href').extract())
            yield Request(
                url=link_url,
                meta={'timeout': 5000},
                callback=self.parse_info,
                dont_filter=True,
                #errback=self._requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
def parse_list(self, response):
    if response.status == 200:
        hxs = Selector(response)
        positions = hxs.xpath('//h4[@class="position-title "]/a[@class="position-title-link"]')
        # next page
        if positions and len(positions) > 0:
            pg = response.meta['pg'] + 1
            yield self._list_request_by_pg(response.meta['id'], response.meta['name'], pg)
        #
        for item in positions:
            link_url = first_item(item.xpath('@href').extract())
            yield Request(
                url=link_url,
                meta={'name': response.meta['name'], 'timeout': 5000},
                callback=self.parse_info,
                dont_filter=True,
                errback=self.requestErrorBack
            )
    else:
        log.msg(u'Failed to parse job list response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # parse the page
        title = first_item(hxs.xpath('//h1[@itemprop="title"]/text()').extract())
        salary = first_item(hxs.xpath('//span[@itemprop="baseSalary"]/text()').extract())
        location = first_item(hxs.xpath('//span[@itemprop="address"]/text()').extract())
        jobtype = first_item(hxs.xpath('//span[@itemprop="employmentType"]/text()').extract())
        companyname = first_item(hxs.xpath('//span[@itemprop="name"]/text()').extract())
        postdate = first_item(hxs.xpath('//span[@itemprop="datePosted"]/text()').extract())
        jobdesc = first_item(hxs.xpath('//section[@class="description"]/div[@class="well"]').extract())
        logourl = first_item(hxs.xpath('//section[@class="brandInfo"]/div[@class="well"]/h2/img/@src').extract())
        if logourl != '':
            logourl = self.create_url(logourl)
        #
        match = re.search(r'<label>Contact:</label>\s*(.+)</li>', data, re.I | re.M)
        contact = match.group(1) if match else ''
        #
        match = re.search(r'<label>Address:</label>\s*(.+)</li>', data, re.I | re.M)
        address = match.group(1) if match else ''
        #
        match = re.search(r'<label>Phone:</label>\s*(.+)</li>', data, re.I | re.M)
        phone = match.group(1) if match else ''
        #
        match = re.search(r'<label>Email:</label>\s*(.+)</li>', data, re.I | re.M)
        email = match.group(1) if match else ''
        #
        match = re.search(r'<label>Website:</label>\s*<a href="(.+)" ', data, re.I | re.M)
        website = match.group(1) if match else ''
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        address = FmtSQLCharater(address)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        match = re.search(r'\.id(.+)\?', response.url, re.I | re.M)
        if match:
            job['LinkID'] = str(int(match.group(1)))
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = response.meta['sector']
        job['JobDesc'] = FmtSQLCharater(jobdesc)
        job['Salary'] = salary
        if jobtype.find('Full time') >= 0:
            job['JobType'] = 1
        else:
            job['JobType'] = 0
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = datetime.strptime(postdate, '%d %b %y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobAddress'] = address
        job['Mobile'] = phone
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['CompanyAddress'] = address
        company['WebSite'] = website
        company['CompanyLogoUrl'] = logourl
        company['AreaName'] = job['CityName']
        company['Mobile'] = phone
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    category = response.meta['category']
    age = response.meta['age']
    data = data.decode('GBK', 'ignore')
    hxs = Selector(None, data)
    #
    p = hxs.xpath("//div[@id='product-intro']")
    preview = p.xpath("div[@id='preview']")
    item = p.xpath("div[@class='m-item-inner']/div[@id='itemInfo']")
    b = Book()
    b['product_id'] = response.meta['sku']
    b['product_name'] = first_item(item.xpath("div[@id='name']/h1/text()").extract())
    b['category_id'] = category['id']
    b['cat_age'] = age['name']
    b['age'] = first_item(item.xpath("div[@id='name']/h1/strong/text()").extract())
    author = item.xpath("div[@id='name']/div[@id='p-author']")
    b['publish_author_name'] = first_item(author.xpath('string(.)').extract())
    b['publish_author_name'] = b['publish_author_name'].replace('\t', '').replace('\r', '').replace('\n', '')
    b['publish_author_name'] = b['publish_author_name'].strip()
    # images
    images = preview.xpath("div[@id='spec-list']/div[@class='spec-items']/ul/li/img/@src").extract()
    b['images'] = '#'.join(images)
    b['images_big'] = b['images'].replace('/n5/', '/n1/')
    #
    detail = hxs.xpath("//div[@id='product-detail-1']/div[@class='p-parameter']/ul[@class='p-parameter-list']")
    b['publish_publisher'] = first_item(detail.xpath(u"li[contains(text(), '出版社')]/@title").extract())
    b['publish_standard_id'] = first_item(detail.xpath(u"li[contains(text(), 'ISBN')]/@title").extract())
    b['publish_version_num'] = first_item(detail.xpath(u"li[contains(text(), '版次')]/@title").extract())
    b['publish_binding'] = first_item(detail.xpath(u"li[contains(text(), '包装')]/@title").extract())
    b['publish_product_size'] = first_item(detail.xpath(u"li[contains(text(), '开本')]/@title").extract())
    b['publish_publish_date'] = first_item(detail.xpath(u"li[contains(text(), '出版时间')]/@title").extract())
    b['publish_paper_quality'] = first_item(detail.xpath(u"li[contains(text(), '用纸')]/@title").extract())
    b['publish_print_copy'] = first_item(detail.xpath(u"li[contains(text(), '印次')]/@title").extract())
    b['publish_number_of_pages'] = first_item(detail.xpath(u"li[contains(text(), '套装数量')]/@title").extract())
    b['publish_subtitle_language'] = first_item(detail.xpath(u"li[contains(text(), '正文语种')]/@title").extract())
    log.msg(u'Requesting description for product [%s]...' % response.meta['sku'])
    yield Request(
        url=self.desc_url.replace('<?sku?>', response.meta['sku']),
        callback=self.parse_desc,
        headers=self.headers,
        meta={'b': b}
    )
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # parse the page
        title = first_item(hxs.xpath('//h1[@class="entry-title mt_title1"]/text()').extract())
        companyname = first_item(hxs.xpath('//span[@class="entry-author"]/text()').extract())
        companyname = companyname.rstrip(' - ')
        # fields this page does not expose; default them so the item assignments below work
        salary = ''
        address = ''
        phone = ''
        website = ''
        logourl = ''
        #
        match = re.search(r'^<td.+>Location</td>\s+<td.+>(.+)</td>$', data, re.I | re.M)
        if match:
            location = match.group(1)
            if location.find(', ') > 0:
                location = location.split(',')[0]
        else:
            location = ''
        #
        match = re.search(r'^<td.+>Posted</td>\s+<td.+>(.+)</td>$', data, re.I | re.M)
        postdate = match.group(1) if match else ''
        #
        jobdesc = first_item(hxs.xpath('//div[@class="user-page mt_content1"]/div[@class="mt_content1"]').extract())
        linkid = first_item(hxs.xpath('//input[@id="uid"]/@value').extract())
        #
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = linkid
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = response.meta['sector']
        job['JobDesc'] = FmtSQLCharater(jobdesc)
        job['Salary'] = salary
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = datetime.strptime(postdate, '%d %b %y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobAddress'] = address
        job['Mobile'] = phone
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['CompanyAddress'] = address
        company['WebSite'] = website
        company['CompanyLogoUrl'] = logourl
        company['AreaName'] = job['CityName']
        company['Mobile'] = phone
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    data = response.body
    if data == '':
        log.msg(format='%(request)s post failed: response is empty.', level=log.ERROR, request=response.url)
        return
    #
    """
    root = response.meta['root']
    leaf = response.meta['leaf']
    age = response.meta['age']
    star = response.meta['star']
    """
    asin = response.meta['asin']
    #
    hxs = Selector(None, data)
    #
    container = hxs.xpath("//div[@class='a-container']")
    right = container.xpath("div[@id='rightCol']")
    left = container.xpath("div[@id='leftCol']")
    center = container.xpath("div[@id='centerCol']")
    #
    log.msg('Book--')
    b = Book()
    b['product_id'] = asin
    b['product_name'] = FmtSQLCharater(first_item(center.xpath("div[@id='booksTitle']/div/h1[@id='title']/span[@id='productTitle']/text()").extract()))
    b['subname'] = b['product_name']
    b['publish_paper_quality'] = FmtSQLCharater(first_item(center.xpath("div[@id='booksTitle']/div/h1[@id='title']/span[2]/text()").extract()))
    author = center.xpath("div[@id='booksTitle']/div[@id='byline']")
    log.msg('author html:' + str(author.extract()))
    b['publish_author_name'] = FmtSQLCharater(first_item(author.xpath('string(.)').extract()))
    b['publish_author_name'] = b['publish_author_name'].replace('\n', '').replace('\t', '').replace(' ', '')
    b['abstract'] = FmtSQLCharater(first_item(hxs.xpath("//div[@id='bookDescription_feature_div']/noscript/text()").extract()))
    images = left.xpath("div[@id='booksImageBlock_feature_div']/div[@id='imageBlockOuter']/div[@id='imageBlockThumbs']/span/div/img/@src").extract()
    bigImages = map(lambda x: x.replace('_AC_SY60_CR,0,0,60,60_', '_SY498_BO1,204,203,200_').replace('_AC_SX60_CR,0,0,60,60_', '_SX443_BO1,204,203,200_'), images)
    b['images'] = '#'.join(images)
    b['images_big'] = '#'.join(bigImages)
    #
    buybox = right.xpath("div[@id='buybox_feature_div']/div[@id='combinedBuyBox']/form[@id='addToCart']/div[@id='buybox']/div/div[@class='a-box-inner']/div")
    b['sale_price'] = FmtSQLCharater(first_item(buybox.xpath("//*[@id='a-autoid-5-announce']/span[2]/span").extract()))
    b['discount'] = FmtSQLCharater(first_item(buybox.xpath("div[@id='buyNewSection']/div/div[@id='soldByThirdParty']/span[2]/text()").extract()))
    b['original_price'] = FmtSQLCharater(first_item(buybox.xpath("//*[@id='a-autoid-4-announce']/span[2]").extract()))
    b['sale_price'] = b['sale_price'].replace(u'¥', '')
    b['discount'] = b['discount'].replace(' (', '').replace(u'折) ', '')
    b['original_price'] = b['original_price'].replace(u'¥', '')
    # basic product details
    bullets = hxs.xpath("//div[@id='productDetails']/table/tr/td[@class='bucket']/div[@class='content']/ul/li")
    for li in bullets:
        log.msg('Book-base-info')
        if li.xpath(u"b[contains(text(), 'Publisher')]"):
            publisher = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
            # e.g. 未来出版社; 第1版 (2011年11月1日)
            match = re.search(u'(.+); 第(.+)版 \((.+)\)', publisher, re.I | re.M)
            if match:
                b['publish_publisher'] = match.group(1)
                b['publish_version_num'] = match.group(2)
                b['publish_publish_date'] = match.group(3)
        elif li.xpath(u"b[contains(text(), 'Series')]"):
            b['product_name'] = FmtSQLCharater(first_item(li.xpath("a/text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Paperback')]"):
            b['publish_paper_quality'] = u'Paperback'
            b['publish_number_of_pages'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Hardcover')]"):
            b['publish_paper_quality'] = u'Hardcover'
            b['publish_number_of_pages'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), '纸板书')]"):
            b['publish_paper_quality'] = u'纸板书'
            b['publish_number_of_pages'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Age Range')]"):
            b['age'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Language')]"):
            b['publish_subtitle_language'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), '开本')]"):
            b['publish_product_size'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'ISBN-13')]"):
            b['publish_standard_id'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).lstrip())
        #elif li.xpath(u"b[contains(text(), '条形码')]"):
        #    b['publish_barcode'] = first_item(li.xpath("text()").extract()).lstrip()
        elif li.xpath(u"b[contains(text(), 'Product Dimensions')]"):
            b['publish_product_size2'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).replace('\n', '').strip())
        elif li.xpath(u"b[contains(text(), 'Shipping Weight')]"):
            b['publish_product_weight'] = FmtSQLCharater(first_item(li.xpath("text()").extract()).replace('\n', '').strip())
        #elif li.xpath(u"b[contains(text(), '品牌')]"):
        #    b['brand'] = first_item(li.xpath("text()").extract()).lstrip()
    # product description embedded in an inline script block
    begin = data.find('var iframeContent =')
    end = data.find('obj.onloadCallback = onloadCallback;')
    if begin != -1 and end != -1:
        desc = data[begin + 21:end - 10]
        desc = urllib2.unquote(desc)
        hxs = Selector(None, desc)
        b['recommendation'] = first_item(hxs.xpath(u"//div[@class='content']/h3[contains(text(), '编辑推荐')]/following-sibling::div[1]/text()").extract())
        b['catalog'] = first_item(hxs.xpath(u"//div[@class='content']/h3[contains(text(), '目录')]/following-sibling::div[1]/text()").extract())
        b['more_information'] = first_item(hxs.xpath(u"//div[@class='content']/h3[contains(text(), '文摘')]/following-sibling::div[1]/text()").extract())
    #
    yield b
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # start parsing
        match = re.search(r"^var foldr = '(.+)';", data, re.I | re.M)
        linkid = match.group(1) if match else ''
        if linkid == '':
            log.msg(u'No job ID found on page, dropping it. %s' % response.url, log.ERROR)
            return
        else:
            log.msg(u'Found job, ID=[%s]' % linkid)
        #
        title = first_item(hxs.xpath('//div[@class="ns_jd_headingbig hl"]/h1/strong/text()').extract())
        title = title.rstrip(' ')
        logourl = first_item(hxs.xpath('//div[@class="ns_jd_comp_logo"]/img/@src').extract())
        companyname = first_item(hxs.xpath('//span[@class="ns_comp_name"]/text()').extract())
        # Locations
        match = re.search(r'<strong>Locations</strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        location = match.group(1) if match else ''
        # Experience
        match = re.search(r'<strong>Experience </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        experience = match.group(1) if match else ''
        # Keywords / Skills
        match = re.search(r'<strong>Keywords / Skills </strong></h2></div>\s+<div class="ns_jobsum_txt"\s.+>(.+)\s</div>', data, re.I | re.M)
        skills = match.group(1) if match else ''
        # Education
        match = re.search(r'<strong>Education </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        education = match.group(1) if match else ''
        # Function
        match = re.search(r'<strong>Function </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            function = match.group(1)
            function = function.replace(' • ', '*')
            function = function.replace('<br />', '')
        else:
            function = ''
        # Role
        match = re.search(r'<strong>Role </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            role = match.group(1)
            role = role.replace(' • ', '*')
            role = role.replace('<br />', '')
        else:
            role = ''
        # Industry
        match = re.search(r'<strong>Industry </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            industry = match.group(1)
            industry = industry.replace(' • ', '')
            industry = industry.replace('<br />', ';')
        else:
            industry = ''
        # Summary
        match = re.search(r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)</div>', data, re.I | re.M)
        if match:
            summary = match.group(1)
        else:
            # the summary may wrap onto a second line
            match = re.search(r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+\s+.+)</div>', data, re.I | re.M)
            summary = match.group(1) if match else ''
        #
        match = re.search(r'<strong>Posted On </strong></h2></div>\s+<div class="ns_jobsum_txt">\s(.+)\s</div>\t', data, re.I | re.M)
        postdate = match.group(1) if match else ''
        #
        desc = hxs.xpath('//div[@class="ns_jobdesc hl"]').extract()
        jobdesc = desc[0] if desc else ''
        comdesc = desc[1] if desc and len(desc) > 1 else ''
        #
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = linkid
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = function
        job['JobDesc'] = FmtSQLCharater(summary + '<p>' + jobdesc)
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date: strip ordinal suffixes before parsing
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = postdate.replace('st', '')
            postdate = postdate.replace('nd', '')
            postdate = postdate.replace('rd', '')
            postdate = postdate.replace('th', '')
            postdate = datetime.strptime(postdate, '%d %b %Y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobComputerSkill'] = skills
        job['Exercise'] = experience
        job['Eduacation'] = education
        job['JobFunction'] = role
        job['Industry'] = industry
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['Industry'] = industry
        company['CompanyLogoUrl'] = logourl
        company['CompanyDesc'] = FmtSQLCharater(comdesc)
        company['AreaName'] = job['CityName']
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
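# Hedged sketch: the chained .replace('st'/'nd'/'rd'/'th', '') calls above strip the
# ordinal suffix from posted-on dates before strptime. A regex anchored to a digit
# avoids ever touching month names; this is an alternative sketch, not the spider's
# actual code, and assumes dates shaped like "3rd Aug 2015".
import re
from datetime import datetime


def parse_posted_on(text):
    """Parse a posted-on string such as '3rd Aug 2015' into a datetime."""
    cleaned = re.sub(r'(?<=\d)(st|nd|rd|th)', '', text.strip())
    return datetime.strptime(cleaned, '%d %b %Y')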
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # parse the page
        # company banner
        company_banner = first_item(hxs.xpath('//img[@id="company_banner"]/@data-original').extract())
        # company logo
        company_logo = first_item(hxs.xpath('//img[@id="company_logo"]/@data-original').extract())
        # position title
        position_title = first_item(hxs.xpath('//h1[@id="position_title"]/text()').extract())
        position_title = FmtSQLCharater(position_title)
        # company name
        company_name = first_item(hxs.xpath('//h2[@id="company_name"]/a/text()').extract())
        if company_name == '':
            company_name = first_item(hxs.xpath('//h2[@id="company_name"]/text()').extract())
        company_name = company_name.replace('\n', '')
        company_name = company_name.replace('\t', '')
        company_name = company_name.strip(' ')
        company_name = FmtSQLCharater(company_name)
        if company_name == '':
            log.msg(u'Company name is empty, url=%s' % response.url)
            return
        # company source URL
        company_url = first_item(hxs.xpath('//h2[@id="company_name"]/a/@href').extract())
        # salary
        salary = first_item(hxs.xpath('//div[@id="salary"]/p/a/text()').extract())
        # experience
        experience = first_item(hxs.xpath('//div[@id="experience"]/p[@id="years_of_experience"]/span[@id="years_of_experience"]/text()').extract())
        experience = experience.replace('\n', '').replace('\t', '')
        # location
        location = first_item(hxs.xpath('//div[@id="location"]/p/span[@id="single_work_location"]/text()').extract())
        location = location.replace('\n', '').replace('\t', '')
        # job description (may include responsibilities and requirements)
        job_desc = first_item(hxs.xpath('//div[@id="job_description"]').extract())
        # company info
        company_registration_number = first_item(hxs.xpath('//span[@id="company_registration_number"]/text()').extract())
        company_industry = first_item(hxs.xpath('//p[@id="company_industry"]/text()').extract())
        company_website = first_item(hxs.xpath('//a[@id="company_website"]/text()').extract())
        company_contact = first_item(hxs.xpath('//p[@id="company_contact"]/text()').extract())
        company_size = first_item(hxs.xpath('//p[@id="company_size"]/text()').extract())
        work_environment_working_hours = first_item(hxs.xpath('//p[@id="work_environment_working_hours"]/text()').extract())
        work_environment_dress_code = first_item(hxs.xpath('//p[@id="work_environment_dress_code"]/text()').extract())
        work_environment_benefits = first_item(hxs.xpath('//p[@id="work_environment_benefits"]/text()').extract())
        work_environment_spoken_language = first_item(hxs.xpath('//p[@id="work_environment_spoken_language"]/text()').extract())
        # gallery
        gallery = ''
        thumbs = hxs.xpath('//ul[@class="gallery-thumb"]/li')
        for item in thumbs:
            gallery += first_item(item.xpath('img/@data-original').extract()) + ';'
        # company overview
        company_overview_all = first_item(hxs.xpath('//div[@id="company_overview_all"]').extract())
        # work location (map coordinates)
        match = re.search(r'&center=(.*?)&', data, re.I | re.M)
        if match:
            gps_location = match.group(1)
            lat = gps_location.split(',')[0]
            lng = gps_location.split(',')[1]
        else:
            lat = '0.0'
            lng = '0.0'
        #
        address = first_item(hxs.xpath('//p[@id="address"]/text()').extract())
        address = FmtSQLCharater(address)
        # Advertised: 23-June-2015
        posting_date = first_item(hxs.xpath('//p[@id="posting_date"]/text()').extract())
        posting_date = posting_date.replace('Advertised:', '')
        posting_date = posting_date.replace(' ', '')
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        # e.g. http://jobs.jobstreet.com/sg/jobs/4712859?fr=J
        job['LinkID'] = response.url[34:-5]
        job['JobTitle'] = position_title
        job['Company'] = company_name
        job['Industry'] = company_industry
        job['JobName'] = response.meta['name']
        job['JobDesc'] = FmtSQLCharater(job_desc)
        job['Salary'] = salary
        job['Exercise'] = experience
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['SSWelfare'] = work_environment_benefits
        job['Number'] = 'one person'
        # normalize the post date
        PostDate = datetime.strptime(posting_date, '%d-%B-%Y')
        job['PublishTime'] = PostDate
        job['RefreshTime'] = PostDate
        if location != '' and len(location.split('-')) > 1:
            job['CityName'] = location.split('-')[0].replace(' ', '')
            job['WorkArea'] = location.split('-')[1].replace(' ', '')
        else:
            job['CityName'] = location
            job['WorkArea'] = job['CityName']
        job['ForeignLanguage'] = work_environment_spoken_language
        job['JobWorkTime'] = work_environment_working_hours
        job['GisLongitude'] = lng
        job['GisLatitude'] = lat
        job['JobAddress'] = address
        job['Mobile'] = company_contact
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = company_name
        company['Industry'] = company_industry
        company['CompanyScale'] = company_size
        company['CompanyAddress'] = address
        company['CompanyUrl'] = company_url
        company['WebSite'] = company_website
        company['CompanyLogoUrl'] = company_logo
        company['AreaName'] = job['CityName']
        company['CompanyDesc'] = FmtSQLCharater(company_overview_all)
        company['Mobile'] = company_contact
        company['GisLongitude'] = lng
        company['GisLatitude'] = lat
        company['OtherInfo'] = company_banner + '#' + gallery
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    if response.status == 200:
        data = response.body
        hxs = Selector(None, data)
        # start parsing; most fields were passed along from the list page via meta
        linkid = response.meta['linkid']
        title = response.meta['title']
        logourl = response.meta['logourl']
        location = response.meta['location']
        function = response.meta['f']
        postdate = response.meta['postdate']
        #
        companyname = first_item(hxs.xpath('//div[@class="additional_info"]/span[@class="company"]/a/text()').extract())
        companyname = companyname.strip(' ')
        if companyname == '':
            log.msg(u'This job comes from another site (%s) and cannot be crawled.' % response.url, level=log.ERROR)
            return
        #
        desc = first_item(hxs.xpath('//div[@class="p-description"]').extract())
        # strip the wrapping <div class="p-description"> tags
        if desc.startswith('<div class="p-description">'):
            desc = desc[len('<div class="p-description">'):]
        if desc.endswith('</div>'):
            desc = desc[:-len('</div>')]
        desc = desc.replace('\t', '')
        #
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = linkid
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = function
        job['JobDesc'] = FmtSQLCharater(desc)
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # normalize the post date
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = datetime.strptime(postdate, '%Y-%m-%d')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['CompanyLogoUrl'] = logourl
        company['AreaName'] = job['CityName']
        #
        yield company
        yield job
    else:
        log.msg(u'Failed to parse job detail response. url=%s' % response.url, level=log.INFO)