def parse(self, response):
    """Parse a Douban book detail page into a BookItem.

    Yields follow-up requests for long/short review pages, the populated
    item (only when the ISBN is a well-formed 13-char string), and
    requests for related books to keep the crawl expanding.
    """
    self._logger.info("解析url:" + response.url)
    douban_item = BookItem()
    # Initialise every expected field to '' so downstream packing never
    # hits a missing key.  (Renamed from `list`, which shadowed the builtin.)
    field_names = ['preface', 'catalog', 'translator', 'isbn', 'subhead',
                   'edition', 'language', 'orgcategory', 'type', 'packing',
                   'seriename', 'coverurl', 'coverpath', 'pages', 'epilogue',
                   'price', 'publishdate', 'sourcetype', 'editorsugest',
                   'papermeter', 'printedtime', 'summary', 'orgisbn', 'author',
                   'usersugest', 'orgpublisher', 'words', 'format', 'issuearea',
                   'contenttype', 'contentsummary', 'salecategory', 'publisher',
                   'impression', 'bookname', 'category', 'collectiontime',
                   'orgcode', 'skuid', 'commentcount', 'ifimport', '_row',
                   '_entitycode', 'url', 'commentpercent', 'commenttag',
                   'authorintro', 'sourceprice']
    for item_key in field_names:
        douban_item[item_key] = ''
    douban_item['bookname'] = response.xpath("//h1/span/text()").extract_first()
    # The "info" box lists key/value pairs (出版社 / ISBN / ...) as loose
    # text nodes; keys and values are extracted separately and re-paired.
    selector_info = response.xpath("//div[@id='info']")
    key_list = response.xpath("//div[@id='info']//span/text()").extract()
    info_list = selector_info.xpath('./text()|a/text()|span/a/text()').extract()
    key_list = remove_meaningless_str(key_list)
    key_list = merge_key(key_list)
    info_list = remove_meaningless_str(info_list)
    info_list = merge_info(info_list)
    if len(key_list) > 11:
        self._logger.error('出现封装字段数大于11的图书:' + response.url)
        self._logger.error(key_list)
    # Keys and values must pair one-to-one before packing.
    if len(key_list) == len(info_list):
        douban_item = packing_info(douban_item, key_list, info_list)
    else:
        self._logger.error('基础信息封装代码出现BUG:' + response.url)
    # Only books whose packed ISBN is a well-formed 13-char string get
    # fully populated and yielded.
    is_set = '否'
    if not douban_item['isbn'] or len(douban_item['isbn']) != 13:
        is_set = '是'
    if is_set == '否':
        # 内容简介 (content summary)
        contentsummary_selector_list = response.xpath("//div[@id='link-report']//div[@class='intro']")
        douban_item['contentsummary'] = packing_content(contentsummary_selector_list, -1)
        douban_item['sourcetype'] = '03'  # '03' = Douban source code
        douban_item['salecategory'] = ''
        douban_item['category'] = ''
        douban_item['orgcategory'] = ''
        contenttype = response.xpath("//a[@class=' tag']/text()").extract()
        douban_item['contenttype'] = ','.join(contenttype)
        douban_item['issuearea'] = ''
        douban_item['type'] = '01'
        douban_item['edition'] = ''
        douban_item['impression'] = ''
        douban_item['words'] = ''
        douban_item['language'] = ''
        douban_item['printedtime'] = ''
        douban_item['format'] = ''
        douban_item['papermeter'] = ''
        # Cover image URL
        douban_item['coverurl'] = response.xpath("//a[@class='nbg']/@href").extract_first()
        # Local path where the cover image is saved (dated folder, YYYYMMDD).
        today_str = str(datetime.datetime.now()).split(".")[0].split()[0].replace('-', '')
        sku_id = re.findall(r"\d+", response.url)[0]
        douban_item['coverpath'] = '/book/' + today_str + '/' + '03' + douban_item['isbn'] + '.png'
        # 目录 (table of contents) — the full TOC div id embeds the sku id.
        catalog_selector_list = response.xpath("//div[@id='dir_" + sku_id + "_full']")
        douban_item['catalog'] = packing_content(catalog_selector_list)
        douban_item['editorsugest'] = ''
        douban_item['usersugest'] = ''
        douban_item['preface'] = ''
        douban_item['summary'] = ''
        douban_item['epilogue'] = ''
        # Collection timestamp (seconds precision, no microseconds).
        now_str = str(datetime.datetime.now()).split(".")[0]
        douban_item['collectiontime'] = now_str
        douban_item['orgcode'] = ''
        douban_item['skuid'] = sku_id
        # Sum the review counts shown in the section headers; headers with
        # no digits count as 0.  (extract() always returns a list, so the
        # former `!= None` check was a no-op and has been dropped.)
        comment_count_list = response.xpath("//h2/span[@class='pl']/a/text()").extract()
        comment_count = 0
        for count_text in comment_count_list:
            digits = re.findall(r"\d+", count_text)
            if len(digits) == 0:
                digits.append('0')
            comment_count = comment_count + int(digits[0])
        douban_item['commentcount'] = str(comment_count)
        douban_item['ifimport'] = '0'
        douban_item['_row'] = douban_item['skuid'] + '03'
        douban_item['_entitycode'] = 'web_page_p_book_info_09'
        douban_item['is_set'] = '否'
        douban_item['url'] = response.url
        douban_item['commentpercent'] = ''
        douban_item['commenttag'] = ''
        douban_item['authorintro'] = ''
        douban_item['sourceprice'] = ''
        # 书评列表地址 — follow each long-review link, carrying the item along.
        long_comment_list = response.xpath("//div[@class='main-bd']/h2/a/@href").extract()
        for long_comment_link in long_comment_list:
            yield scrapy.Request(long_comment_link,
                                 meta={'douban_item': douban_item},
                                 callback=self.parse_long_comment)
        # 短评地址 — follow the short-comment listing page.
        short_comment_link = response.xpath("//div[@class='related_info']/p/a/@href").extract_first()
        if short_comment_link:
            yield scrapy.Request(short_comment_link,
                                 meta={'douban_item': douban_item},
                                 callback=self.parse_short_comment)
        yield douban_item
    # 添加相关图书的链接 — always expand the crawl via related books.
    # NOTE(review): a DB-based URL-dedup step here was previously
    # commented out; Scrapy's own request dedup filter still applies.
    similarity_urls = response.xpath("//dd//a/@href").extract()
    for similarity_url in similarity_urls:
        yield scrapy.Request(similarity_url)
def parse(self, response):
    """Parse an Amazon book detail page into a BookItem.

    Also walks the on-page review list, yielding one CommentItem per
    review before yielding the book item itself.
    """
    item = BookItem()
    # Blank out every known field so nothing is left unset downstream.
    for item_key in item_list:
        item[item_key] = ''
    item['is_set'] = '否'
    is_set = '否'
    # 判断isbn是否满足要求 — the ISBN field may hold several comma-separated
    # values; prefer the 13-digit one.
    # NOTE(review): unlike the sibling spiders, is_set is never flipped to
    # '是' here, so the gate below always passes — preserved as-is.
    isbn = self.get_basicinfo(response, 'ISBN')
    isbn_list = isbn.split(',')
    if len(isbn_list) == 1:
        isbn = isbn_list[0]
    elif len(isbn_list) > 1:
        for candidate in isbn_list:
            candidate = candidate.strip()
            if len(candidate) == 13:
                isbn = candidate
    if not isbn:
        isbn = ''
    if is_set == '否':
        skuid = self.get_basicinfo(response, 'ASIN')
        # Fetch the dynamically loaded product-description HTML.
        html = self.get_content_and_cate(skuid)
        bookname = response.xpath("//h1/span[@id='productTitle']/text()").extract_first()
        bookname = bookname.strip()
        item['bookname'] = bookname
        item['subhead'] = ''
        # Publisher string looks like "出版社; 第N版 (date)" — keep the name part.
        publisher_str = self.get_basicinfo(response, '出版社')
        publisher = publisher_str.split(';')[0].strip()
        item['publisher'] = publisher
        item['orgpublisher'] = publisher
        contentsummary = response.xpath("//noscript/div/text()").extract()
        item['contentsummary'] = ''.join(contentsummary)
        item['sourcetype'] = '05'  # '05' = Amazon source code
        author_list = response.xpath("//div[@id='bylineInfo']/span[1]/a/text()").extract()
        item['author'] = '#'.join(author_list)
        translator_list = response.xpath("//div[@id='bylineInfo']/span[2]/a/text()").extract()
        item['translator'] = '#'.join(translator_list)
        item['isbn'] = isbn
        item['orgisbn'] = isbn
        item['salecategory'] = ''
        item['category'] = ''
        item['orgcategory'] = ''
        # Breadcrumb trail becomes the comma-joined content type.
        contenttype_list = response.xpath("//div[@id='wayfinding-breadcrumbs_feature_div']//span[@class='a-list-item']/a/text()").extract()
        contenttype_list = [c.strip() for c in contenttype_list]
        item['contenttype'] = ','.join(contenttype_list)
        item['issuearea'] = ''
        item['type'] = '01'
        # Binding type (e.g. 平装/精装) shown next to the title; the page-count
        # row in the detail table is keyed by this same binding string.
        packing = response.xpath("//h1/span[2]/text()").extract_first()
        edition = re.findall(r'第(\d+)版', publisher_str)
        if not edition:
            edition = ['']
        item['edition'] = edition[0]
        item['impression'] = ''
        item['words'] = ''
        pages = re.findall(r'\d+', self.get_basicinfo(response, packing))
        if not pages:
            pages = ['']
        item['pages'] = pages[0]
        item['language'] = self.get_basicinfo(response, '语种')
        price = response.xpath("//div[@id = 'buyBoxInner']/ul/li/span/span[2]/text()").extract_first()
        # Guard: extract_first() may return None and the regex may not match.
        price = re.findall(r'\d+[.]*\d+', price or '')
        if not price:
            price = ['']
        item['price'] = price[0]
        item['format'] = self.get_basicinfo(response, '开本')
        item['papermeter'] = ''
        item['packing'] = packing
        item['coverurl'] = response.xpath("//div[@id = 'img-canvas']/img/@src").extract_first()
        item['seriename'] = ''
        item['catalog'] = self.parse_desc(html, '目录')
        item['editorsugest'] = self.parse_desc(html, '编辑推荐')
        item['usersugest'] = self.parse_desc(html, '名人推荐')
        item['preface'] = ''
        item['summary'] = self.parse_desc(html, '文摘')
        item['epilogue'] = ''
        # Publish date next to the title, e.g. "2019年5月" → "2019-5".
        publishdate = response.xpath("//h1/span[3]/text()").extract_first()
        if not publishdate:
            publishdate = ''
        if len(publishdate) > 7:
            pub_list = re.findall(r'(\d+)年(\d+)月', publishdate)
            publishdate = '-'.join(pub_list[0]) if pub_list else ''
        item['publishdate'] = publishdate
        item['printedtime'] = publishdate
        item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['orgcode'] = ''
        item['skuid'] = skuid
        commentcount = response.xpath("//span[@id='acrCustomerReviewText']/text()").extract_first()
        if not commentcount:
            commentcount = '0'
        # Concatenate all digit runs (drops thousands separators).
        commentcount = re.findall(r'(\d+)*', commentcount)
        item['commentcount'] = ''.join(commentcount)
        item['_row'] = skuid + item['sourcetype']
        item['coverpath'] = '/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/' + item['_row'] + '.jpg'
        item['is_set'] = '否'
        item['ifimport'] = '0'
        item['url'] = response.url
        item['_entitycode'] = 'web_page_p_book_info_09'
        item['commentpercent'] = ''
        # Review tags come from a separate endpoint; best-effort only.
        try:
            tag_resp = self.get_commenttag(skuid)
            commenttag = tag_resp.xpath("//span/@data-cr-trigger-on-view")
            commenttag = json.loads(commenttag[0])
            commenttag = commenttag['ajaxParamsMap']['lighthouseTerms'].replace('/', '#')
        except Exception:  # narrowed from a bare except
            commenttag = ''
        item['commenttag'] = commenttag
        item['authorintro'] = self.parse_desc(html, '作者简介')
        sourceprice = response.xpath("//div[@id='soldByThirdParty']/span[2]/text()").extract_first()
        # Guard against a missing element (None) before the regex.
        sourceprice = re.findall(r'\d+[.]*\d+', sourceprice or '')
        if not sourceprice:
            sourceprice = ['']
        item['sourceprice'] = sourceprice[0]
        comments = response.xpath("//div[@id='cm-cr-dp-review-list']/div")
        # 遍历评论列表 — one CommentItem per on-page review.
        if comments:
            for comment in comments:
                comment_item = CommentItem()
                comment_item['isbn'] = isbn
                comment_item['uri'] = response.url
                comment_item['bookname'] = bookname
                comment_item['sourcetype'] = item['sourcetype']
                comment_item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                publishdate_c = response.xpath("//h1/span[3]/text()").extract_first()
                if not publishdate_c:
                    publishdate_c = ''
                else:
                    pub_list = re.findall(r'(\d+)年(\d+)月(\d+)日', publishdate_c)
                    publishdate_c = '-'.join(pub_list[0]) if pub_list else ''
                comment_item['publishtime'] = publishdate_c
                username = comment.xpath("./div/div[1]/a/div/span/text()").extract_first()
                if not username:
                    username = ''
                comment_item['username'] = username
                comment_item['hitcount'] = '0'
                comment_item['follownum'] = '0'
                suportnum = comment.xpath("./div/div[7]/span/div/span[@data-hook='helpful-vote-statement']/text()").extract_first()
                if not suportnum:
                    suportnum = '0'
                suportnum = re.findall(r'\d+', suportnum)[0]
                comment_item['suportnum'] = suportnum
                comment_item['opposnum'] = '0'
                comment_item['commentid'] = comment.xpath("./@id").extract_first()
                comment_item['followcommentid'] = ''
                commenttitle = comment.xpath(".//a[@data-hook='review-title']/text()").extract_first()
                if not commenttitle:
                    commenttitle = ''
                comment_item['commenttitle'] = commenttitle
                comment_item['commenttype'] = '0'
                comment_strs = comment.xpath(".//div[@data-hook='review-collapsed']/text()").extract()
                comment_item['comment'] = ''.join(comment_strs)
                # NOTE(review): this XPath is absolute, so every iteration
                # reads the FIRST review's rating — looks like a latent bug,
                # but the correct relative path can't be confirmed from here.
                score = comment.xpath("//div[@id='cm-cr-dp-review-list']/div[1]/div[1]/div[2]/a/@title").extract_first()
                if not score:
                    # FIX: default was the list ['5.0'], which made
                    # re.findall() below raise TypeError; must be a string.
                    score = '5.0'
                score = re.findall(r'\d.\d', score)[0]
                score = score[:1]  # keep the integer part, e.g. '4.0' → '4'
                comment_item['score'] = score
                score = float(score)
                if score < 2:
                    level = '2'
                elif score < 4:
                    level = '1'
                else:
                    level = '0'
                comment_item['level'] = level
                comment_item['commpoint'] = ''
                comment_item['type'] = '01'
                comment_item['sitename'] = '亚马逊'
                comment_item['_row'] = comment_item['isbn'] + comment_item['sourcetype'] + comment_item['publishtime'] + comment_item['commentid']
                comment_item['_entitycode'] = 'web_page_p_book_comment_09'
                comment_item['skuid'] = skuid
                yield comment_item
        yield item
def parse(self, response):
    """Parse a JD (京东) book detail page into a BookItem.

    Price, comments, comment count/percentage and tags come from JSON
    side-channel APIs (self.get_price / self.get_comment); one
    CommentItem is yielded per review dict, then the book item.
    """
    item = BookItem()
    # Blank out every known field so nothing is left unset downstream.
    for item_key in item_list:
        item[item_key] = ''
    item['is_set'] = '否'
    is_set = '否'
    # 判断isbn是否满足要求 — only 13-char ISBNs are processed further.
    isbn = self.get_basicinfo(response, 'ISBN')
    if len(isbn) != 13:
        isbn = ''
        is_set = '是'
    if is_set == '否':
        skuid = response.url.split('/')[-1].replace('.html', '')
        # 加载商品描述信息接口 — dynamically loaded description HTML.
        html = self.get_content_and_cate(skuid)
        # 加载商品价格接口
        sourceprice, price = self.get_price(skuid)
        # 加载商品评论、评论数、好评率接口
        comments, commentcount, commentpercent, commenttag = self.get_comment(skuid)
        bookname = response.xpath("//div[@class='sku-name']/text()").extract_first()
        bookname = bookname.strip()
        item['bookname'] = bookname
        item['subhead'] = ''
        item['publisher'] = self.get_basicinfo(response, '出版社')
        item['orgpublisher'] = self.get_basicinfo(response, '出版社')
        contentsummary = self.parse_desc(html, '内容简介')
        item['contentsummary'] = ''.join(contentsummary)
        item['sourcetype'] = '01'  # '01' = JD source code
        author_list = response.xpath("//div[@class='p-author']/a/@data-name").extract()
        item['author'] = '#'.join(author_list)
        item['translator'] = ''
        item['isbn'] = isbn
        item['orgisbn'] = isbn
        item['salecategory'] = ''
        item['category'] = ''
        item['orgcategory'] = ''
        # Breadcrumb trail minus the brand entry becomes the content type.
        brand = self.get_basicinfo(response, '品牌')
        contenttype_list = response.xpath(
            "//div[@class='crumb fl clearfix']/div[@class='item']/a/text()"
        ).extract()
        try:
            contenttype_list.remove(brand)
        except ValueError:  # brand not in the breadcrumb — nothing to strip
            pass
        item['contenttype'] = ','.join(contenttype_list)
        item['issuearea'] = ''
        item['type'] = '01'
        item['edition'] = self.get_basicinfo(response, '版次')
        item['impression'] = ''
        item['words'] = self.get_basicinfo(response, '字数')
        pages = re.findall(r'\d+', self.get_basicinfo(response, '页数'))
        if not pages:
            # FIX: original assigned `page = ['']` (typo), so a missing
            # page count crashed with IndexError on pages[0] below.
            pages = ['']
        item['pages'] = pages[0]
        item['language'] = self.get_basicinfo(response, '正文语种')
        item['price'] = price
        item['format'] = self.get_basicinfo(response, '开本')
        item['papermeter'] = self.get_basicinfo(response, '用纸')
        item['packing'] = self.get_basicinfo(response, '包装')
        item['coverurl'] = 'http:' + response.xpath("//div[@id= 'spec-n1']/img/@src").extract_first()
        item['seriename'] = self.get_basicinfo(response, '丛书名')
        item['catalog'] = self.parse_desc(html, '目录')
        item['editorsugest'] = self.parse_desc(html, '编辑推荐')
        item['usersugest'] = self.parse_desc(html, '精彩书评')
        item['preface'] = self.parse_desc(html, '前言/序言')
        item['summary'] = self.parse_desc(html, '精彩书摘')
        item['epilogue'] = ''
        # Trim "YYYY-MM-DD" down to "YYYY-MM".
        publishdate = self.get_basicinfo(response, '出版时间')
        if not publishdate:
            publishdate = ''
        if len(publishdate) > 7:
            index = publishdate.rfind('-')
            publishdate = publishdate[:index]
        item['publishdate'] = publishdate
        item['printedtime'] = publishdate
        item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['orgcode'] = ''
        item['skuid'] = skuid
        item['commentcount'] = str(commentcount)
        item['_row'] = skuid + '01'
        item['coverpath'] = '/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/' + item['_row'] + '.jpg'
        item['is_set'] = '否'
        item['ifimport'] = '0'
        item['url'] = response.url
        item['_entitycode'] = 'web_page_p_book_info_09'
        item['commentpercent'] = commentpercent
        item['commenttag'] = commenttag
        item['authorintro'] = self.parse_desc(html, '作者简介')
        item['sourceprice'] = sourceprice
        # 遍历评论列表 — comments are dicts from JD's review JSON API.
        if comments:
            for comment in comments:
                comment_item = CommentItem()
                comment_item['isbn'] = isbn
                comment_item['uri'] = response.url
                comment_item['bookname'] = bookname
                comment_item['sourcetype'] = '01'
                comment_item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                comment_item['publishtime'] = comment['creationTime']
                # JD anonymises reviewers, so a placeholder name is stored.
                comment_item['username'] = '******'
                comment_item['hitcount'] = '0'
                follownum = str(comment['replyCount'])
                if not follownum:
                    follownum = '0'
                comment_item['follownum'] = follownum
                suportnum = str(comment['usefulVoteCount'])
                if not suportnum:
                    suportnum = '0'
                comment_item['suportnum'] = suportnum
                comment_item['opposnum'] = '0'
                comment_item['commentid'] = str(comment['id'])
                comment_item['followcommentid'] = ''
                comment_item['commenttitle'] = ''
                comment_item['commenttype'] = '0'
                comment_item['comment'] = comment['content']
                score = str(comment['score'])
                if not score:
                    score = '5'
                comment_item['score'] = score
                # Map 1-5 star score onto coarse sentiment levels.
                score = int(score)
                if score < 2:
                    level = '2'
                elif score < 4:
                    level = '1'
                else:
                    level = '0'
                comment_item['level'] = level
                comment_item['commpoint'] = ''
                comment_item['type'] = '01'
                comment_item['sitename'] = '京东'
                comment_item['_row'] = comment_item['isbn'] + comment_item['sourcetype'] + comment_item['publishtime'] + comment_item['username']
                comment_item['_entitycode'] = 'web_page_p_book_comment_09'
                comment_item['skuid'] = skuid
                yield comment_item
        yield item
def parse(self, response):
    """Parse a Dangdang e-book detail page into a BookItem.

    E-books carry no ISBN, so no validity gate is applied.  Reviews are
    fetched from a JSON API (self.get_comments) and yielded as
    CommentItems before the book item itself.
    """
    item = BookItem()
    # Blank out every known field so nothing is left unset downstream.
    for item_key in item_list:
        item[item_key] = ''
    is_set = '否'
    item['is_set'] = is_set
    skuid = response.url.split('/')[-1].replace('.html', '')
    bookname = response.xpath("//span[@class='title_words']/@title").extract_first()
    bookname = bookname.strip()
    item['bookname'] = bookname
    item['subhead'] = response.xpath("//p[@class='title_descript']/@title").extract_first()
    item['publisher'] = response.xpath("//p[@id='publisher']//a/text()").extract_first()
    item['orgpublisher'] = response.xpath("//p[@id='publisher']//a/text()").extract_first()
    contentsummary = response.xpath("//div[@class='newEdit_box']//text()").extract()
    item['contentsummary'] = '<br>'.join(contentsummary)
    item['sourcetype'] = '02'  # '02' = Dangdang source code
    # Authors are separated by either '、' or ','; normalise to '#'.
    authors = response.xpath("//p[@id='author']//a/text()").extract_first()
    if not authors:
        authors = ''
    authors = authors.replace('、', ',')
    author_list = authors.split(',')
    item['author'] = '#'.join(author_list)
    item['translator'] = ''
    item['isbn'] = ''
    item['orgisbn'] = ''
    item['salecategory'] = ''
    item['category'] = ''
    item['orgcategory'] = ''
    # Breadcrumb trail: strip '>' separators and drop the book title entry.
    # FIX: the original popped entries from contenttype_list while
    # enumerating it, which skipped the element after a removal; build the
    # cleaned list without mutating during iteration instead.
    contenttype_list = response.xpath("//div[@id='crumb']/a/text()").extract()
    contenttype_list = [ct.replace('>', '').strip() for ct in contenttype_list]
    contenttype_list = [ct for ct in contenttype_list if ct != bookname]
    item['contenttype'] = ','.join(contenttype_list)
    item['issuearea'] = ''
    item['type'] = '02'
    item['edition'] = ''
    item['impression'] = ''
    # The "explain" box lists word count etc.; counts may be given in 万.
    basic_info_list = response.xpath("//div[@class='explain_box']/p").extract()
    basic_info_str = ''.join(basic_info_list)
    words = re.findall(r'数:(\d+[.]*\d+)', basic_info_str)
    suffix = 1
    if '万' in basic_info_str:
        suffix = 10000
    if words:
        words = int(float(words[0]) * suffix)
    else:
        words = ''
    item['words'] = str(words)
    item['pages'] = ''
    item['language'] = ''
    price_str = response.xpath("//div[@class='cost_box']/p").extract_first()
    price = re.findall(r'\d+[.]*\d+', price_str)
    if not price:
        price = ['0']
    item['price'] = price[0]
    item['format'] = ''
    item['papermeter'] = ''
    item['packing'] = ''
    item['coverurl'] = response.xpath("//div[@class='bookCover_area']/img/@src").extract_first()
    item['seriename'] = ''
    catalog_list = response.xpath("//div[@id='catalog_title']//text()").extract()
    item['catalog'] = '<br>'.join(catalog_list)
    item['editorsugest'] = ''
    item['usersugest'] = ''
    item['preface'] = ''
    item['summary'] = ''
    item['epilogue'] = ''
    publishdate = re.findall(r'出版时间:([\d]{4}-[\d]{2})', basic_info_str)
    if not publishdate:
        publishdate = ['']
    publishdate = publishdate[0]
    item['publishdate'] = publishdate
    item['printedtime'] = publishdate
    item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item['orgcode'] = ''
    item['skuid'] = skuid
    commentcount = response.xpath("//div[@class='count_per']/em/text()").extract_first()
    if not commentcount:
        commentcount = ''
    commentcount = re.findall(r'\d+', commentcount)
    if not commentcount:
        commentcount = ['']
    item['commentcount'] = commentcount[0]
    item['_row'] = skuid + item['sourcetype']
    item['coverpath'] = '/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/' + item['_row'] + '.jpg'
    item['is_set'] = '否'
    item['ifimport'] = '0'
    item['url'] = response.url
    item['_entitycode'] = 'web_page_p_book_info_09'
    item['commentpercent'] = ''
    item['commenttag'] = ''
    item['authorintro'] = ''
    item['sourceprice'] = ''
    # 获取评论列表 — dicts from Dangdang's digest API.
    comments = self.get_comments(skuid)
    # 遍历评论列表 — a malformed comment dict just skips that comment.
    for comment in comments:
        comment_item = CommentItem()
        try:
            uri = ('http://e.dangdang.com/post_detail_page.html?barId='
                   + str(comment['barId'])
                   + '&digestId=' + str(comment['mediaDigestId']))
            comment_item['isbn'] = ''
            comment_item['uri'] = uri
            comment_item['bookname'] = bookname
            comment_item['sourcetype'] = '02'
            comment_item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # API timestamps are epoch milliseconds.
            publishdate_ts = comment['createDateLong'] / 1000
            publishdate_c = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publishdate_ts))
            comment_item['publishtime'] = publishdate_c
            comment_item['username'] = comment['userBaseInfo']['nickName']
            comment_item['hitcount'] = '0'
            comment_item['follownum'] = comment['commentNum']
            comment_item['suportnum'] = comment['commentStar']
            comment_item['opposnum'] = '0'
            comment_item['commentid'] = comment['mediaDigestId']
            comment_item['followcommentid'] = ''
            comment_item['commenttitle'] = ''
            comment_item['commenttype'] = '0'
            comment_item['comment'] = comment['content']
            comment_item['score'] = '5'
            comment_item['level'] = '0'
            comment_item['commpoint'] = ''
            comment_item['type'] = '02'
            comment_item['sitename'] = '当当'
            # Row key: skuid + source + time + truncated username hash.
            comment_item['_row'] = (skuid + comment_item['sourcetype']
                                    + comment_item['publishtime']
                                    + hashlib.md5(comment_item['username'].encode('utf-8')).hexdigest()[8:-8])
            comment_item['_entitycode'] = 'web_page_p_book_comment_09'
            comment_item['skuid'] = skuid
            yield comment_item
        except Exception:  # narrowed from a bare except; best-effort per comment
            continue
    # Normalise any falsy field back to '' before yielding.
    for item_key in item_list:
        if not item[item_key]:
            item[item_key] = ''
    yield item
def parse(self, response):
    """Parse a Dangdang paper-book detail page into a BookItem.

    Besides scraping the page, this also queries side-channel endpoints
    for description/comments/price/tags, inserts "also bought" URLs as
    new crawl tasks into the site_book table, and yields one CommentItem
    per review before the book item itself.
    """
    item = BookItem()
    # 将所有字段设为空串 — blank out every known field.
    for item_key in item_list:
        item[item_key] = ''
    item['is_set'] = '否'
    # FIX: is_set was only assigned inside the bad-ISBN branch below, so a
    # valid 13-char ISBN raised NameError at `if is_set == '否'`.
    # Initialise it up front, consistent with the sibling spiders.
    is_set = '否'
    # 抓取isbn — the 5th detail row looks like "国际标准书号ISBN:978...".
    try:
        isbn = response.xpath('//div[@id="detail_describe"]/ul/li[5]/text()').extract_first()
        isbn = isbn.split(':')[1]
    except Exception as e:
        self._logger.error(e)
        isbn = ''
    item['orgisbn'] = isbn
    # 如果isbn长度不是13位的话,置为空,不存进数据库
    if len(isbn) != 13:
        isbn = ''
        is_set = '是'
    item['isbn'] = isbn
    if is_set == '否':
        # 获得商品id和店铺id
        skuid = re.findall(r'\d+', response.url)[0]
        shopid = response.xpath("//p[@class='goto_shop']/a[1]/@href").extract_first().split('/')[-1]
        # 调用接口以获取动态加载的数据 — timed for monitoring.
        timemil_start = time.time()
        descrip_html = self.descrip_inter(skuid)
        comment_dict = self.comment_inter(skuid)
        price_dict = self.price_inter(skuid, shopid)
        tags = self.tag_inter(skuid)
        alsobuy_urls = self.alsobuy_inter(skuid, shopid)
        timemil_end = time.time()
        self._logger.info('解析url:' + response.url + ' ===>调取接口耗时:' + str(timemil_end - timemil_start) + ' s')
        # Register each "also bought" product as a new crawl task.
        for url_item in alsobuy_urls:
            ab_url = 'http://product.dangdang.com/' + url_item['productId'] + '.html'
            taskId = binascii.crc32((ab_url).encode())  # stable task id from the URL
            ab_taskname = url_item['productName']
            # 往site_book表中插入url任务
            sql = '''insert into site_book(siteId,taskId,taskName,taskCode,startUrl,requestTimes,pollPeriod,autorun,status,crawlTime,maxDepth,threadNum,sleepTime,saveTime,newsType,rollUnit) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
            params = (530701699, taskId, ab_taskname, '20', ab_url, 3, 86400, 1, 2, '2016-01-01 00:00:00', 3, 10, 100, datetime.datetime.now(), '0', '1')
            try:
                self.cursor.execute(sql, params)
                self.db.commit()
                self._logger.info('插入任务:taskId为 ' + str(taskId) + ' url为 ' + ab_url)
            except Exception as e:
                # Deliberate best-effort: duplicate taskIds are expected to
                # fail; the error is intentionally swallowed.
                pass
        item['is_set'] = '否'
        bookname = response.xpath('//div[@id="product_info"]/div[1]/h1/@title').extract_first()
        item['bookname'] = bookname
        subhead = response.xpath("//span[@class='head_title_name']/@title").extract_first()
        if not subhead:
            subhead = ''
        item['subhead'] = subhead
        publisher = response.xpath('//div[@id="product_info"]/div[2]/span[2]/a/text()').extract_first()
        item['publisher'] = publisher
        item['orgpublisher'] = publisher
        item['contentsummary'] = self.packing_descrip(descrip_html, 'content')
        item['editorsugest'] = self.packing_descrip(descrip_html, 'abstract')
        item['sourcetype'] = '02'  # '02' = Dangdang source code
        # Split the byline into authors vs translators: names stay authors
        # while consecutive entries are comma-separated; the first other
        # separator (e.g. 著/译) switches the remainder to translators.
        try:
            author_klist = response.xpath('//span[@id="author"]/text()').extract()
            author_list = response.xpath('//a[@dd_name="作者"]/text()').extract()
            author = []
            translator = []
            flag = True
            for index, k in enumerate(author_klist):
                if flag:
                    author.append(author_list[index])
                    next_index = index + 1
                    if next_index == len(author_klist):
                        continue
                    if author_klist[next_index] != ',' and author_klist[next_index] != ',':
                        flag = False
                else:
                    if index >= len(author_list):
                        break
                    translator.append(author_list[index])
            author = '#'.join(author)
            translator = '#'.join(translator)
        except Exception as e:
            self._logger.error(e)
            # Fallback: attribute the work to the publisher.
            author = item['publisher']
            translator = ''
        item['author'] = author
        item['translator'] = translator
        item['salecategory'] = ''
        item['category'] = ''
        item['orgcategory'] = ''
        contenttype = response.xpath('//li[@id="detail-category-path"]/span/a/text()').extract()
        item['contenttype'] = ','.join(contenttype)
        item['issuearea'] = '0'
        item['type'] = '01'
        # 版次 (edition)
        item['edition'] = ''
        # 印次 (impression)
        item['impression'] = ''
        item['words'] = ''
        item['pages'] = ''
        item['language'] = ''
        item['price'] = price_dict['price']
        # "出版时间:2019年05月" → "2019-05".
        printedtime = response.xpath('//div[@id="product_info"]/div[2]/span[3]/text()').extract_first()
        if printedtime:
            printedtime = printedtime.strip()
            printedtime = printedtime[5:-1].replace('年', '-')
        else:
            printedtime = ''
        item['printedtime'] = printedtime
        # Detail rows are "label:value"; [4:] strips the 4-char label prefix.
        format = response.xpath('//div[@id="detail_describe"]/ul/li[1]/text()').extract_first()[4:]
        item['format'] = format
        papermeter = response.xpath('//div[@id="detail_describe"]/ul/li[2]/text()').extract_first()[4:]
        item['papermeter'] = papermeter
        packing = response.xpath('//div[@id="detail_describe"]/ul/li[3]/text()').extract_first()[4:]
        item['packing'] = packing
        coverurl = response.xpath('//img[@id="largePic"]/@src').extract_first()
        item['coverurl'] = coverurl
        item['seriename'] = ''
        item['catalog'] = self.packing_descrip(descrip_html, 'catalog')
        item['usersugest'] = self.packing_descrip(descrip_html, 'mediaFeedback')
        item['preface'] = self.packing_descrip(descrip_html, 'preface')
        item['summary'] = self.packing_descrip(descrip_html, 'extract')
        item['epilogue'] = ''
        item['publishdate'] = printedtime
        item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['orgcode'] = ''
        item['skuid'] = skuid
        item['_row'] = skuid + '02'
        item['coverpath'] = '/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/' + item['_row'] + '.jpg'
        item['commentcount'] = comment_dict['commentcount']
        item['ifimport'] = '0'
        item['url'] = response.url
        item['_entitycode'] = 'web_page_p_book_info_09'
        item['commentpercent'] = comment_dict['commentpercent']
        item['commenttag'] = tags
        item['authorintro'] = self.packing_descrip(descrip_html, 'authorIntroduction')
        item['sourceprice'] = price_dict['sourceprice']
        # Comment nodes come from comment_inter(); their .xpath() calls
        # return plain string lists (presumably lxml) — TODO confirm.
        comments = comment_dict['comments']
        if comments:
            for comment in comments:
                try:
                    citem = CommentItem()
                    citem['isbn'] = isbn
                    uri = comment.xpath('./div[1]/div[2]//a/@href')
                    if not uri:
                        uri = [response.url]
                    citem['uri'] = ''.join(uri)
                    citem['bookname'] = bookname
                    citem['sourcetype'] = '02'
                    citem['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    publishtime = comment.xpath('./div[1]/div[4]/span[1]/text()')
                    if not publishtime:
                        # Skip undatable comments: the row key needs a time.
                        continue
                    publishtime = ''.join(publishtime)
                    citem['publishtime'] = publishtime
                    username = comment.xpath('./div[2]/span[1]/text()')
                    if not username:
                        username = ['无昵称用户']
                    username = ''.join(username)
                    citem['username'] = username
                    citem['hitcount'] = '0'
                    citem['follownum'] = '0'
                    suportnum = comment.xpath('./div[1]/div[5]/a[1]/text()')
                    suportnum = ''.join(suportnum)
                    if suportnum == '赞':
                        # Bare "赞" label means no votes yet.
                        suportnum = '0'
                    citem['suportnum'] = suportnum
                    citem['opposnum'] = '0'
                    # Synthesize a comment id from user + time.
                    commentid = str(binascii.crc32((username + publishtime).encode()))
                    citem['commentid'] = commentid
                    citem['followcommentid'] = '-1'
                    citem['commenttitle'] = ''
                    citem['commenttype'] = '0'
                    commentcontent = comment.xpath('./div[1]/div[2]//a/text()')
                    citem['comment'] = ''.join(commentcontent)
                    # Score shown as e.g. "10分"; halve to a 5-point scale.
                    score = comment.xpath('./div[1]/div[1]/em/text()')
                    score = ''.join(score)
                    if not score:
                        score = '5'
                    score = score[:-1]
                    score = int(score) / 2
                    citem['score'] = str(score)
                    if score < 2:
                        citem['level'] = '2'
                    elif score < 4:
                        citem['level'] = '1'
                    else:
                        citem['level'] = '0'
                    citem['commpoint'] = ''
                    citem['type'] = '01'
                    citem['sitename'] = '当当'
                    citem['_row'] = citem['isbn'] + citem['sourcetype'] + citem['publishtime'] + hashlib.md5(citem['username'].encode('utf-8')).hexdigest()[8:-8]
                    citem['_entitycode'] = 'web_page_p_book_comment_09'
                    citem['skuid'] = skuid
                    # Normalise falsy fields back to '' before yielding.
                    for citem_key in citem_list:
                        if not citem[citem_key]:
                            citem[citem_key] = ''
                    yield citem
                except Exception as e:
                    self._logger.error(e)
                    continue
        # Normalise falsy item fields back to '' before yielding.
        for item_key in item_list:
            if not item[item_key]:
                item[item_key] = ''
        yield item