Beispiel #1
0
    def parse(self, response):
        """Parse a Douban book detail page.

        Fills a BookItem from the basic-info table, content summary,
        cover and review counts, yields follow-up requests for the
        long/short review pages and related books, and finally yields
        the populated item itself.
        """
        self._logger.info("解析url:" + response.url)
        douban_item = BookItem()
        # Initialise every field to '' so the pipeline never sees a
        # missing key.  (Renamed from `list`, which shadowed the builtin.)
        field_names = ['preface', 'catalog', 'translator', 'isbn', 'subhead', 'edition', 'language', 'orgcategory', 'type',
                       'packing', 'seriename', 'coverurl', 'coverpath', 'pages', 'epilogue', 'price', 'publishdate',
                       'sourcetype', 'editorsugest', 'papermeter', 'printedtime', 'summary', 'orgisbn', 'author', 'usersugest',
                       'orgpublisher', 'words', 'format', 'issuearea', 'contenttype', 'contentsummary',
                       'salecategory', 'publisher', 'impression', 'bookname', 'category', 'collectiontime', 'orgcode', 'skuid',
                       'commentcount', 'ifimport', '_row', '_entitycode', 'url', 'commentpercent', 'commenttag', 'authorintro', 'sourceprice']
        for item_key in field_names:
            douban_item[item_key] = ''
        douban_item['bookname'] = response.xpath("//h1/span/text()").extract_first()
        # The #info block holds "key: value" rows; keys live in <span>
        # elements, values in bare text nodes and links.
        selector_info = response.xpath("//div[@id='info']")
        key_list = response.xpath("//div[@id='info']//span/text()").extract()
        info_list = selector_info.xpath('./text()|a/text()|span/a/text()').extract()
        key_list = remove_meaningless_str(key_list)
        key_list = merge_key(key_list)
        info_list = remove_meaningless_str(info_list)
        info_list = merge_info(info_list)
        # More than 11 keys means the key/value merge heuristics broke on
        # this page layout -- log for manual inspection.
        if len(key_list) > 11:
            self._logger.error('出现封装字段数大于11的图书:' + response.url)
            self._logger.error(key_list)
        if len(key_list) == len(info_list):
            douban_item = packing_info(douban_item, key_list, info_list)
        else:
            self._logger.error('基础信息封装代码出现BUG:' + response.url)
        # Only books with a valid 13-digit ISBN are processed further.
        is_set = '否'
        if not douban_item['isbn'] or len(douban_item['isbn']) != 13:
            is_set = '是'
        if is_set == '否':
            # Content summary.
            contentsummary_selector_list = response.xpath("//div[@id='link-report']//div[@class='intro']")
            douban_item['contentsummary'] = packing_content(contentsummary_selector_list, -1)
            douban_item['sourcetype'] = '03'
            douban_item['salecategory'] = ''
            douban_item['category'] = ''
            douban_item['orgcategory'] = ''
            contenttype = response.xpath("//a[@class='  tag']/text()").extract()
            douban_item['contenttype'] = ','.join(contenttype)
            douban_item['issuearea'] = ''
            douban_item['type'] = '01'
            douban_item['edition'] = ''
            douban_item['impression'] = ''
            douban_item['words'] = ''
            douban_item['language'] = ''
            douban_item['printedtime'] = ''
            douban_item['format'] = ''
            douban_item['papermeter'] = ''
            # Cover url.
            douban_item['coverurl'] = response.xpath("//a[@class='nbg']/@href").extract_first()
            # Local path the cover image will be saved under.
            today_str = datetime.datetime.now().strftime('%Y%m%d')
            sku_id = re.findall(r"\d+", response.url)[0]
            douban_item['coverpath'] = '/book/' + today_str + '/' + '03' + douban_item['isbn'] + '.png'
            # Table of contents (the full-catalog div id embeds the sku id).
            catalog_selector_list = response.xpath("//div[@id='dir_" + sku_id + "_full']")
            douban_item['catalog'] = packing_content(catalog_selector_list)

            douban_item['editorsugest'] = ''
            douban_item['usersugest'] = ''
            douban_item['preface'] = ''
            douban_item['summary'] = ''
            douban_item['epilogue'] = ''
            # Collection timestamp.
            douban_item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            douban_item['orgcode'] = ''

            douban_item['skuid'] = sku_id
            # Sum the counts embedded in the review-section headings.
            # extract() always returns a list, so no None check is needed
            # (the original compared it against None with `!=`).
            comment_count = 0
            for label in response.xpath("//h2/span[@class='pl']/a/text()").extract():
                digits = re.findall(r"\d+", label)
                comment_count += int(digits[0]) if digits else 0
            douban_item['commentcount'] = str(comment_count)

            douban_item['ifimport'] = '0'
            douban_item['_row'] = douban_item['skuid'] + '03'
            douban_item['_entitycode'] = 'web_page_p_book_info_09'
            douban_item['is_set'] = '否'
            douban_item['url'] = response.url
            douban_item['commentpercent'] = ''
            douban_item['commenttag'] = ''
            douban_item['authorintro'] = ''
            douban_item['sourceprice'] = ''

            # Long-review page links.
            for long_comment_link in response.xpath("//div[@class='main-bd']/h2/a/@href").extract():
                yield scrapy.Request(long_comment_link, meta={'douban_item': douban_item}, callback=self.parse_long_comment)
            # Short-review page link.
            short_comment_link = response.xpath("//div[@class='related_info']/p/a/@href").extract_first()
            if short_comment_link:
                yield scrapy.Request(short_comment_link, meta={'douban_item': douban_item}, callback=self.parse_short_comment)
            yield douban_item

            # Follow links to related books.  (The commented-out DB-based
            # dedup from the original was dead code and has been removed;
            # Scrapy's own dupefilter still deduplicates requests.)
            for similarity_url in response.xpath("//dd//a/@href").extract():
                yield scrapy.Request(similarity_url)
    def parse(self, response):
        """Parse an Amazon book detail page.

        Fills a BookItem from the product page plus the description
        fragment loaded via a separate request, yields one CommentItem
        per on-page review, and finally yields the book item itself.
        """
        item = BookItem()
        for item_key in item_list:
            item[item_key] = ''
        item['is_set'] = '否'
        is_set = '否'
        # When the ISBN field lists several candidates, pick the 13-digit one.
        isbn = self.get_basicinfo(response, 'ISBN')
        isbn_list = isbn.split(',')
        if len(isbn_list) == 1:
            isbn = isbn_list[0]
        elif len(isbn_list) > 1:
            for candidate in isbn_list:
                candidate = candidate.strip()
                if len(candidate) == 13:
                    isbn = candidate
        if not isbn:
            isbn = ''
        if is_set == '否':
            skuid = self.get_basicinfo(response, 'ASIN')
            # Product-description fragment (separate request).
            html = self.get_content_and_cate(skuid)
            bookname = response.xpath("//h1/span[@id='productTitle']/text()").extract_first()
            # Guard: extract_first() returns None when the node is missing.
            bookname = (bookname or '').strip()
            item['bookname'] = bookname
            item['subhead'] = ''
            publisher_str = self.get_basicinfo(response, '出版社')
            publisher = publisher_str.split(';')[0].strip()
            item['publisher'] = publisher
            item['orgpublisher'] = publisher
            contentsummary = response.xpath("//noscript/div/text()").extract()
            item['contentsummary'] = ''.join(contentsummary)
            item['sourcetype'] = '05'
            author_list = response.xpath("//div[@id='bylineInfo']/span[1]/a/text()").extract()
            item['author'] = '#'.join(author_list)
            translator_list = response.xpath("//div[@id='bylineInfo']/span[2]/a/text()").extract()
            item['translator'] = '#'.join(translator_list)
            item['isbn'] = isbn
            item['orgisbn'] = isbn
            item['salecategory'] = ''
            item['category'] = ''
            item['orgcategory'] = ''
            contenttype_list = response.xpath("//div[@id='wayfinding-breadcrumbs_feature_div']//span[@class='a-list-item']/a/text()").extract()
            item['contenttype'] = ','.join(c.strip() for c in contenttype_list)
            item['issuearea'] = ''
            item['type'] = '01'
            packing = response.xpath("//h1/span[2]/text()").extract_first()
            # Edition number like "第2版" embedded in the publisher string.
            edition = re.findall(r'第(\d+)版', publisher_str)
            item['edition'] = edition[0] if edition else ''
            item['impression'] = ''
            item['words'] = ''
            # NOTE(review): the binding text (e.g. "平装") is used as the
            # lookup key for the page-count row -- confirm against the
            # detail-table layout.
            pages = re.findall(r'\d+', self.get_basicinfo(response, packing))
            item['pages'] = pages[0] if pages else ''

            item['language'] = self.get_basicinfo(response, '语种')
            price = response.xpath("//div[@id = 'buyBoxInner']/ul/li/span/span[2]/text()").extract_first()
            # Guard: buy box may be absent (None) or carry no digits; the
            # original crashed on both cases.
            price = re.findall(r'\d+[.]*\d+', price or '')
            item['price'] = price[0] if price else ''
            item['format'] = self.get_basicinfo(response, '开本')
            item['papermeter'] = ''
            item['packing'] = packing
            item['coverurl'] = response.xpath("//div[@id = 'img-canvas']/img/@src").extract_first()
            item['seriename'] = ''
            item['catalog'] = self.parse_desc(html, '目录')
            item['editorsugest'] = self.parse_desc(html, '编辑推荐')
            item['usersugest'] = self.parse_desc(html, '名人推荐')
            item['preface'] = ''
            item['summary'] = self.parse_desc(html, '文摘')
            item['epilogue'] = ''
            publishdate = response.xpath("//h1/span[3]/text()").extract_first() or ''
            if len(publishdate) > 7:
                # Normalise "YYYY年M月" to "YYYY-M"; keep the raw text when
                # the pattern does not match (the original crashed here).
                pub_list = re.findall(r'(\d+)年(\d+)月', publishdate)
                if pub_list:
                    publishdate = '-'.join(pub_list[0])
            item['publishdate'] = publishdate
            item['printedtime'] = publishdate
            item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['orgcode'] = ''
            item['skuid'] = skuid
            commentcount = response.xpath("//span[@id='acrCustomerReviewText']/text()").extract_first()
            if not commentcount:
                commentcount = '0'
            # Strip thousands separators: join all digit runs, "1,234" -> "1234".
            # (Replaces the degenerate pattern '(\d+)*' of the original.)
            item['commentcount'] = ''.join(re.findall(r'\d+', commentcount))
            item['_row'] = skuid + item['sourcetype']
            item['coverpath'] = '/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/' + item['_row'] + '.jpg'
            item['is_set'] = '否'
            item['ifimport'] = '0'
            item['url'] = response.url
            item['_entitycode'] = 'web_page_p_book_info_09'
            item['commentpercent'] = ''
            try:
                tag_resp = self.get_commenttag(skuid)
                commenttag = tag_resp.xpath("//span/@data-cr-trigger-on-view")
                commenttag = json.loads(commenttag[0])
                commenttag = commenttag['ajaxParamsMap']['lighthouseTerms'].replace('/', '#')
            except Exception:
                # Best effort: tags are optional, keep scraping on failure.
                commenttag = ''
            item['commenttag'] = commenttag
            item['authorintro'] = self.parse_desc(html, '作者简介')
            sourceprice = response.xpath("//div[@id='soldByThirdParty']/span[2]/text()").extract_first()
            # Guard: the third-party price node may be missing (None).
            sourceprice = re.findall(r'\d+[.]*\d+', sourceprice or '')
            item['sourceprice'] = sourceprice[0] if sourceprice else ''

            # The publication date is a page-level value -- hoisted out of
            # the per-review loop (it was re-queried for every comment).
            publishdate_c = response.xpath("//h1/span[3]/text()").extract_first()
            if not publishdate_c:
                publishdate_c = ''
            else:
                pub_list = re.findall(r'(\d+)年(\d+)月(\d+)日', publishdate_c)
                publishdate_c = '-'.join(pub_list[0]) if pub_list else ''

            # Walk the on-page review list.
            comments = response.xpath("//div[@id='cm-cr-dp-review-list']/div")
            for comment in comments:
                comment_item = CommentItem()
                comment_item['isbn'] = isbn
                comment_item['uri'] = response.url
                comment_item['bookname'] = bookname
                comment_item['sourcetype'] = item['sourcetype']
                comment_item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                comment_item['publishtime'] = publishdate_c
                username = comment.xpath("./div/div[1]/a/div/span/text()").extract_first()
                comment_item['username'] = username or ''
                comment_item['hitcount'] = '0'
                comment_item['follownum'] = '0'
                suportnum = comment.xpath("./div/div[7]/span/div/span[@data-hook='helpful-vote-statement']/text()").extract_first()
                if not suportnum:
                    suportnum = '0'
                comment_item['suportnum'] = re.findall(r'\d+', suportnum)[0]
                comment_item['opposnum'] = '0'
                comment_item['commentid'] = comment.xpath("./@id").extract_first()
                comment_item['followcommentid'] = ''
                commenttitle = comment.xpath(".//a[@data-hook='review-title']/text()").extract_first()
                comment_item['commenttitle'] = commenttitle or ''
                comment_item['commenttype'] = '0'
                comment_strs = comment.xpath(".//div[@data-hook='review-collapsed']/text()").extract()
                comment_item['comment'] = ''.join(comment_strs)
                # BUG FIX: the original used an absolute xpath here, so every
                # comment received the FIRST review's score; query relative to
                # this comment instead.  NOTE(review): confirm the relative
                # path against the live review markup.
                score = comment.xpath("./div[1]/div[2]/a/@title").extract_first()
                if not score:
                    # Default rating as a STRING ('5.0'); the original assigned
                    # a list, which crashed the regex below.
                    score = '5.0'
                score = re.findall(r'\d.\d', score)[0]
                score = score[:1]
                comment_item['score'] = score
                # Map the star rating to a sentiment level: <2 negative,
                # <4 neutral, otherwise positive.
                score = float(score)
                if score < 2:
                    level = '2'
                elif score < 4:
                    level = '1'
                else:
                    level = '0'
                comment_item['level'] = level
                comment_item['commpoint'] = ''
                comment_item['type'] = '01'
                comment_item['sitename'] = '亚马逊'
                comment_item['_row'] = comment_item['isbn'] + comment_item['sourcetype'] + comment_item['publishtime'] + comment_item['commentid']
                comment_item['_entitycode'] = 'web_page_p_book_comment_09'
                comment_item['skuid'] = skuid
                yield comment_item
            yield item
Beispiel #3
0
    def parse(self, response):
        """Parse a JD (京东) book detail page.

        Fills a BookItem from the page plus the description/price/comment
        APIs, yields one CommentItem per review returned by the comment
        API, and finally yields the book item itself.
        """
        item = BookItem()
        for item_key in item_list:
            item[item_key] = ''
        item['is_set'] = '否'
        is_set = '否'
        # Only books with a valid 13-digit ISBN are processed.
        isbn = self.get_basicinfo(response, 'ISBN')
        if len(isbn) != 13:
            isbn = ''
            is_set = '是'
        if is_set == '否':
            # The sku id is the final path segment of the product url.
            skuid = response.url.split('/')[-1].replace('.html', '')
            # Product-description fragment (separate request).
            html = self.get_content_and_cate(skuid)
            # Price API.
            sourceprice, price = self.get_price(skuid)
            # Comment API: reviews, total count, positive rate and tags.
            comments, commentcount, commentpercent, commenttag = self.get_comment(
                skuid)
            bookname = response.xpath(
                "//div[@class='sku-name']/text()").extract_first()
            # Guard: extract_first() returns None when the node is missing.
            bookname = (bookname or '').strip()
            item['bookname'] = bookname
            item['subhead'] = ''
            item['publisher'] = self.get_basicinfo(response, '出版社')
            item['orgpublisher'] = self.get_basicinfo(response, '出版社')
            contentsummary = self.parse_desc(html, '内容简介')
            item['contentsummary'] = ''.join(contentsummary)
            item['sourcetype'] = '01'
            author_list = response.xpath(
                "//div[@class='p-author']/a/@data-name").extract()
            item['author'] = '#'.join(author_list)
            item['translator'] = ''
            item['isbn'] = isbn
            item['orgisbn'] = isbn
            item['salecategory'] = ''
            item['category'] = ''
            item['orgcategory'] = ''
            brand = self.get_basicinfo(response, '品牌')
            contenttype_list = response.xpath(
                "//div[@class='crumb fl clearfix']/div[@class='item']/a/text()"
            ).extract()
            # Drop the brand crumb from the category path when present.
            try:
                contenttype_list.remove(brand)
            except ValueError:
                pass
            item['contenttype'] = ','.join(contenttype_list)
            item['issuearea'] = ''
            item['type'] = '01'
            item['edition'] = self.get_basicinfo(response, '版次')
            item['impression'] = ''
            item['words'] = self.get_basicinfo(response, '字数')
            # BUG FIX: the original assigned the empty fallback to `page`
            # (typo), so a missing page count crashed on pages[0].
            pages = re.findall(r'\d+', self.get_basicinfo(response, '页数'))
            item['pages'] = pages[0] if pages else ''

            item['language'] = self.get_basicinfo(response, '正文语种')
            item['price'] = price

            item['format'] = self.get_basicinfo(response, '开本')
            item['papermeter'] = self.get_basicinfo(response, '用纸')
            item['packing'] = self.get_basicinfo(response, '包装')
            # Guard: cover image node may be missing (None).
            coverurl = response.xpath(
                "//div[@id= 'spec-n1']/img/@src").extract_first()
            item['coverurl'] = 'http:' + coverurl if coverurl else ''
            item['seriename'] = self.get_basicinfo(response, '丛书名')
            item['catalog'] = self.parse_desc(html, '目录')
            item['editorsugest'] = self.parse_desc(html, '编辑推荐')
            item['usersugest'] = self.parse_desc(html, '精彩书评')
            item['preface'] = self.parse_desc(html, '前言/序言')
            item['summary'] = self.parse_desc(html, '精彩书摘')
            item['epilogue'] = ''
            publishdate = self.get_basicinfo(response, '出版时间')
            if not publishdate:
                publishdate = ''
            # Trim a full "YYYY-MM-DD" down to "YYYY-MM".
            if len(publishdate) > 7:
                publishdate = publishdate[:publishdate.rfind('-')]

            item['publishdate'] = publishdate
            item['printedtime'] = publishdate
            item['collectiontime'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            item['orgcode'] = ''
            item['skuid'] = skuid
            item['commentcount'] = str(commentcount)
            item['_row'] = skuid + '01'
            item['coverpath'] = '/book/' + datetime.datetime.now().strftime(
                '%Y%m%d') + '/' + item['_row'] + '.jpg'
            item['is_set'] = '否'
            item['ifimport'] = '0'
            item['url'] = response.url
            item['_entitycode'] = 'web_page_p_book_info_09'
            item['commentpercent'] = commentpercent
            item['commenttag'] = commenttag
            item['authorintro'] = self.parse_desc(html, '作者简介')
            item['sourceprice'] = sourceprice

            # Walk the reviews returned by the comment API (dicts).
            if comments:
                for comment in comments:
                    comment_item = CommentItem()
                    comment_item['isbn'] = isbn
                    comment_item['uri'] = response.url
                    comment_item['bookname'] = bookname
                    comment_item['sourcetype'] = '01'
                    comment_item['collectiontime'] = datetime.datetime.now(
                    ).strftime('%Y-%m-%d %H:%M:%S')
                    comment_item['publishtime'] = comment['creationTime']
                    # JD anonymises reviewer names.
                    comment_item['username'] = '******'
                    comment_item['hitcount'] = '0'
                    follownum = str(comment['replyCount'])
                    comment_item['follownum'] = follownum or '0'
                    suportnum = str(comment['usefulVoteCount'])
                    comment_item['suportnum'] = suportnum or '0'
                    comment_item['opposnum'] = '0'
                    comment_item['commentid'] = str(comment['id'])
                    comment_item['followcommentid'] = ''
                    comment_item['commenttitle'] = ''
                    comment_item['commenttype'] = '0'
                    comment_item['comment'] = comment['content']
                    score = str(comment['score'])
                    if not score:
                        score = '5'
                    comment_item['score'] = score
                    # Map the star rating to a sentiment level: <2 negative,
                    # <4 neutral, otherwise positive.
                    score = int(score)
                    if score < 2:
                        level = '2'
                    elif score < 4:
                        level = '1'
                    else:
                        level = '0'
                    comment_item['level'] = level
                    comment_item['commpoint'] = ''
                    comment_item['type'] = '01'
                    comment_item['sitename'] = '京东'
                    comment_item['_row'] = comment_item['isbn'] + comment_item[
                        'sourcetype'] + comment_item[
                            'publishtime'] + comment_item['username']
                    comment_item['_entitycode'] = 'web_page_p_book_comment_09'
                    comment_item['skuid'] = skuid
                    yield comment_item
            yield item
Beispiel #4
0
    def parse(self, response):
        """Parse a Dangdang e-book detail page.

        Fills a BookItem from the page, yields one CommentItem per post
        returned by the comment API, and finally yields the book item.
        """
        item = BookItem()
        for item_key in item_list:
            item[item_key] = ''
        is_set = '否'
        item['is_set'] = is_set
        # The sku id is the final path segment of the product url.
        skuid = response.url.split('/')[-1].replace('.html', '')

        bookname = response.xpath(
            "//span[@class='title_words']/@title").extract_first()
        # Guard: extract_first() returns None when the node is missing.
        bookname = (bookname or '').strip()
        item['bookname'] = bookname
        item['subhead'] = response.xpath(
            "//p[@class='title_descript']/@title").extract_first()
        item['publisher'] = response.xpath(
            "//p[@id='publisher']//a/text()").extract_first()
        item['orgpublisher'] = response.xpath(
            "//p[@id='publisher']//a/text()").extract_first()
        contentsummary = response.xpath(
            "//div[@class='newEdit_box']//text()").extract()
        item['contentsummary'] = '<br>'.join(contentsummary)
        item['sourcetype'] = '02'
        # Normalise author separators ('、' -> ',') then join with '#'.
        authors = response.xpath("//p[@id='author']//a/text()").extract_first()
        authors = (authors or '').replace('、', ',')
        item['author'] = '#'.join(authors.split(','))
        item['translator'] = ''
        item['isbn'] = ''
        item['orgisbn'] = ''
        item['salecategory'] = ''
        item['category'] = ''
        item['orgcategory'] = ''
        contenttype_list = response.xpath(
            "//div[@id='crumb']/a/text()").extract()
        # Clean each crumb and drop the one equal to the book title.
        # BUG FIX: the original popped from the list while iterating it,
        # which skipped the element after every removal.
        contenttype_list = [
            ct.replace('>', '').strip() for ct in contenttype_list
        ]
        contenttype_list = [ct for ct in contenttype_list if ct != bookname]
        item['contenttype'] = ','.join(contenttype_list)
        item['issuearea'] = ''
        item['type'] = '02'
        item['edition'] = ''
        item['impression'] = ''
        basic_info_list = response.xpath(
            "//div[@class='explain_box']/p").extract()
        basic_info_str = ''.join(basic_info_list)
        words = re.findall(r'数:(\d+[.]*\d+)', basic_info_str)
        # Word counts may be quoted in units of 万 (10,000).
        suffix = 10000 if '万' in basic_info_str else 1
        if words:
            words = int(float(words[0]) * suffix)
        else:
            words = ''
        item['words'] = str(words)
        item['pages'] = ''
        item['language'] = ''
        # Guard: the price box may be missing (extract_first -> None).
        price_str = response.xpath(
            "//div[@class='cost_box']/p").extract_first()
        price = re.findall(r'\d+[.]*\d+', price_str or '')
        item['price'] = price[0] if price else '0'
        item['format'] = ''
        item['papermeter'] = ''
        item['packing'] = ''
        item['coverurl'] = response.xpath(
            "//div[@class='bookCover_area']/img/@src").extract_first()
        item['seriename'] = ''
        catalog_list = response.xpath(
            "//div[@id='catalog_title']//text()").extract()
        item['catalog'] = '<br>'.join(catalog_list)
        item['editorsugest'] = ''
        item['usersugest'] = ''
        item['preface'] = ''
        item['summary'] = ''
        item['epilogue'] = ''
        publishdate = re.findall(r'出版时间:([\d]{4}-[\d]{2})', basic_info_str)
        item['publishdate'] = publishdate[0] if publishdate else ''
        item['printedtime'] = item['publishdate']
        item['collectiontime'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item['orgcode'] = ''
        item['skuid'] = skuid
        commentcount = response.xpath(
            "//div[@class='count_per']/em/text()").extract_first()
        commentcount = re.findall(r'\d+', commentcount or '')
        item['commentcount'] = commentcount[0] if commentcount else ''
        item['_row'] = skuid + item['sourcetype']
        item['coverpath'] = '/book/' + datetime.datetime.now().strftime(
            '%Y%m%d') + '/' + item['_row'] + '.jpg'
        item['is_set'] = '否'
        item['ifimport'] = '0'
        item['url'] = response.url
        item['_entitycode'] = 'web_page_p_book_info_09'
        item['commentpercent'] = ''
        item['commenttag'] = ''
        item['authorintro'] = ''
        item['sourceprice'] = ''
        # Fetch the comment list from the API (dicts).
        comments = self.get_comments(skuid)

        # Walk the comment list; skip (rather than abort on) malformed posts.
        for comment in comments:
            comment_item = CommentItem()
            try:
                uri = 'http://e.dangdang.com/post_detail_page.html?barId=' + str(
                    comment['barId']) + '&digestId=' + str(
                        comment['mediaDigestId'])
                comment_item['isbn'] = ''
                comment_item['uri'] = uri
                comment_item['bookname'] = bookname
                comment_item['sourcetype'] = '02'
                comment_item['collectiontime'] = datetime.datetime.now(
                ).strftime('%Y-%m-%d %H:%M:%S')
                # API timestamps are in milliseconds.
                publishdate_ts = comment['createDateLong'] / 1000
                comment_item['publishtime'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(publishdate_ts))
                comment_item['username'] = comment['userBaseInfo']['nickName']
                comment_item['hitcount'] = '0'
                comment_item['follownum'] = comment['commentNum']
                comment_item['suportnum'] = comment['commentStar']
                comment_item['opposnum'] = '0'
                comment_item['commentid'] = comment['mediaDigestId']
                comment_item['followcommentid'] = ''
                comment_item['commenttitle'] = ''
                comment_item['commenttype'] = '0'
                comment_item['comment'] = comment['content']
                comment_item['score'] = '5'
                comment_item['level'] = '0'
                comment_item['commpoint'] = ''
                comment_item['type'] = '02'
                comment_item['sitename'] = '当当'
                # Row key: sku + sourcetype + time + truncated md5 of the
                # username (keeps the key length bounded).
                comment_item['_row'] = skuid + comment_item[
                    'sourcetype'] + comment_item['publishtime'] + hashlib.md5(
                        comment_item['username'].encode(
                            'utf-8')).hexdigest()[8:-8]
                comment_item['_entitycode'] = 'web_page_p_book_comment_09'
                comment_item['skuid'] = skuid
                yield comment_item
            except Exception:
                # Best effort: a single bad comment must not kill the page.
                continue
        # Ensure no field is left falsy (e.g. None from a failed xpath).
        for item_key in item_list:
            if not item[item_key]:
                item[item_key] = ''
        yield item
Beispiel #5
0
    def parse(self, response):
        item = BookItem()
        # 将所有字段设为空串
        for item_key in item_list:
            item[item_key] = ''
        item['is_set'] = '否'
        # 抓取isbn
        try:
            isbn = response.xpath('//div[@id="detail_describe"]/ul/li[5]/text()').extract_first()
            isbn = isbn.split(':')[1]
        except Exception as e:
            self._logger.error(e)
            isbn = ''
        item['orgisbn'] = isbn
        # 如果isbn长度不是13位的话,置为空,不存进数据库
        if len(isbn) != 13:
            isbn = ''
            is_set = '是'
        item['isbn'] = isbn
        if is_set == '否' :
            # 获得商品id和店铺id
            skuid = re.findall('\d+', response.url)[0]
            shopid = response.xpath("//p[@class='goto_shop']/a[1]/@href").extract_first().split('/')[-1]

            # 调用接口以获取动态加载的数据
            timemil_start = time.time()
            descrip_html = self.descrip_inter(skuid)
            comment_dict = self.comment_inter(skuid)
            price_dict = self.price_inter(skuid, shopid)
            tags = self.tag_inter(skuid)
            alsobuy_urls = self.alsobuy_inter(skuid, shopid)
            timemil_end = time.time()
            self._logger.info('解析url:'+response.url+'    ===>调取接口耗时:'+str(timemil_end-timemil_start)+' s')
            # Enqueue every "also-bought" related product as a new crawl task.
            # `alsobuy_urls` is built earlier in this method (not visible here) —
            # presumably a list of dicts from a Dangdang recommendation API.
            for url_item in alsobuy_urls:
                # ab_url = url_item.xpath("./@href").extract_first()
                # ab_url = 'http://product.dangdang.com/' + ab_url.split('#')[0]
                ab_url = 'http://product.dangdang.com/' + url_item['productId']+'.html'
                # CRC32 of the URL serves as a stable, deterministic task id.
                taskId = binascii.crc32((ab_url).encode())
                # ab_taskname = url_item.xpath("./img/@title").extract_first()
                ab_taskname = url_item['productName']
                # Insert a crawl task for the related-product URL into the site_book table.
                sql = '''insert into site_book(siteId,taskId,taskName,taskCode,startUrl,requestTimes,pollPeriod,autorun,status,crawlTime,maxDepth,threadNum,sleepTime,saveTime,newsType,rollUnit) 
                        values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                params = (530701699,taskId,ab_taskname,'20',ab_url,3,86400,1,2,'2016-01-01 00:00:00',3,10,100,datetime.datetime.now(),'0','1')
                try:
                    self.cursor.execute(sql, params)
                    self.db.commit()
                    self._logger.info('插入任务:taskId为 ' + str(taskId) + '  url为  ' + ab_url)
                except Exception as e:
                    # NOTE(review): swallows ALL insert errors silently — presumably
                    # intended only for duplicate-key rejects; confirm, and consider
                    # at least logging `e` so genuine DB failures are not hidden.
                    pass
            # ---- Populate the book item from the detail page ----
            item['is_set'] = '否'
            bookname = response.xpath('//div[@id="product_info"]/div[1]/h1/@title').extract_first()
            item['bookname'] = bookname
            subhead = response.xpath("//span[@class='head_title_name']/@title").extract_first()
            if not subhead:
                subhead = ''
            item['subhead'] = subhead
            publisher = response.xpath('//div[@id="product_info"]/div[2]/span[2]/a/text()').extract_first()
            item['publisher'] = publisher
            item['orgpublisher'] = publisher
            # `descrip_html` is fetched earlier (unseen); packing_descrip extracts
            # one named section from it.
            item['contentsummary'] = self.packing_descrip(descrip_html,'content')
            item['editorsugest'] = self.packing_descrip(descrip_html,'abstract')
            item['sourcetype'] = '02'
            # Split the author line into authors vs. translators: the span texts in
            # `author_klist` act as separators; consecutive comma separators mean we
            # are still inside the author group, anything else switches to translators.
            try:
                author_klist = response.xpath('//span[@id="author"]/text()').extract()
                author_list = response.xpath('//a[@dd_name="作者"]/text()').extract()
                author = []
                translator = []
                flag = True
                for index,k in enumerate(author_klist):
                    if flag:
                        author.append(author_list[index])
                        next_index = index+1
                        if next_index == len(author_klist):
                            continue
                        # Both half-width ',' and full-width ',' count as "more authors".
                        if author_klist[next_index] != ',' and author_klist[next_index] != ',':
                            flag = False
                    else:
                        if index >= len(author_list):
                            break
                        translator.append(author_list[index])
                author = '#'.join(author)
                translator = '#'.join(translator)
            except Exception as e:
                # Fall back to the publisher name when author parsing blows up.
                self._logger.error(e)
                author = item['publisher']
                translator = ''
            item['author'] = author
            item['translator'] = translator
            item['salecategory'] = ''
            item['category'] = ''
            item['orgcategory'] = ''
            # Breadcrumb category path, flattened to a comma-separated string.
            contenttype = response.xpath('//li[@id="detail-category-path"]/span/a/text()').extract()
            contenttype = ','.join(contenttype)
            item['contenttype'] = contenttype
            item['issuearea'] = '0'
            item['type'] = '01'
            # edition
            item['edition'] = ''
            # impression (print run)
            item['impression'] = ''
            item['words'] = ''
            item['pages'] = ''
            item['language'] = ''
            item['price'] = price_dict['price']
            printedtime = response.xpath('//div[@id="product_info"]/div[2]/span[3]/text()').extract_first()
            if printedtime:
                printedtime = printedtime.strip()
                # Drop the leading label (5 chars) and trailing unit char, then
                # normalize '年' to '-' — assumes a format like '出版时间:2016年01月'; TODO confirm.
                printedtime = printedtime[5:-1].replace('年', '-')
            else:
                printedtime = ''
            item['printedtime'] = printedtime
            # The [4:] slices strip a fixed-width Chinese label prefix from each
            # detail line (e.g. '开 本:') — will raise if the <li> is missing.
            format = response.xpath('//div[@id="detail_describe"]/ul/li[1]/text()').extract_first()[4:]
            item['format'] = format
            papermeter = response.xpath('//div[@id="detail_describe"]/ul/li[2]/text()').extract_first()[4:]
            item['papermeter'] = papermeter
            packing = response.xpath('//div[@id="detail_describe"]/ul/li[3]/text()').extract_first()[4:]
            item['packing'] = packing
            coverurl = response.xpath('//img[@id="largePic"]/@src').extract_first()
            item['coverurl'] = coverurl
            item['seriename'] = ''
            item['catalog'] = self.packing_descrip(descrip_html,'catalog')
            item['usersugest'] = self.packing_descrip(descrip_html,'mediaFeedback')
            item['preface'] = self.packing_descrip(descrip_html,'preface')
            item['summary'] = self.packing_descrip(descrip_html,'extract')
            item['epilogue'] = ''
            item['publishdate'] = printedtime
            item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['orgcode'] = ''
            item['skuid'] = skuid
            # Row key: sku id + source-type suffix '02' (Dangdang).
            item['_row'] =  skuid+'02'
            item['coverpath'] ='/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/'+item['_row'] + '.jpg'
            item['commentcount'] = comment_dict['commentcount']
            item['ifimport'] = '0'
            item['url'] = response.url
            item['_entitycode'] = 'web_page_p_book_info_09'
            item['commentpercent'] = comment_dict['commentpercent']
            item['commenttag'] = tags
            item['authorintro'] = self.packing_descrip(descrip_html,'authorIntroduction')
            item['sourceprice'] = price_dict['sourceprice']

            # ---- Emit one CommentItem per scraped user comment ----
            comments = comment_dict['comments']
            if comments:
                for comment in comments:
                    try:
                        citem = CommentItem()
                        citem['isbn'] = isbn
                        uri = comment.xpath('./div[1]/div[2]//a/@href')
                        if not uri:
                            # No per-comment link: fall back to the product page URL.
                            uri = [response.url]
                        uri = ''.join(uri)
                        citem['uri'] = uri
                        citem['bookname'] = bookname
                        citem['sourcetype'] = '02'
                        citem['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        publishtime = comment.xpath('./div[1]/div[4]/span[1]/text()')
                        if not publishtime:
                            # publishtime is part of the dedup key below — skip
                            # comments that lack it.
                            continue
                        publishtime = ''.join(publishtime)
                        citem['publishtime'] = publishtime
                        username = comment.xpath('./div[2]/span[1]/text()')
                        if not username:
                            username = ['无昵称用户']
                        username = ''.join(username)
                        citem['username'] = username
                        citem['hitcount'] = '0'
                        citem['follownum'] = '0'
                        suportnum = comment.xpath('./div[1]/div[5]/a[1]/text()')
                        suportnum = ''.join(suportnum)
                        if suportnum == '赞':
                            # Bare "like" label with no count means zero upvotes.
                            suportnum = '0'
                        citem['suportnum'] = suportnum
                        citem['opposnum'] = '0'
                        # Deterministic comment id from user + timestamp.
                        commentid = str(binascii.crc32((username + publishtime).encode()))
                        citem['commentid'] = commentid
                        citem['followcommentid'] = '-1'
                        citem['commenttitle'] = ''
                        citem['commenttype'] = '0'
                        commentcontent = comment.xpath('./div[1]/div[2]//a/text()')
                        commentcontent = ''.join(commentcontent)
                        citem['comment'] = commentcontent
                        score = comment.xpath('./div[1]/div[1]/em/text()')
                        score = ''.join(score)
                        if not score:
                            score = '5'
                        # Strip the trailing unit char (presumably '分') and convert
                        # from a 10-point to a 5-point scale — TODO confirm the scale.
                        score = score[:-1]
                        score = int(score) / 2
                        citem['score'] = str(score)
                        # Sentiment level buckets: <2 negative, <4 neutral,
                        # else positive — presumed semantics; verify downstream.
                        if score < 2:
                            citem['level'] = '2'
                        elif score < 4:
                            citem['level'] = '1'
                        else:
                            citem['level'] = '0'
                        citem['commpoint'] = ''
                        citem['type'] = '01'
                        citem['sitename'] = '当当'
                        # Row key: isbn + source + time + middle 16 hex chars of the
                        # username's MD5 ([8:-8] of a 32-char digest).
                        citem['_row'] = citem['isbn'] + citem['sourcetype'] + citem['publishtime'] + hashlib.md5(citem['username'].encode('utf-8')).hexdigest()[8:-8]
                        citem['_entitycode'] = 'web_page_p_book_comment_09'
                        citem['skuid'] = skuid
                        # Normalize unset fields to empty strings before yielding.
                        for citem_key in citem_list:
                            if not citem[citem_key]:
                                citem[citem_key] =''
                        yield citem
                    except Exception as e:
                        # One malformed comment must not abort the rest.
                        self._logger.error(e)
                        continue
        # Normalize unset book fields to empty strings, then emit the book item.
        for item_key in item_list:
            if not item[item_key]:
                item[item_key] = ''
        yield item