Python CommentItem 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: book_spiders.items

클래스/타입: CommentItem

hotexamples.com에서의 예제들: 6

Python CommentItem - 6개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python book_spiders.items.CommentItem의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질 향상에 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

CommentItem(6)

자주 사용되는 메소드들

CommentItem (6)

예제 #1

파일 보기

 def parse_short_comment(self, response):
     """Yield one ``CommentItem`` for every short comment on a list page.

     Book-level fields (``isbn``, ``bookname``, ``collectiontime``) are copied
     from the ``douban_item`` carried in ``response.meta``; everything else is
     read from the individual ``<li class="comment-item">`` nodes.

     Args:
         response: Response of the short-comment list page. Its ``meta`` must
             contain a ``'douban_item'`` entry.

     Yields:
         CommentItem: One populated item per comment node.
     """
     self._logger.info("请求短评列表页：" + response.url)
     douban_item = response.meta['douban_item']
     # The URL part of the row key is identical for every comment on the page.
     url_digest = hashlib.md5(response.url.encode('utf-8')).hexdigest()[8:-8]
     for comment in response.xpath("//li[@class='comment-item']"):
         comment_item = CommentItem()
         comment_item['isbn'] = douban_item['isbn']
         comment_item['uri'] = response.url
         comment_item['bookname'] = douban_item['bookname']
         comment_item['sourcetype'] = '03'
         comment_item['collectiontime'] = douban_item['collectiontime']
         # The last text node inside comment-info is used as the date; a
         # midnight time is appended to it.
         publishtime = comment.xpath("./div/h3/span[@class='comment-info']/span/text()").extract()[-1]
         publishtime = publishtime + ' 00:00:00'
         comment_item['publishtime'] = publishtime
         # A missing avatar title would otherwise raise AttributeError when the
         # row key is hashed and abort the remaining comments; fall back to ''.
         username = comment.xpath("./div[@class='avatar']/a/@title").extract_first() or ''
         comment_item['username'] = username
         comment_item['hitcount'] = ''
         comment_item['follownum'] = ''
         comment_item['suportnum'] = comment.xpath("./div/h3/span[@class='comment-vote']/span/text()").extract_first()
         comment_item['opposnum'] = ''
         comment_item['commentid'] = str(comment.xpath("./@data-cid").extract_first())
         comment_item['followcommentid'] = ''
         comment_item['commenttitle'] = ''
         comment_item['commenttype'] = '1'
         comment_item['comment'] = comment.xpath(".//span[@class='short']/text()").extract_first()
         # The score is encoded as digits in the class of the first span inside
         # comment-info. That span may be absent, or its class may carry no
         # digits at all; both cases use the default '30' (score 3.0).
         score_str = comment.xpath(".//span[@class='comment-info']/span[1]/@class").extract_first()
         score_digits = re.findall(r"\d+", score_str) if score_str is not None else []
         score = int(score_digits[0] if score_digits else '30') / 10
         comment_item['score'] = score
         # level: 0 for scores above 3, 1 for exactly 2 or 3, 2 otherwise.
         if score > 3:
             level = 0
         elif score in (2, 3):
             level = 1
         else:
             level = 2
         comment_item['level'] = level
         comment_item['commpoint'] = ''
         comment_item['type'] = '01'
         comment_item['sitename'] = '豆瓣'
         # Row key: URL digest + source type + publish time + username digest.
         username_digest = hashlib.md5(username.encode('utf-8')).hexdigest()[8:-8]
         comment_item['_row'] = url_digest + '03' + publishtime + username_digest
         comment_item['_entitycode'] = 'web_page_p_book_comment_09'
         comment_item['ifimport'] = '0'
         yield comment_item

예제 #2

파일 보기

 def parse_long_comment(self, response):
     """Yield a single ``CommentItem`` built from a long-review detail page.

     Book-level fields (``isbn``, ``bookname``, ``collectiontime``) are copied
     from the ``douban_item`` carried in ``response.meta``; the remaining
     fields are read from the page itself.

     Args:
         response: Response of the review detail page. Its ``meta`` must
             contain a ``'douban_item'`` entry.

     Yields:
         CommentItem: Exactly one populated item.
     """
     def first_number(text, default='0'):
         """Return the first run of digits in ``text``, or ``default``."""
         found = re.findall(r"\d+", text) if text else []
         return found[0] if found else default

     self._logger.info("请求长评详情页：" + response.url)
     douban_item = response.meta['douban_item']
     comment_item = CommentItem()
     comment_item['isbn'] = douban_item['isbn']
     comment_item['uri'] = response.url
     comment_item['bookname'] = douban_item['bookname']
     comment_item['sourcetype'] = '03'
     comment_item['collectiontime'] = douban_item['collectiontime']
     # publishtime and username feed the row key below; a missing node would
     # make that concatenation/hash raise, so both fall back to ''.
     publishtime = response.xpath("//span[@class='main-meta']/text()").extract_first() or ''
     username = response.xpath("//header[@class='main-hd']/a[1]/span/text()").extract_first() or ''
     comment_item['publishtime'] = publishtime
     comment_item['username'] = username
     comment_item['hitcount'] = ''
     comment_item['follownum'] = response.xpath("//span[@class='rec-num']/text()").extract_first()
     # The vote counters are the first number in each button's text; a missing
     # button or a text without digits counts as '0' instead of raising.
     comment_item['suportnum'] = first_number(
         response.xpath("//div[@class='main-panel-useful']/button[1]/text()").extract_first())
     comment_item['opposnum'] = first_number(
         response.xpath("//div[@class='main-panel-useful']/button[2]/text()").extract_first())
     # The comment id is the first number found in the page URL.
     comment_item['commentid'] = first_number(response.url, default='')
     comment_item['followcommentid'] = ''
     comment_item['commenttitle'] = response.xpath("//h1/span/text()").extract_first()
     comment_item['commenttype'] = '0'
     comment_item['comment'] = response.xpath("//div[@class='main-bd']").xpath("string(.)").extract_first() or ''
     # The score is encoded as digits in the class of the first header span;
     # it stays 0 when the span is missing or carries no digits.
     score_class = response.xpath("//header[@class='main-hd']/span[1]/@class").extract_first()
     score_list = re.findall(r"\d+", score_class) if score_class else []
     score = int(score_list[0]) / 10 if score_list else 0
     comment_item['score'] = score
     # level: 0 for scores above 3, 1 for exactly 2 or 3, 2 otherwise.
     if score > 3:
         level = 0
     elif score in (2, 3):
         level = 1
     else:
         level = 2
     comment_item['level'] = level
     comment_item['commpoint'] = ''
     comment_item['type'] = '01'
     comment_item['sitename'] = '豆瓣'
     # Row key: URL digest + source type + publish time + username digest.
     url_digest = hashlib.md5(response.url.encode('utf-8')).hexdigest()[8:-8]
     username_digest = hashlib.md5(username.encode('utf-8')).hexdigest()[8:-8]
     comment_item['_row'] = url_digest + '03' + publishtime + username_digest
     comment_item['_entitycode'] = 'web_page_p_book_comment_09'
     comment_item['ifimport'] = '0'
     yield comment_item

예제 #3

파일 보기

파일: toplistspider_amazon.py 프로젝트: floydScript/spider

    def parse(self, response):
        """Parse an Amazon book detail page.

        Builds one ``BookItem`` from the page (with help from the
        ``get_basicinfo``, ``get_content_and_cate``, ``parse_desc`` and
        ``get_commenttag`` helpers on ``self``) and one ``CommentItem`` per
        review node found under ``#cm-cr-dp-review-list``.

        Args:
            response: Response of the product detail page.

        Yields:
            CommentItem: One item per review node, emitted first.
            BookItem: The book record, emitted last.
        """
        item = BookItem()
        for item_key in item_list:
            item[item_key] = ''
        item['is_set'] = '否'
        # Pick the ISBN: a single value is used as is; from a comma-separated
        # list the last 13-character entry wins (otherwise the raw text stays).
        isbn = self.get_basicinfo(response, 'ISBN') or ''
        isbn_list = isbn.split(',')
        if len(isbn_list) > 1:
            for candidate in isbn_list:
                candidate = candidate.strip()
                if len(candidate) == 13:
                    isbn = candidate

        skuid = self.get_basicinfo(response, 'ASIN')
        # Load the product-description endpoint.
        html = self.get_content_and_cate(skuid)
        bookname = (response.xpath("//h1/span[@id='productTitle']/text()").extract_first() or '').strip()
        item['bookname'] = bookname
        item['subhead'] = ''
        publisher_str = self.get_basicinfo(response, '出版社') or ''
        publisher = publisher_str.split(';')[0].strip()
        item['publisher'] = publisher
        item['orgpublisher'] = publisher
        item['contentsummary'] = ''.join(response.xpath("//noscript/div/text()").extract())
        item['sourcetype'] = '05'
        item['author'] = '#'.join(response.xpath("//div[@id='bylineInfo']/span[1]/a/text()").extract())
        item['translator'] = '#'.join(response.xpath("//div[@id='bylineInfo']/span[2]/a/text()").extract())
        item['isbn'] = isbn
        item['orgisbn'] = isbn
        item['salecategory'] = ''
        item['category'] = ''
        item['orgcategory'] = ''
        contenttype_list = response.xpath("//div[@id='wayfinding-breadcrumbs_feature_div']//span[@class='a-list-item']/a/text()").extract()
        item['contenttype'] = ','.join(c.strip() for c in contenttype_list)
        item['issuearea'] = ''
        item['type'] = '01'
        packing = response.xpath("//h1/span[2]/text()").extract_first()
        edition = re.findall(r'第(\d+)版', publisher_str)
        item['edition'] = edition[0] if edition else ''
        item['impression'] = ''
        item['words'] = ''
        # The page count is looked up in the basic-info table under the
        # packing text taken from the title.
        pages = re.findall(r'\d+', self.get_basicinfo(response, packing) or '')
        item['pages'] = pages[0] if pages else ''

        item['language'] = self.get_basicinfo(response, '语种')
        # A missing buy box or a text without a price leaves the field empty
        # instead of raising TypeError/IndexError.
        price_text = response.xpath("//div[@id = 'buyBoxInner']/ul/li/span/span[2]/text()").extract_first()
        price = re.findall(r'\d+[.]*\d+', price_text or '')
        item['price'] = price[0] if price else ''
        item['format'] = self.get_basicinfo(response, '开本')
        item['papermeter'] = ''
        item['packing'] = packing
        item['coverurl'] = response.xpath("//div[@id = 'img-canvas']/img/@src").extract_first()
        item['seriename'] = ''
        item['catalog'] = self.parse_desc(html, '目录')
        item['editorsugest'] = self.parse_desc(html, '编辑推荐')
        item['usersugest'] = self.parse_desc(html, '名人推荐')
        item['preface'] = ''
        item['summary'] = self.parse_desc(html, '文摘')
        item['epilogue'] = ''
        # The third title span holds the date text; it is reused below for
        # the comments.
        title_date = response.xpath("//h1/span[3]/text()").extract_first() or ''
        publishdate = title_date
        if len(publishdate) > 7:
            year_month = re.findall(r'(\d+)年(\d+)月', publishdate)
            # Only reformat when the pattern matches; otherwise keep the text.
            if year_month:
                publishdate = '-'.join(year_month[0])
        item['publishdate'] = publishdate
        item['printedtime'] = publishdate
        item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['orgcode'] = ''
        item['skuid'] = skuid
        commentcount = response.xpath("//span[@id='acrCustomerReviewText']/text()").extract_first()
        if not commentcount:
            commentcount = '0'
        # Keep only the digits of the review-count text.
        item['commentcount'] = ''.join(re.findall(r'\d+', commentcount))
        item['_row'] = skuid + item['sourcetype']
        item['coverpath'] = '/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/' + item['_row'] + '.jpg'
        item['ifimport'] = '0'
        item['url'] = response.url
        item['_entitycode'] = 'web_page_p_book_info_09'
        item['commentpercent'] = ''
        # Comment tags are best-effort: any failure leaves the field empty.
        try:
            tag_resp = self.get_commenttag(skuid)
            commenttag = tag_resp.xpath("//span/@data-cr-trigger-on-view")
            commenttag = json.loads(commenttag[0])
            commenttag = commenttag['ajaxParamsMap']['lighthouseTerms'].replace('/', '#')
        except Exception:
            commenttag = ''
        item['commenttag'] = commenttag
        item['authorintro'] = self.parse_desc(html, '作者简介')
        sourceprice_text = response.xpath("//div[@id='soldByThirdParty']/span[2]/text()").extract_first()
        sourceprice = re.findall(r'\d+[.]*\d+', sourceprice_text or '')
        item['sourceprice'] = sourceprice[0] if sourceprice else ''

        # NOTE(review): the comment publish time is derived from the page
        # title date (same node as the book's publish date), not from the
        # review node itself - confirm this is intended.
        full_date = re.findall(r'(\d+)年(\d+)月(\d+)日', title_date)
        publishdate_c = '-'.join(full_date[0]) if full_date else ''

        # Walk the review list.
        for comment in response.xpath("//div[@id='cm-cr-dp-review-list']/div"):
            comment_item = CommentItem()
            comment_item['isbn'] = isbn
            comment_item['uri'] = response.url
            comment_item['bookname'] = bookname
            comment_item['sourcetype'] = item['sourcetype']
            comment_item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            comment_item['publishtime'] = publishdate_c
            comment_item['username'] = comment.xpath("./div/div[1]/a/div/span/text()").extract_first() or ''
            comment_item['hitcount'] = '0'
            comment_item['follownum'] = '0'
            helpful_text = comment.xpath("./div/div[7]/span/div/span[@data-hook='helpful-vote-statement']/text()").extract_first()
            helpful_digits = re.findall(r'\d+', helpful_text or '')
            comment_item['suportnum'] = helpful_digits[0] if helpful_digits else '0'
            comment_item['opposnum'] = '0'
            # The id is part of the row key, so a missing attribute becomes ''.
            comment_item['commentid'] = comment.xpath("./@id").extract_first() or ''
            comment_item['followcommentid'] = ''
            comment_item['commenttitle'] = comment.xpath(".//a[@data-hook='review-title']/text()").extract_first() or ''
            comment_item['commenttype'] = '0'
            comment_item['comment'] = ''.join(comment.xpath(".//div[@data-hook='review-collapsed']/text()").extract())
            # Relative path: an XPath starting with '//' is evaluated against
            # the whole document, which gave every review the score of the
            # first one.
            score_title = comment.xpath("./div[1]/div[2]/a/@title").extract_first()
            score_match = re.findall(r'\d.\d', score_title or '')
            # Default to '5.0' when the title is missing or has no score.
            score = (score_match[0] if score_match else '5.0')[:1]
            comment_item['score'] = score
            score = float(score)
            if score < 2:
                level = '2'
            elif score < 4:
                level = '1'
            else:
                level = '0'
            comment_item['level'] = level
            comment_item['commpoint'] = ''
            comment_item['type'] = '01'
            comment_item['sitename'] = '亚马逊'
            comment_item['_row'] = comment_item['isbn'] + comment_item['sourcetype'] + comment_item['publishtime'] + comment_item['commentid']
            comment_item['_entitycode'] = 'web_page_p_book_comment_09'
            comment_item['skuid'] = skuid
            yield comment_item
        yield item

예제 #4

파일 보기

    def parse(self, response):
        """Parse a Dangdang e-book detail page.

        Builds one ``BookItem`` from the page and one ``CommentItem`` per
        entry returned by ``self.get_comments(skuid)``.

        Args:
            response: Response of the product page; the last URL path segment
                (minus ``.html``) is used as the sku id.

        Yields:
            CommentItem: One item per well-formed comment, emitted first.
            BookItem: The book record, emitted last.
        """
        item = BookItem()
        for item_key in item_list:
            item[item_key] = ''
        item['is_set'] = '否'
        skuid = response.url.split('/')[-1].replace('.html', '')

        bookname = (response.xpath(
            "//span[@class='title_words']/@title").extract_first() or '').strip()
        item['bookname'] = bookname
        item['subhead'] = response.xpath(
            "//p[@class='title_descript']/@title").extract_first()
        publisher = response.xpath(
            "//p[@id='publisher']//a/text()").extract_first()
        item['publisher'] = publisher
        item['orgpublisher'] = publisher
        item['contentsummary'] = '<br>'.join(response.xpath(
            "//div[@class='newEdit_box']//text()").extract())
        item['sourcetype'] = '02'
        authors = response.xpath("//p[@id='author']//a/text()").extract_first() or ''
        # Both '、' and ',' separate authors; the stored separator is '#'.
        item['author'] = '#'.join(authors.replace('、', ',').split(','))
        item['translator'] = ''
        item['isbn'] = ''
        item['orgisbn'] = ''
        item['salecategory'] = ''
        item['category'] = ''
        item['orgcategory'] = ''
        # Clean every breadcrumb entry and drop the ones equal to the book
        # name. Building a new list avoids popping while iterating, which
        # skipped the element following each removed entry.
        crumbs = response.xpath("//div[@id='crumb']/a/text()").extract()
        cleaned_crumbs = (ct.replace('>', '').strip() for ct in crumbs)
        item['contenttype'] = ','.join(ct for ct in cleaned_crumbs if ct != bookname)
        item['issuearea'] = ''
        item['type'] = '02'
        item['edition'] = ''
        item['impression'] = ''
        basic_info_str = ''.join(response.xpath(
            "//div[@class='explain_box']/p").extract())
        words = re.findall(r'数：(\d+[.]*\d+)', basic_info_str)
        # A '万' anywhere in the info block scales the number by 10000.
        suffix = 10000 if '万' in basic_info_str else 1
        if words:
            words = int(float(words[0]) * suffix)
        else:
            words = ''
        item['words'] = str(words)
        item['pages'] = ''
        item['language'] = ''
        # A missing price box falls back to '0' instead of raising TypeError.
        price_str = response.xpath(
            "//div[@class='cost_box']/p").extract_first()
        price = re.findall(r'\d+[.]*\d+', price_str or '')
        item['price'] = price[0] if price else '0'
        item['format'] = ''
        item['papermeter'] = ''
        item['packing'] = ''
        item['coverurl'] = response.xpath(
            "//div[@class='bookCover_area']/img/@src").extract_first()
        item['seriename'] = ''
        item['catalog'] = '<br>'.join(response.xpath(
            "//div[@id='catalog_title']//text()").extract())
        item['editorsugest'] = ''
        item['usersugest'] = ''
        item['preface'] = ''
        item['summary'] = ''
        item['epilogue'] = ''
        publishdate = re.findall(r'出版时间：([\d]{4}-[\d]{2})', basic_info_str)
        publishdate = publishdate[0] if publishdate else ''
        item['publishdate'] = publishdate
        item['printedtime'] = publishdate
        item['collectiontime'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item['orgcode'] = ''
        item['skuid'] = skuid
        commentcount = re.findall(r'\d+', response.xpath(
            "//div[@class='count_per']/em/text()").extract_first() or '')
        item['commentcount'] = commentcount[0] if commentcount else ''
        item['_row'] = skuid + item['sourcetype']
        item['coverpath'] = '/book/' + datetime.datetime.now().strftime(
            '%Y%m%d') + '/' + item['_row'] + '.jpg'
        item['ifimport'] = '0'
        item['url'] = response.url
        item['_entitycode'] = 'web_page_p_book_info_09'
        item['commentpercent'] = ''
        item['commenttag'] = ''
        item['authorintro'] = ''
        item['sourceprice'] = ''
        # Fetch the comment list.
        comments = self.get_comments(skuid)

        # Walk the comment list; malformed entries are skipped (best effort).
        for comment in comments or []:
            comment_item = CommentItem()
            try:
                uri = 'http://e.dangdang.com/post_detail_page.html?barId=' + str(
                    comment['barId']) + '&digestId=' + str(
                        comment['mediaDigestId'])
                comment_item['isbn'] = ''
                comment_item['uri'] = uri
                comment_item['bookname'] = bookname
                comment_item['sourcetype'] = '02'
                comment_item['collectiontime'] = datetime.datetime.now(
                ).strftime('%Y-%m-%d %H:%M:%S')
                # createDateLong is divided by 1000 before being formatted
                # as local time.
                publishdate_ts = comment['createDateLong'] / 1000
                comment_item['publishtime'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(publishdate_ts))
                comment_item['username'] = comment['userBaseInfo']['nickName']
                comment_item['hitcount'] = '0'
                comment_item['follownum'] = comment['commentNum']
                comment_item['suportnum'] = comment['commentStar']
                comment_item['opposnum'] = '0'
                comment_item['commentid'] = comment['mediaDigestId']
                comment_item['followcommentid'] = ''
                comment_item['commenttitle'] = ''
                comment_item['commenttype'] = '0'
                comment_item['comment'] = comment['content']
                comment_item['score'] = '5'
                comment_item['level'] = '0'
                comment_item['commpoint'] = ''
                comment_item['type'] = '02'
                comment_item['sitename'] = '当当'
                comment_item['_row'] = skuid + comment_item[
                    'sourcetype'] + comment_item['publishtime'] + hashlib.md5(
                        comment_item['username'].encode(
                            'utf-8')).hexdigest()[8:-8]
                comment_item['_entitycode'] = 'web_page_p_book_comment_09'
                comment_item['skuid'] = skuid
            except Exception:
                # ``Exception`` rather than a bare ``except`` and the yield
                # kept outside the try: a bare clause around the yield would
                # also swallow GeneratorExit when the generator is closed.
                continue
            yield comment_item
        # Normalise every falsy field (e.g. None from extract_first) to ''.
        for item_key in item_list:
            if not item[item_key]:
                item[item_key] = ''
        yield item

예제 #5

파일 보기

    def parse(self, response):
        """Parse a JD book detail page.

        Pages whose ISBN is not exactly 13 characters long produce nothing.
        Otherwise one ``BookItem`` is built from the page plus the
        description, price and comment helpers on ``self``, and one
        ``CommentItem`` is built per entry of the returned comment list.

        Args:
            response: Response of the product page; the last URL path segment
                (minus ``.html``) is used as the sku id.

        Yields:
            CommentItem: One item per comment, emitted first.
            BookItem: The book record, emitted last.
        """
        item = BookItem()
        for item_key in item_list:
            item[item_key] = ''
        item['is_set'] = '否'
        # Only pages with a 13-character ISBN are processed.
        isbn = self.get_basicinfo(response, 'ISBN') or ''
        if len(isbn) != 13:
            return

        skuid = response.url.split('/')[-1].replace('.html', '')
        # Load the product-description endpoint.
        html = self.get_content_and_cate(skuid)
        # Load the price endpoint.
        sourceprice, price = self.get_price(skuid)
        # Load the comments / comment count / positive-rate endpoint.
        comments, commentcount, commentpercent, commenttag = self.get_comment(
            skuid)
        bookname = (response.xpath(
            "//div[@class='sku-name']/text()").extract_first() or '').strip()
        item['bookname'] = bookname
        item['subhead'] = ''
        item['publisher'] = self.get_basicinfo(response, '出版社')
        item['orgpublisher'] = self.get_basicinfo(response, '出版社')
        item['contentsummary'] = ''.join(self.parse_desc(html, '内容简介'))
        item['sourcetype'] = '01'
        item['author'] = '#'.join(response.xpath(
            "//div[@class='p-author']/a/@data-name").extract())
        item['translator'] = ''
        item['isbn'] = isbn
        item['orgisbn'] = isbn
        item['salecategory'] = ''
        item['category'] = ''
        item['orgcategory'] = ''
        brand = self.get_basicinfo(response, '品牌')
        contenttype_list = response.xpath(
            "//div[@class='crumb fl clearfix']/div[@class='item']/a/text()"
        ).extract()
        # Drop the brand entry from the breadcrumb when it is present.
        try:
            contenttype_list.remove(brand)
        except ValueError:
            pass
        item['contenttype'] = ','.join(contenttype_list)
        item['issuearea'] = ''
        item['type'] = '01'
        item['edition'] = self.get_basicinfo(response, '版次')
        item['impression'] = ''
        item['words'] = self.get_basicinfo(response, '字数')
        # The original fallback assigned to a misspelled name (``page``), so a
        # page without a page count raised IndexError here.
        pages = re.findall(r'\d+', self.get_basicinfo(response, '页数') or '')
        item['pages'] = pages[0] if pages else ''

        item['language'] = self.get_basicinfo(response, '正文语种')
        item['price'] = price

        item['format'] = self.get_basicinfo(response, '开本')
        item['papermeter'] = self.get_basicinfo(response, '用纸')
        item['packing'] = self.get_basicinfo(response, '包装')
        # The image src gets an 'http:' prefix; a missing image leaves the
        # field empty instead of raising TypeError.
        cover_src = response.xpath(
            "//div[@id= 'spec-n1']/img/@src").extract_first()
        item['coverurl'] = 'http:' + cover_src if cover_src else ''
        item['seriename'] = self.get_basicinfo(response, '丛书名')
        item['catalog'] = self.parse_desc(html, '目录')
        item['editorsugest'] = self.parse_desc(html, '编辑推荐')
        item['usersugest'] = self.parse_desc(html, '精彩书评')
        item['preface'] = self.parse_desc(html, '前言/序言')
        item['summary'] = self.parse_desc(html, '精彩书摘')
        item['epilogue'] = ''
        publishdate = self.get_basicinfo(response, '出版时间') or ''
        # Values longer than 7 characters are cut at the last '-'.
        if len(publishdate) > 7:
            publishdate = publishdate[:publishdate.rfind('-')]

        item['publishdate'] = publishdate
        item['printedtime'] = publishdate
        item['collectiontime'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item['orgcode'] = ''
        item['skuid'] = skuid
        item['commentcount'] = str(commentcount)
        item['_row'] = skuid + '01'
        item['coverpath'] = '/book/' + datetime.datetime.now().strftime(
            '%Y%m%d') + '/' + item['_row'] + '.jpg'
        item['ifimport'] = '0'
        item['url'] = response.url
        item['_entitycode'] = 'web_page_p_book_info_09'
        item['commentpercent'] = commentpercent
        item['commenttag'] = commenttag
        item['authorintro'] = self.parse_desc(html, '作者简介')
        item['sourceprice'] = sourceprice

        # Walk the comment list.
        for comment in comments or []:
            comment_item = CommentItem()
            comment_item['isbn'] = isbn
            comment_item['uri'] = response.url
            comment_item['bookname'] = bookname
            comment_item['sourcetype'] = '01'
            comment_item['collectiontime'] = datetime.datetime.now(
            ).strftime('%Y-%m-%d %H:%M:%S')
            comment_item['publishtime'] = comment['creationTime']
            comment_item['username'] = '******'
            comment_item['hitcount'] = '0'
            # Check for a missing value BEFORE str(): str(None) is the truthy
            # text 'None', so the original fallbacks could never trigger.
            reply_count = comment['replyCount']
            comment_item['follownum'] = '0' if reply_count in (None, '') else str(reply_count)
            useful_votes = comment['usefulVoteCount']
            comment_item['suportnum'] = '0' if useful_votes in (None, '') else str(useful_votes)
            comment_item['opposnum'] = '0'
            comment_item['commentid'] = str(comment['id'])
            comment_item['followcommentid'] = ''
            comment_item['commenttitle'] = ''
            comment_item['commenttype'] = '0'
            comment_item['comment'] = comment['content']
            raw_score = comment['score']
            score = '5' if raw_score in (None, '') else str(raw_score)
            comment_item['score'] = score
            score = int(score)
            if score < 2:
                level = '2'
            elif score < 4:
                level = '1'
            else:
                level = '0'
            comment_item['level'] = level
            comment_item['commpoint'] = ''
            comment_item['type'] = '01'
            comment_item['sitename'] = '京东'
            # NOTE(review): username is the constant '******', so comments on
            # the same book with the same publish time share a row key -
            # confirm whether the comment id should be part of it.
            comment_item['_row'] = comment_item['isbn'] + comment_item[
                'sourcetype'] + comment_item[
                    'publishtime'] + comment_item['username']
            comment_item['_entitycode'] = 'web_page_p_book_comment_09'
            comment_item['skuid'] = skuid
            yield comment_item
        yield item

예제 #6

파일 보기

파일: toplistspider_dd.py 프로젝트: floydScript/spider

    def parse(self, response):
        item = BookItem()
        # 将所有字段设为空串
        for item_key in item_list:
            item[item_key] = ''
        item['is_set'] = '否'
        # 抓取isbn
        try:
            isbn = response.xpath('//div[@id="detail_describe"]/ul/li[5]/text()').extract_first()
            isbn = isbn.split('：')[1]
        except Exception as e:
            self._logger.error(e)
            isbn = ''
        item['orgisbn'] = isbn
        # 如果isbn长度不是13位的话，置为空，不存进数据库
        if len(isbn) != 13:
            isbn = ''
            is_set = '是'
        item['isbn'] = isbn
        if is_set == '否' :
            # 获得商品id和店铺id
            skuid = re.findall('\d+', response.url)[0]
            shopid = response.xpath("//p[@class='goto_shop']/a[1]/@href").extract_first().split('/')[-1]

            # 调用接口以获取动态加载的数据
            timemil_start = time.time()
            descrip_html = self.descrip_inter(skuid)
            comment_dict = self.comment_inter(skuid)
            price_dict = self.price_inter(skuid, shopid)
            tags = self.tag_inter(skuid)
            alsobuy_urls = self.alsobuy_inter(skuid, shopid)
            timemil_end = time.time()
            self._logger.info('解析url：'+response.url+'    ===>调取接口耗时:'+str(timemil_end-timemil_start)+' s')
            for url_item in alsobuy_urls:
                # ab_url = url_item.xpath("./@href").extract_first()
                # ab_url = 'http://product.dangdang.com/' + ab_url.split('#')[0]
                ab_url = 'http://product.dangdang.com/' + url_item['productId']+'.html'
                taskId = binascii.crc32((ab_url).encode())
                # ab_taskname = url_item.xpath("./img/@title").extract_first()
                ab_taskname = url_item['productName']
                # 往site_book表中插入url任务
                sql = '''insert into site_book(siteId,taskId,taskName,taskCode,startUrl,requestTimes,pollPeriod,autorun,status,crawlTime,maxDepth,threadNum,sleepTime,saveTime,newsType,rollUnit) 
                        values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                params = (530701699,taskId,ab_taskname,'20',ab_url,3,86400,1,2,'2016-01-01 00:00:00',3,10,100,datetime.datetime.now(),'0','1')
                try:
                    self.cursor.execute(sql, params)
                    self.db.commit()
                    self._logger.info('插入任务：taskId为 ' + str(taskId) + '  url为  ' + ab_url)
                except Exception as e:
                    pass
            item['is_set'] = '否'
            bookname = response.xpath('//div[@id="product_info"]/div[1]/h1/@title').extract_first()
            item['bookname'] = bookname
            subhead = response.xpath("//span[@class='head_title_name']/@title").extract_first()
            if not subhead:
                subhead = ''
            item['subhead'] = subhead
            publisher = response.xpath('//div[@id="product_info"]/div[2]/span[2]/a/text()').extract_first()
            item['publisher'] = publisher
            item['orgpublisher'] = publisher
            item['contentsummary'] = self.packing_descrip(descrip_html,'content')
            item['editorsugest'] = self.packing_descrip(descrip_html,'abstract')
            item['sourcetype'] = '02'
            try:
                author_klist = response.xpath('//span[@id="author"]/text()').extract()
                author_list = response.xpath('//a[@dd_name="作者"]/text()').extract()
                author = []
                translator = []
                flag = True
                for index,k in enumerate(author_klist):
                    if flag:
                        author.append(author_list[index])
                        next_index = index+1
                        if next_index == len(author_klist):
                            continue
                        if author_klist[next_index] != '，' and author_klist[next_index] != ',':
                            flag = False
                    else:
                        if index >= len(author_list):
                            break
                        translator.append(author_list[index])
                author = '#'.join(author)
                translator = '#'.join(translator)
            except Exception as e:
                self._logger.error(e)
                author = item['publisher']
                translator = ''
            item['author'] = author
            item['translator'] = translator
            item['salecategory'] = ''
            item['category'] = ''
            item['orgcategory'] = ''
            contenttype = response.xpath('//li[@id="detail-category-path"]/span/a/text()').extract()
            contenttype = ','.join(contenttype)
            item['contenttype'] = contenttype
            item['issuearea'] = '0'
            item['type'] = '01'
            # 版次
            item['edition'] = ''
            # 印次
            item['impression'] = ''
            item['words'] = ''
            item['pages'] = ''
            item['language'] = ''
            item['price'] = price_dict['price']
            printedtime = response.xpath('//div[@id="product_info"]/div[2]/span[3]/text()').extract_first()
            if printedtime:
                printedtime = printedtime.strip()
                printedtime = printedtime[5:-1].replace('年', '-')
            else:
                printedtime = ''
            item['printedtime'] = printedtime
            format = response.xpath('//div[@id="detail_describe"]/ul/li[1]/text()').extract_first()[4:]
            item['format'] = format
            papermeter = response.xpath('//div[@id="detail_describe"]/ul/li[2]/text()').extract_first()[4:]
            item['papermeter'] = papermeter
            packing = response.xpath('//div[@id="detail_describe"]/ul/li[3]/text()').extract_first()[4:]
            item['packing'] = packing
            coverurl = response.xpath('//img[@id="largePic"]/@src').extract_first()
            item['coverurl'] = coverurl
            item['seriename'] = ''
            item['catalog'] = self.packing_descrip(descrip_html,'catalog')
            item['usersugest'] = self.packing_descrip(descrip_html,'mediaFeedback')
            item['preface'] = self.packing_descrip(descrip_html,'preface')
            item['summary'] = self.packing_descrip(descrip_html,'extract')
            item['epilogue'] = ''
            item['publishdate'] = printedtime
            item['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['orgcode'] = ''
            item['skuid'] = skuid
            item['_row'] =  skuid+'02'
            item['coverpath'] ='/book/' + datetime.datetime.now().strftime('%Y%m%d') + '/'+item['_row'] + '.jpg'
            item['commentcount'] = comment_dict['commentcount']
            item['ifimport'] = '0'
            item['url'] = response.url
            item['_entitycode'] = 'web_page_p_book_info_09'
            item['commentpercent'] = comment_dict['commentpercent']
            item['commenttag'] = tags
            item['authorintro'] = self.packing_descrip(descrip_html,'authorIntroduction')
            item['sourceprice'] = price_dict['sourceprice']

            comments = comment_dict['comments']
            if comments:
                for comment in comments:
                    try:
                        citem = CommentItem()
                        citem['isbn'] = isbn
                        uri = comment.xpath('./div[1]/div[2]//a/@href')
                        if not uri:
                            uri = [response.url]
                        uri = ''.join(uri)
                        citem['uri'] = uri
                        citem['bookname'] = bookname
                        citem['sourcetype'] = '02'
                        citem['collectiontime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        publishtime = comment.xpath('./div[1]/div[4]/span[1]/text()')
                        if not publishtime:
                            continue
                        publishtime = ''.join(publishtime)
                        citem['publishtime'] = publishtime
                        username = comment.xpath('./div[2]/span[1]/text()')
                        if not username:
                            username = ['无昵称用户']
                        username = ''.join(username)
                        citem['username'] = username
                        citem['hitcount'] = '0'
                        citem['follownum'] = '0'
                        suportnum = comment.xpath('./div[1]/div[5]/a[1]/text()')
                        suportnum = ''.join(suportnum)
                        if suportnum == '赞':
                            suportnum = '0'
                        citem['suportnum'] = suportnum
                        citem['opposnum'] = '0'
                        commentid = str(binascii.crc32((username + publishtime).encode()))
                        citem['commentid'] = commentid
                        citem['followcommentid'] = '-1'
                        citem['commenttitle'] = ''
                        citem['commenttype'] = '0'
                        commentcontent = comment.xpath('./div[1]/div[2]//a/text()')
                        commentcontent = ''.join(commentcontent)
                        citem['comment'] = commentcontent
                        score = comment.xpath('./div[1]/div[1]/em/text()')
                        score = ''.join(score)
                        if not score:
                            score = '5'
                        score = score[:-1]
                        score = int(score) / 2
                        citem['score'] = str(score)
                        if score < 2:
                            citem['level'] = '2'
                        elif score < 4:
                            citem['level'] = '1'
                        else:
                            citem['level'] = '0'
                        citem['commpoint'] = ''
                        citem['type'] = '01'
                        citem['sitename'] = '当当'
                        citem['_row'] = citem['isbn'] + citem['sourcetype'] + citem['publishtime'] + hashlib.md5(citem['username'].encode('utf-8')).hexdigest()[8:-8]
                        citem['_entitycode'] = 'web_page_p_book_comment_09'
                        citem['skuid'] = skuid
                        for citem_key in citem_list:
                            if not citem[citem_key]:
                                citem[citem_key] =''
                        yield citem
                    except Exception as e:
                        self._logger.error(e)
                        continue
        for item_key in item_list:
            if not item[item_key]:
                item[item_key] = ''
        yield item