Example 1
    def parseVideoPageJson(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        account = response.meta['account']
        rltJson = json.loads(response.text)

        contentList = rltJson['data']
        curTime = dateUtil.getCurDate()
        for contentInfo in contentList:
            contentItem = ContentItem()
            contentItem['channel_id'] = self.channel_id
            contentItem['account_id'] = account
            contentItem['record_class'] = "content_info"
            contentItem['crawl_time'] = curTime
            contentItem['id'] = contentInfo['id']
            contentItem['title'] = contentInfo['title']
            contentItem['content_link'] = contentInfo['playLink']
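            # createTime is a millisecond epoch timestamp; convert it to a local-time string.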
            contentItem['publish_time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(int(contentInfo['createTime']) / 1000))
            contentItem['read_count'] = contentInfo['playCount']
            contentItem['comment_count'] = contentInfo['commentCount']
            contentItem['like_count'] = contentInfo['likeCount']
            yield contentItem
Example 2
    def parseArticlePageJson(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        account = response.meta['account']
        cookie = response.meta['cookie']
        currentPage = response.meta['currentPage']
        totalPage = response.meta['totalPage']
        beginFlag = response.meta['beginFlag']
        rltJson = json.loads(response.text)
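        # On the first response, derive the page count from the reported total (10 items per page).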
        if beginFlag:
            total = int(rltJson['total'])
            totalPage = math.ceil(total / 10)
            beginFlag = False

        contentList = rltJson['list']
        curTime = dateUtil.getCurDate()
        for contentInfo in contentList:
            contentItem = ContentItem()
            contentItem['channel_id'] = self.channel_id
            contentItem['account_id'] = account
            contentItem['record_class'] = "content_info"
            contentItem['crawl_time'] = curTime
            contentItem['id'] = contentInfo['id']
            contentItem['title'] = contentInfo['title']
            contentItem['content_link'] = contentInfo['url']
            contentItem['publish_time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(int(contentInfo['updatedAt'])))
            contentItem['read_count'] = contentInfo['readingNum']
            contentItem['comment_count'] = contentInfo['commentsNum']
            contentItem['share_count'] = contentInfo['forwardingNum']
            contentItem['collect_count'] = contentInfo['collectionNum']
            contentItem['recommend_count'] = contentInfo['recommendedNum']
            status = int(contentInfo['status'])  # Sogou: 1 = published; 40 = rejected; 134 = draft

            contentItem['publish_status'] = publicContentStatus[
                channelContentStatus[self.channel_id][status]]
            yield contentItem

        currentPage += 1
        if currentPage <= totalPage:
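            # Throttle between pages; time.sleep blocks Scrapy's reactor, so DOWNLOAD_DELAY is the non-blocking alternative.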
            time.sleep(5)
            yield scrapy.Request(
                self.articleUrl.format(currentPage),
                method='GET',
                callback=self.parseArticlePageJson,
                #headers=self.headers,
                cookies=cookie,
                meta={
                    'cookie': cookie,
                    'currentPage': currentPage,
                    'totalPage': totalPage,
                    'beginFlag': beginFlag,
                    'account': account
                })
Example 3
    def parseVideoAuthor(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        targetId = response.meta['targetId']
        rltJson = json.loads(response.text)
        if rltJson['errno'] != 0:
            return
        fansCnt = rltJson['data']['response']['cnt']['fansCnt']
        accountItem = AccountItem()
        curTime = dateUtil.getCurDate()
        accountItem['channel_id'] = self.channel_id
        accountItem['record_class'] = 'channel_info'
        accountItem['crawl_time'] = curTime
        accountItem['total_subscribe_count'] = fansCnt
        # Debug output only: the item is printed rather than yielded to the pipeline.
        print(accountItem)
Example 4
    def parseFansAnalysisPageJson(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        account = response.meta['account']
        rltJson = json.loads(response.text)
        accountItem = AccountItem()
        accountItem['channel_id'] = self.channel_id
        accountItem['account_id'] = account
        accountItem['record_class'] = "channel_info"
        accountItem['crawl_time'] = dateUtil.getCurDate()
        accountItem['new_visit_count'] = rltJson['access']
        accountItem['total_visit_count'] = rltJson['total_access']
        accountItem['new_subscribe_count'] = rltJson['subscribe']
        accountItem['total_subscribe_count'] = rltJson['total_subscribe']
        yield accountItem
Example 5
    def parseArticlePage(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        rltJson = json.loads(response.text)
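        # First response: record the total page count before paging through the rest.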
        if self.beginFlag:
            self.totalPage = rltJson['data']['totalPages']
            self.beginFlag = False

        contentList = rltJson['data']['ldata']
        curTime = dateUtil.getCurDate()
        for contentInfo in contentList:
            contentItem = ContentItem()
            contentItem['channel_id'] = self.channel_id
            contentItem['record_class'] = "content_info"
            contentItem['crawl_time'] = curTime
            contentItem['id'] = contentInfo['id']
            title = contentInfo['articleTitle']
            contentItem['title'] = title
            contentItem['content_link'] = self.viewArticleUrl.format(
                contentInfo['id'])
            if title in self.articleAnalysisDict:
                contentItem['publish_time'] = self.articleAnalysisDict[title][
                    'publish_time']
                contentItem['read_count'] = self.articleAnalysisDict[title][
                    'read_count']
                contentItem['share_count'] = self.articleAnalysisDict[title][
                    'share_count']
                contentItem['collect_count'] = self.articleAnalysisDict[title][
                    'collect_count']
            else:
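                # No analysis record for this title: fall back to operTime (milliseconds) and zero counts.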
                contentItem['publish_time'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.localtime(int(contentInfo['operTime']) / 1000))
                contentItem['read_count'] = 0
                contentItem['share_count'] = 0
                contentItem['collect_count'] = 0
            yield contentItem

        self.curPage += 1
        if self.curPage <= self.totalPage:
            yield FormRequest(self.articleUrl,
                              method='POST',
                              formdata={'currPage': str(self.curPage)},
                              callback=self.parseArticlePage,
                              headers=self.headers,
                              cookies=self.cookies)
Example 6
    def parseListPageJson(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        rltJson = json.loads(response.text)
        if self.beginFlag:
            self.totalNumber = rltJson['data']['totalNumber']
            self.maxIndexId = math.ceil(self.totalNumber / 10)
            self.beginFlag = False

        contentList = rltJson['data']['articles']
        curTime = dateUtil.getCurDate()
        for contentInfo in contentList:
            contentItem = ContentItem()
            contentItem['channel_id'] = self.channel_id
            #contentItem['account_id'] = "2991941540"  #######test
            contentItem['record_class'] = "content_info"
            contentItem['crawl_time'] = curTime
            contentItem['id'] = contentInfo['article_id']
            contentItem['title'] = contentInfo['title']
            contentItem['content_link'] = contentInfo['url']
            contentItem['publish_time'] = contentInfo['pub_time']
            contentItem['comment_count'] = contentInfo['commentnum']
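            # Entries carrying a vid are videos; fetch video details, otherwise article details.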
            if 'vid' in contentInfo:
                vid = contentInfo['vid']
                yield scrapy.Request(self.videoDetailUrl.format(vid),
                                     callback=self.parseVideoDetailPageJson,
                                     method='GET',
                                     headers=self.headers,
                                     cookies=self.cookies,
                                     meta={'item': contentItem})
            else:
                yield scrapy.Request(
                    self.articleDetailUrl.format(contentItem['id'] + '00'),
                    callback=self.parseArticleDetailPageJson,
                    method='GET',
                    headers=self.headers,
                    cookies=self.cookies,
                    meta={'item': contentItem})

        self.indexId += 1
        if self.indexId <= self.maxIndexId:
            yield scrapy.Request(self.contentListStartUrl.format(self.indexId),
                                 callback=self.parseListPageJson,
                                 method='GET',
                                 headers=self.headers,
                                 cookies=self.cookies)
Example 7
    def parseChannelPage(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        rltJson = json.loads(response.text)
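        # data[0] holds the subscription stats: new, cumulative, and cancelled counts.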
        accountItem = AccountItem()
        accountItem['channel_id'] = self.channel_id
        accountItem['record_class'] = "channel_info"
        accountItem['crawl_time'] = dateUtil.getCurDate()
        accountItem['new_subscribe_count'] = rltJson['data'][0][
            'subscribe_cnt']
        accountItem['total_subscribe_count'] = rltJson['data'][0][
            'his_subscribe_cnt']
        accountItem['cancel_fans_count'] = rltJson['data'][0][
            'cancel_subscribe_cnt']

        yield accountItem
Example 8
    def parseArticlePageJson(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        account = response.meta['account']
        token = response.meta['token']
        currentPage = response.meta['currentPage']
        totalPage = response.meta['totalPage']
        beginFlag = response.meta['beginFlag']

        rltJson = json.loads(response.text)
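        # First page: read the total page count from the response.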
        if beginFlag:
            totalPage = rltJson['data']['total_page']
            beginFlag = False
        contentList = rltJson['data']['data']
        curTime = dateUtil.getCurDate()
        for contentInfo in contentList:
            contentItem = ContentItem()
            contentItem['channel_id'] = self.channel_id
            contentItem['account_id'] = account
            contentItem['record_class'] = "content_info"
            contentItem['crawl_time'] = curTime
            contentItem['id'] = contentInfo['id']
            contentItem['title'] = contentInfo['title']
            contentItem['content_link'] = contentInfo['url']
            contentItem['publish_time'] = contentInfo['publish_time']
            contentItem['read_count'] = contentInfo['pv']
            contentItem['comment_count'] = contentInfo['comment_num']
            contentItem['share_count'] = contentInfo['share_num']
            contentItem['collect_count'] = contentInfo['fav_num']
            contentItem['recommend_count'] = contentInfo['rec_show_pv']
            status = int(contentInfo['status'])  # Qutoutiao: 1 = draft; 5 = pending review; 2 = published; 3 = review failed; 4 = recycle bin
            contentItem['publish_status'] = publicContentStatus[
                channelContentStatus[self.channel_id]['article'][status]]
            yield contentItem

        currentPage += 1
        if currentPage <= totalPage:
            time.sleep(5)
            yield scrapy.Request(
                self.articleUrl.format(currentPage, token, self.dtu),
                method='GET',
                callback=self.parseArticlePageJson,
                meta={
                    'token': token,
                    'currentPage': currentPage,
                    'totalPage': totalPage,
                    'beginFlag': beginFlag,
                    'account': account
                })
Example 9
    def parseContentPageJson(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        rltJson = json.loads(response.text)
        if self.beginFlag:
            self.totalPage = rltJson['data']['page']['totalPage']
            self.beginFlag = False

        contentList = rltJson['data']['list']
        curTime = dateUtil.getCurDate()
        for contentInfo in contentList:
            contentItem = ContentItem()
            contentItem['channel_id'] = self.channel_id
            #contentItem['account_id'] = "13656689260"  #######test
            contentItem['record_class'] = "content_info"
            contentItem['crawl_time'] = curTime
            contentItem['id'] = contentInfo['id']
            contentItem['title'] = contentInfo['title']
            contentItem['content_link'] = contentInfo['url']
            contentItem['publish_time'] = contentInfo['publish_time']
            contentItem['audit_result'] = contentInfo['audit_msg']
            contentItem['read_count'] = contentInfo['read_amount']
            contentItem['comment_count'] = contentInfo['comment_amount']
            contentItem['share_count'] = contentInfo['share_amount']
            contentItem['collect_count'] = contentInfo['collection_amount']
            contentItem['recommend_count'] = contentInfo['rec_amount']
            contentItem['like_count'] = contentInfo['like_amount']
            status = contentInfo['status']
            # Map the channel-specific status onto the shared publish_status code.
            contentItem['publish_status'] = publicContentStatus[
                channelContentStatus[self.channel_id][status]]
            '''if status == 'publish':
                contentItem['publish_status'] = 3
            if status == 'rejected':
                contentItem['publish_status'] = 2'''
            yield contentItem

        self.currentPage += 1
        if self.currentPage <= self.totalPage:
            yield scrapy.Request(self.contentStartUrl.format(self.currentPage),
                                 callback=self.parseContentPageJson,
                                 method='GET',
                                 headers=self.headers,
                                 cookies=self.cookies)
Example 10
    def parseVideoInfo(self, response):
        if response.status != 200:
            print('get url error: ' + response.url)
            return
        account = response.meta['account']
        url = response.url
        content_id = url.split("/")[-1]
        uploadDate = response.meta['uploadDate']

        titleList = response.xpath('//h1[starts-with(@class,"detail-cover-title")]/text()').extract()
        title = ""
        if len(titleList) == 1:
            title = titleList[0].strip()

        publishTimeList = response.xpath('//div[@itemprop="datePublished"]/strong/text()').extract()
        temp_publish_time = ""
        if len(publishTimeList) == 1:
            temp_publish_time = publishTimeList[0].strip()
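        # "分钟前" means "minutes ago": derive an absolute timestamp from the relative offset.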
        if "分钟前" in temp_publish_time:
            try:
                minute_count = int(temp_publish_time.replace("分钟前", ""))
            except ValueError:
                minute_count = 0
            publish_time = time.strftime(
                '%Y-%m-%d %H:%M:%S',
                time.localtime(time.time() - minute_count * 60))
        else:
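            # Otherwise the page shows an "HH:MM" time (possibly prefixed with "今天", "today"); append seconds and combine it with uploadDate.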
            temp_publish_time += ":00"
            temp_publish_time = temp_publish_time.replace("今天", "")
            temp_time = temp_publish_time.split(" ")[-1]
            publish_time = uploadDate + " " + temp_time

        playCountList = response.xpath('//div[@class="detail-location"]/text()').extract()
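        # "播放" means "plays": strip the suffix to get the numeric play count.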
        play_count = 0
        if len(playCountList) == 2:
            try:
                play_count = int(playCountList[-1].strip().replace("播放", ""))
            except ValueError:
                play_count = 0

        likeCountList = response.xpath('//span[@itemprop="ratingCount"]/text()').extract()
        like_count = 0
        if len(likeCountList) == 1:
            try:
                like_count = int(likeCountList[0].strip())
            except ValueError:
                like_count = 0

        commentCountList = response.xpath('//span[@itemprop="reviewCount"]/text()').extract()
        comment_count = 0
        if len(commentCountList) == 1:
            try:
                comment_count = int(commentCountList[0].strip())
            except ValueError:
                comment_count = 0

        shareCountList = response.xpath('//span[@class="pr top-3"]/text()').extract()
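        # "分享" means "shares": the count is the second text node.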
        share_count = 0
        if len(shareCountList) == 2:
            try:
                share_count = int(shareCountList[1].strip().replace("分享", ""))
            except ValueError:
                share_count = 0

        curTime = dateUtil.getCurDate()
        contentItem = ContentItem()
        contentItem['channel_id'] = self.channel_id
        contentItem['account_id'] = account
        contentItem['record_class'] = "content_info"
        contentItem['crawl_time'] = curTime
        contentItem['id'] = content_id
        contentItem['title'] = title
        contentItem['content_link'] = url
        contentItem['publish_time'] = publish_time
        contentItem['read_count'] = play_count
        contentItem['comment_count'] = comment_count
        contentItem['share_count'] = share_count
        contentItem['like_count'] = like_count
        yield contentItem