def parse(self, response):
        """Store every non-ad entry of a JSON feed response, then re-queue the URL.

        ``response.text`` is decoded as JSON; each element of its ``data`` list
        carries a ``content`` string that is itself JSON.  Each decoded entry is
        tagged with fixed ``news_type``/``resource`` values and saved through
        ``self.client`` unless its ``tag`` is ``'ad'``.

        Whatever happens in the loop (including an exception, which still
        propagates afterwards), the same URL is requested again through a proxy
        obtained from ``getRandomOneIP()``.

        :param response: response whose ``text`` is the JSON feed
        :return: generator yielding the follow-up ``Request``
        """
        try:
            payload = json.loads(response.text)
            entries = payload['data']
            print('抓取新闻数目', len(entries))
            for entry in entries:
                article = json.loads(entry['content'])
                print(article['title'])

                bean = MissionBean(response.url, 0, ['train_rank'])
                bean.title = article['title']
                bean.info = article
                bean.info.update({
                    'news_type': '头条热点流',
                    'resource': '头条推荐流'
                })

                # Advertisements are reported and dropped, never stored.
                if bean.info.get('tag') != 'ad':
                    self.client.save(bean)
                else:
                    print('这是个广告,过滤')
        finally:
            # Re-issue the same URL; dont_filter=True keeps the duplicate
            # filter from discarding it.
            proxyInfo = getRandomOneIP()
            proxyUrl = 'http://' + proxyInfo['ip'] + ':' + proxyInfo['port']
            yield Request(url=response.url,
                          headers=self.headers,
                          dont_filter=True,
                          meta={'proxy': proxyUrl})
    def parse_item(self, response):
        """Parse one Baidu ranking table and store every ranked keyword.

        Each ``<tr>`` of ``table.list-table`` that contains a ``td.first`` cell
        is treated as a ranking row.  The row's rank, keyword, link and heat
        value are collected into a dict and saved as a ``MissionBean`` of type
        500 in the ``train_hotword`` collection list.

        A heat value that is missing or not an integer is stored as ``-1``
        instead of aborting the whole page.  A failure while saving one row is
        printed and the remaining rows are still processed.

        :param response: response carrying ``meta['news_type']`` (the board
            title, appended to ``'百度'``)
        :return: None
        """
        count = 0
        rows = response.xpath('//table[@class="list-table"]/tr')
        for row in rows:
            # Header/spacer rows have no rank cell; skip them.
            if not row.xpath('.//td[@class="first"]').extract():
                continue

            num = row.xpath(
                './/td[@class="first"]/span/text()').extract_first()
            title = row.xpath(
                './/td[@class="keyword"]/a/text()').extract_first()
            href = row.xpath(
                './/td[@class="keyword"]/a/@href').extract_first()
            focus_num = row.xpath(
                './/td[@class="last"]/span/text()').extract_first()

            # extract_first() returns None when the cell is absent, and the
            # text may not be numeric; one bad row must not drop the others.
            try:
                focus_value = int(focus_num)
            except (TypeError, ValueError):
                focus_value = -1

            items = {
                'index': num,
                'title': title,
                'news_type': '百度' + response.meta['news_type'],
                'url': href,
                'num': focus_value,
                'focus_num': focus_num,
                'resource': '百度',
            }
            print(items)
            count += 1
            try:
                missionBean = MissionBean(href, 500, ['train_hotword'])
                missionBean.title = title
                missionBean.info = items
                self.client.save(missionBean)
            except Exception:
                # Best effort: report the failed row and keep going.
                print("存储数据库出现异常")
                traceback.print_exc()
        print('本次抓取个数{}'.format(count))
        self.sleepMyself()
    def __waitforImgs(self, urlImgList, newsId):
        """
        Queue a group of images for download and record each finished file.

        Every URL in ``urlImgList`` is pushed to the redis download queue as a
        file-type ``MissionBean`` (protocol-relative ``//host/...`` URLs are
        first prefixed with ``http:``).  The method then polls redis once per
        second for each URL's callback message and saves one document per
        image into ``self.new_db['d_news_images']``.

        :param urlImgList: iterable of image URL strings
        :param newsId: value stored as ``articleId`` on every image document
        :return: None
        :raises Exception: when the callback message of an image has not
            appeared 30 seconds after waiting for that image started; the
            remaining images are then not processed either
        """
        urlImgListAfterFilter = []
        for img in urlImgList:
            if img.startswith('//'):
                img = 'http:' + img
            urlImgListAfterFilter.append(img)
            missionBean = MissionBean(img, 7000, [])
            missionBean.isFileTag = True
            missionBean.downloadCallback = 'set'
            redisDownloadSaveDb3(
                str(self.DEEP_DOWNLOAD) + '_' + self.FILE_KEY_DOWNLOAD,
                missionBean.getRedisDict())

        lstTag = ['.jpg', '.jpeg', '.gif', '.bmp', '.png']
        for url in urlImgListAfterFilter:
            callbackKey = (self.FILE_KEY_DOWNLOAD + '_callback_' +
                           str(self.TYPE_DOWNLOAD) + ':' + url)
            timeStart = datetime.datetime.now().timestamp()
            while True:
                # Give up on the whole group after 30 seconds for one image.
                if datetime.datetime.now().timestamp() - timeStart > 30:
                    raise Exception('图片下载超时')
                # First argument 3 is presumably the redis db index -- TODO
                # confirm against redisGet.
                msgStr = redisGet(3, callbackKey)
                if msgStr is not None:
                    break
                sleep(1)

            dictMsg = json.loads(msgStr)
            print('图片下载成功,正在存储')
            fileUrl = dictMsg['fileUrl']
            # Assumes fileUrl looks like scheme://host/group/path, so that
            # index 2 is the host and index 3 the group -- TODO confirm.
            fileSome = fileUrl.split('/')
            # First known extension found in the stored URL, '.jpg' otherwise.
            suffix = next((tag for tag in lstTag if tag in fileUrl), '.jpg')

            dictInsert = {}
            dictInsert['fileUrl'] = 'testFileName' + suffix
            dictInsert['Uploaded size'] = dictMsg['info']['file']['size']
            dictInsert['Storage IP'] = fileSome[2]
            # Everything after the host.  Joining the remaining segments stays
            # correct even if the host text happens to recur inside the path.
            dictInsert['Remote file_id'] = '/'.join(fileSome[3:])
            dictInsert['imgUrl'] = dictMsg['url']
            # Name of *this* image: the last path segment of the URL being
            # processed, not of whichever URL the queueing loop ended on.
            dictInsert['imgName'] = url.rsplit('/', 1)[-1]
            dictInsert['Group name'] = fileSome[3]
            dictInsert['articleId'] = newsId
            self.new_db['d_news_images'].save(dictInsert)
 def parse_item(self, response):
     """Extract the video address and poster image of a page and store them.

     The info dict attached to the originating request is enriched with
     ``videoUrl`` (first result of ``self.get_addr`` on the decoded body) and
     ``img`` (``src`` of the ``<img>`` inside ``div#poster``), then saved as a
     ``MissionBean`` of type 3 in ``fishing_new``.  Pages for which
     ``get_addr`` finds nothing are skipped without saving.

     :param response: response whose request carries an ``info`` dict with at
         least a ``title`` key
     :return: None
     """
     pageInfo = response.request.info
     pageHtml = response.body.decode()

     # Guard clause: nothing to store when no video address was found.
     addresses = self.get_addr(pageHtml)
     if len(addresses) == 0:
         return
     pageInfo['videoUrl'] = addresses[0]

     soup = BeautifulSoup(response.text, 'html.parser')
     posterTag = soup.select_one('div[id="poster"]')
     pageInfo['img'] = posterTag.select_one('img')['src']

     bean = MissionBean(response.url, 3, ['fishing_new'])
     bean.html = pageHtml
     bean.title = pageInfo['title']
     bean.info = pageInfo
     self.client.save(bean)
Exemple #5
0
    def __getMissionBeanFromRedis(self, requestKeys):
        """Pop one queued mission from redis for a randomly chosen request key.

        ``requestKeys`` is shuffled in place and its first key is looked up
        with ``redisGetBiggetDeepKey``; the JSON value popped from the
        resulting redis key becomes the ``__dict__`` of a fresh
        ``MissionBean`` whose ``downloadMethod`` is set to the request key.

        :param requestKeys: list of request-key strings; reordered in place
        :return: the populated ``MissionBean``, or ``None`` when the list is
            empty, no redis key exists (after a one-second pause) or the
            queue is empty
        """
        random.shuffle(requestKeys)  # random order, so any key may come first
        # NOTE(review): every path in the loop body returns, so only the
        # first shuffled key is ever examined -- confirm that is intended.
        for redisKey in requestKeys:
            keyNow = redisGetBiggetDeepKey(3, redisKey)
            if keyNow is None:
                print(redisKey + '_redis 无任务(不存在任何key)')
                time.sleep(1)
                return None

            # Check the popped value before decoding: json.loads(None) raises
            # TypeError, which used to hide the "queue is empty" case.
            rawMissionBean = redisRPop(3, keyNow)
            strMissionBean = (json.loads(rawMissionBean)
                              if rawMissionBean is not None else None)
            if strMissionBean is None:
                print(redisKey + '_redis 无任务(key中不存在missionBean)')
                return None

            missionBean = MissionBean("", 0, [])
            missionBean.__dict__ = strMissionBean
            missionBean.downloadMethod = redisKey
            return missionBean
        return None
    def parse_item(self, response):
        """Turn an article page into a ``NewsBean`` and hand it to the DAO.

        The prettified markup of ``div.content`` is stored under
        ``info['content']``; the request's ``info`` dict then supplies every
        field of a ``NewsBean``.  The bean's ``__dict__`` becomes the
        ``info`` of a ``MissionBean`` (type 1001, ``qutoutiao``) that is
        passed to ``daoFilterAndSave.MongoFilterSave``.

        :param response: response whose request carries the article ``info``
        :return: None
        """
        info = response.request.info
        pageHtml = response.text
        soup = BeautifulSoup(pageHtml, "html.parser")
        info['content'] = soup.select_one('div[class="content"]').prettify()

        missionBean = MissionBean(response.url, 1001, ['qutoutiao'])
        missionBean.info = info
        missionBean.html = pageHtml
        missionBean.title = info['title']

        # Assemble the production bean from the collected fields.
        newsFields = {
            'titleInfo': info['title'],
            'content': info['content'],
            'url': response.url,
            'newsId': info['id'],
            'tags': info['tag'],
            'etc': {'news_type': info['type']},
            'fromChannel': self.TYPE_DICT.get(int(info['type']), '其他'),
            'fromSpider': '推荐流',
            'fromType': 8,
            'goodNum': int(info['like_num']),
            'commentNum': int(info['comment_count']),
            'readNum': int(info['read_count']),
            'mediaName': info['source_name'],
            'mediaId': info['source_name'],
            'introduction': info['introduction'],
            'imgUrls': info['cover'],
            'shareNum': info['share_count'],
        }
        newsBean = NewsBean()
        for fieldName, fieldValue in newsFields.items():
            setattr(newsBean, fieldName, fieldValue)

        # The mission carries the bean's own attribute dict, so the two
        # timestamp assignments below are visible through missionBean.info.
        missionBean.info = newsBean.__dict__

        # publishDate and createTime are passed as timestamps because of the
        # redis format.
        # TODO only timestamps can be passed
        publishSeconds = int(info['publish_time']) / 1000
        newsBean.publishDate = datetime.datetime.fromtimestamp(
            publishSeconds).timestamp()
        newsBean.createTime = newsBean.createTime.timestamp()
        daoFilterAndSave.MongoFilterSave(missionBean)
from zywa_database_core.dao.mongo.mongoClientMyself import MongoClientMyself
from zywa_extract_helper.model.missionBean import MissionBean

if __name__ == '__main__':
    # One-off script: push every document of the ``iqiyi_video`` collection
    # onto the ``data_clear_<type>`` redis list.
    mongoClient = MongoClientMyself(
        host="172.10.3.219",
        port=20000,
        db="xiaociwei",
        user="******",
        password="******",
    )
    documents = mongoClient.selectAll(tableName='iqiyi_video')
    for position, document in enumerate(documents):
        # Rebuild a MissionBean around the stored document.
        bean = MissionBean('', 0, [])
        bean.__dict__ = document
        print(position)
        print(bean.title)
        queueName = 'data_clear_' + str(bean.type)
        redisLPush(4, queueName, bean.getRedisDict())
    def parse(self, response):
        """Dispatch a hot-word page to its site-specific handler, then re-queue it.

        The handler is chosen by substring match on ``response.url``; the
        checks are independent, so a URL matching several markers is handled
        by each of them in this order:

        * ``top.baidu``  -- yields one ``Request`` per ranking board
        * ``weibo``      -- stores the hot-search rows
        * ``news.163``   -- stores the click and comment rankings
        * ``api.1sapp``  -- stores the recommendation feed

        Any exception raised by a handler is printed and swallowed, and the
        same URL is always requested again afterwards.

        :param response: the downloaded page
        :return: generator of ``Request`` objects
        """
        try:
            if 'top.baidu' in response.url:
                yield from self._parse_baidu_boards(response)
            if 'weibo' in response.url:
                self._save_weibo_hotwords(response)
            if 'news.163' in response.url:
                self._save_163_rankings(response)
            if 'api.1sapp' in response.url:
                self._save_qutoutiao_feed(response)
        except Exception:
            traceback.print_exc()
        finally:
            print("正在添加新任务至队列头部")
            request = Request(url=response.url, dont_filter=True)
            yield request

    def _parse_baidu_boards(self, response):
        """Yield a ``parse_item`` request for every Baidu board link except the first."""
        modes = response.xpath(
            '//div[@class="hblock"]/ul/li/a/@href').extract()
        # XPath positions are 1-based and the first link is skipped, so the
        # second link sits at li[2].  Counting positions directly (instead of
        # modes.index) stays correct when two boards share an href.
        for position, mode in enumerate(modes[1:], start=2):
            news_type = response.xpath(
                '//div[@class="hblock"]/ul/li[{}]/a/@title'.format(
                    position)).extract_first()
            yield Request(url=self.baidu_mainurl + mode[1:],
                          callback=self.parse_item,
                          dont_filter=True,
                          meta={'news_type': news_type},
                          priority=2)

    def _save_weibo_hotwords(self, response):
        """Store every Weibo hot-search row found in the page's embedded script."""
        scripts = response.xpath('//script/text()').extract()
        # The script at index 8 holds the table markup; the index was
        # found by counting on the live page.
        script = scripts[8]
        start = script.find("(")
        # The JSON payload is whatever sits between the first "(" and the
        # script's final character.
        html = json.loads(script[start + 1:-1])['html']
        soup = BeautifulSoup(html, 'html.parser')
        trTags = soup.select('tr[action-type="hover"]')
        print("发现潜在词数量", len(trTags))

        # The board name depends only on the URL, so decide it once.
        news_type = None
        if 'realtimehot' in response.url:
            news_type = '微博热搜'
        if 'socialevent' in response.url:
            news_type = '微博新时代'

        for trTag in trTags:
            nameLink = trTag.find('p', class_='star_name').a
            dictInfo = {
                'index': trTag.find('em').string,
                'title': nameLink.string,
                'url': nameLink.get('href'),
                'resource': '微博',
            }
            try:
                dictInfo['num'] = int(
                    trTag.find('p', class_='star_num').span.string)
            except (AttributeError, TypeError, ValueError):
                # Missing tag, empty text or non-numeric text.
                dictInfo['num'] = -1

            missionBean = MissionBean(dictInfo['url'], 501,
                                      ['train_hotword'])
            missionBean.title = str(dictInfo['title'])
            # When the URL names neither board, the bean keeps whatever
            # ``info`` its constructor set.
            if news_type is not None:
                missionBean.info = {'news_type': news_type}
            missionBean.info.update(dictInfo)
            print(missionBean.title)
            self.client.save(missionBean)

    def _save_163_rankings(self, response):
        """Store the click ranking (type 510) and comment ranking (type 511) of news.163."""
        soup = BeautifulSoup(response.text, "html.parser")
        board = soup.select_one('div[class="area areabg1"]')
        leftAreas = board.select('div[class="area-half left"]')
        rightAreas = board.select('div[class="area-half right"]')
        tabBars = board.select('div[class="title-tab"]')
        # The page lays its sections out in parallel: the n-th titleBar
        # belongs to the n-th left area, right area and title-tab.
        for i, titleBarTag in enumerate(board.select('div[class="titleBar"]')):
            typeName1 = titleBarTag.select_one("h2").get_text()  # category
            liTags = tabBars[i].select('li')
            # Left column first, then right, each with its own bean type.
            for areas, beanType in ((leftAreas, 510), (rightAreas, 511)):
                self._save_163_column(response, areas[i], liTags,
                                      '163' + typeName1, beanType)

    def _save_163_column(self, response, areaTag, liTags, typePrefix,
                         beanType):
        """Store every news row of one news.163 ranking column."""
        typeName2 = areaTag.select_one("h2").get_text()
        tables = areaTag.select('table')
        # The j-th tab label describes the j-th table of the column.
        for j, li in enumerate(liTags):
            typeName3 = li.get_text()
            tableTag = tables[j]
            for newsTag in tableTag.select("tr"):
                # The header row is not a news entry.
                if "标题" in newsTag.get_text():
                    continue
                cells = newsTag.select("td")
                link = newsTag.select_one('a')
                infoDict = {
                    'title': link.get_text(),
                    'index': int(cells[0].select_one("span").get_text()),
                    'num': int(cells[1].get_text()),
                    'upOrDown': -1,
                    'url': link['href'],
                    'news_type': typePrefix + typeName2 + typeName3,
                    'resource': '163',
                }
                missionBean = MissionBean(response.url, beanType,
                                          ['train_hotword'])
                missionBean.info = infoDict
                self.client.save(missionBean)

    def _save_qutoutiao_feed(self, response):
        """Store every entry of one page of the Qutoutiao recommendation feed."""
        jsonBean = json.loads(response.text)
        print(jsonBean)
        # Paging parameters come from the request URL and are the same for
        # every entry, so read them once.
        query = parse.parse_qs(urlparse(response.url).query)
        pageNum = int(query.get('page')[0])
        limitNum = int(query.get('limit')[0])
        offset = (pageNum - 1) * limitNum
        for i, news in enumerate(jsonBean['data']['data']):
            index = i + 1 + offset
            items = {
                'index': index,
                'title': news['title'],
                'news_type': '趣头条推荐流',
                'url': news['url'],
                # Earlier feed positions get a larger value.
                'num': 10000 - index,
                'num1': int(news['read_count']),
                'num2': int(news['share_count']),
                'resource': '趣头条',
            }
            missionBean = MissionBean(items['url'], 500, ['train_hotword'])
            missionBean.title = items['title']
            missionBean.info = items
            self.client.save(missionBean)