Example #1
    def returnSohuURLs(self, index, number, dataList):
        '''Batch-extract Sohu news URLs.
           Process: simulate the backend request to get JSON data, parse each
           article's id from it, then join the article page prefix with the id
           to build the detail-page URL.'''
        # Example of a full query suffix: '&sceneId=1460&page=1&size=20'
        # number is a 'start-end' range string, e.g. '0-20'
        rang = number.split('-')

        start = int(rang[0])
        end = int(rang[1])

        pageSize = end - start
        if pageSize <= 0:
            print('Invalid count configured for the url with suffix ' + index +
                  ' ==> ' + rang[0] + ':' + rang[1])
            return []
        parameter = '&sceneId=' + index
        page = int(end / pageSize)
        parameter = parameter + '&page=' + str(page) + '&size=' + str(pageSize)
        # Build a cookie jar and send the request with headers and cookies
        cookies = requests.cookies.RequestsCookieJar()
        response = requests.get(dataList[0] + parameter,
                                headers=getHeader(),
                                cookies=cookies)
        content = response.text
        # Parse the response text as JSON for easier handling
        data = json.loads(content)
        urls = []
        for temp in data:
            # dataList[1] is the detail-page prefix; join it with the article ids
            url = dataList[1] + str(temp['id']) + '_' + str(temp['authorId'])
            urls.append(url)
        return urls
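
These examples assume import requests and import json at module level, and they all call a getHeader() helper that is not shown. A minimal sketch of what it might look like, assuming it only needs to supply a browser-like User-Agent (the header value here is a placeholder, not from the original source):

def getHeader():
    # Hypothetical implementation: the real getHeader() may also set
    # Referer, Accept, or other fields the target sites expect.
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/96.0.4664.110 Safari/537.36'
    }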
Example #2
    def returnQQURLs(self, number, req):
        '''Batch-extract Tencent (QQ) news URLs; extracts news items only,
           skipping special-topic pages.
           Process: each backend request carries the ids returned by the
           previous request as the expIds parameter.'''
        # Tencent news returns 10 items per request
        pageSize = 10
        rang = number.split('-')
        start = int(rang[0])
        end = int(rang[1])
        pageCount = int((end - start) / pageSize)
        urlList = []
        expIdsList = []

        # Build a cookie jar for the request headers
        cookies = requests.cookies.RequestsCookieJar()
        for i in range(pageCount):
            if i == 0:
                url2 = req + '&page=' + str(i) + '&expIds='
            else:
                url2 = req + '&page=' + str(i) + '&expIds=' + '|'.join(
                    str(id) for id in expIdsList)
            expIdsList.clear()
            # Send the request with headers and cookies
            response = requests.get(url2, headers=getHeader(), cookies=cookies)
            # Parse the response text as JSON
            content = json.loads(response.text)
            dataList = content.get('data')
            for temp in dataList:
                # article_type == 11 means the item points to a special-topic page
                if temp['article_type'] != 11:
                    url = temp['vurl']
                    newsId = temp['id']
                    urlList.append(url)
                    expIdsList.append(newsId)
        return urlList
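
For reference, here is the expIds cursor pattern in isolation. A minimal sketch, assuming a JSON endpoint that accepts a page number and a '|'-joined list of previously returned ids (the function name is illustrative, not from the original source):

import json
import requests

def fetch_with_id_cursor(req, pageCount, headers):
    # Each request carries the ids returned by the previous page, joined
    # with '|', presumably so the backend can exclude items already seen.
    seenIds = []
    items = []
    for page in range(pageCount):
        url = req + '&page=%d&expIds=%s' % (page, '|'.join(seenIds))
        data = json.loads(requests.get(url, headers=headers).text)
        pageItems = data.get('data', [])
        seenIds = [str(item['id']) for item in pageItems]
        items.extend(pageItems)
    return items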
Example #3
    def returnTouTiaoURLs(self, index, number, req):
        '''Batch-extract Toutiao news URLs.
           Process: each backend request carries the max_behot_time value from
           the previous response as a parameter.'''
        max_behot_time = max_behot_time_tmp = 0
        category = index
        rang = number.split('-')
        start = int(rang[0])
        end = int(rang[1])
        pageSize = 10
        pageCount = int((end - start) / pageSize)
        urlList = []
        # Build a cookie jar for the request headers
        cookies = requests.cookies.RequestsCookieJar()
        for i in range(pageCount):
            reqUrl = req + '&category=%s&max_behot_time=%d&max_behot_time_tmp=%d' % (
                category, max_behot_time, max_behot_time_tmp)

            print(reqUrl)
            # Send the request with headers and cookies
            response = requests.get(reqUrl,
                                    headers=getHeader(),
                                    cookies=cookies)
            cookies.update(response.cookies)
            content = json.loads(response.text)
            dataList = content['data']
            for data in dataList:
                if data['article_genre'] == 'article':
                    itemId = data['item_id']
                    # The JSON response does not contain the full detail-page
                    # URL; it has to be joined with the detail-page prefix
                    newsUrl = 'https://www.toutiao.com/a%s' % itemId
                    urlList.append(newsUrl)
            nextDic = content['next']
            nextId = nextDic['max_behot_time']
            max_behot_time = max_behot_time_tmp = nextId
        return urlList
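
The Toutiao example uses a time-based cursor: each response's next.max_behot_time value seeds the next request. A minimal sketch of that loop on its own, assuming the same response shape as above (the function name is illustrative, not from the original source):

import json
import requests

def fetch_with_time_cursor(req, pageCount, headers):
    # The cursor starts at 0; each response supplies the max_behot_time
    # to use for the next page, so consecutive pages do not overlap.
    cursor = 0
    items = []
    for _ in range(pageCount):
        url = req + '&max_behot_time=%d&max_behot_time_tmp=%d' % (cursor, cursor)
        content = json.loads(requests.get(url, headers=headers).text)
        items.extend(content.get('data', []))
        cursor = content['next']['max_behot_time']
    return items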