Ejemplo n.º 1
0
    def prejudge(self):
        """Pre-processing before parsing the page.

        Fetches ``self.comment_url`` through a proxy and keeps the
        response only when the JSON payload reports ``'success'``.
        Failed proxies are discarded via ``rem_proxy``.

        :return: True when the request should be re-queued (connection
            dropped mid-transfer), False otherwise.
        """
        reput = False
        # Guarantee the attribute always exists: if requests.get() raises
        # (e.g. ConnectionError) before the assignment below, callers would
        # otherwise hit an AttributeError when reading self.response.
        self.response = None
        proxies, self.ip = set_proxy()
        try:
            self.response = requests.get(self.comment_url, proxies=proxies, timeout=9)
            if json.loads(self.response.text)['message'] != 'success':
                logging.warning('invalid page:{}'.format(self.url))
                self.response = None
        except (Timeout, ConnectTimeout, ReadTimeout) as e:
            self.response = None
            # logging.error for consistency with the sibling prejudge()
            # implementations (this one used logging.info).
            logging.error('请求超时:' + str(e))  # request timed out
            rem_proxy(self.ip)
        except ConnectionError as e:
            reput = True
            logging.error('网络中断连接错误:' + str(e))  # connection dropped
            rem_proxy(self.ip)
        except HTTPError as e:
            self.response = None
            logging.error('Http错误:' + str(e))  # HTTP status error
            rem_proxy(self.ip)

        return reput
Ejemplo n.º 2
0
    def prejudge(self):
        """Pre-processing before parsing the page.

        Fetches the kuaibao article page for ``self.id`` through a proxy;
        non-2xx statuses are converted to HTTPError by raise_for_status().

        :return: True when the request should be re-queued after a dropped
            connection, False otherwise.
        """
        reput = False
        # Guarantee the attribute always exists: if requests.get() raises
        # before the assignment inside the try block, self.response would
        # otherwise be left unset on the ConnectionError path.
        self.response = None
        proxies, self.ip = set_proxy()
        try:
            url = 'http://kuaibao.qq.com/s/{}'.format(self.id)
            self.response = requests.get(url, proxies=proxies, timeout=15)
            self.response.raise_for_status()
        except (Timeout, ConnectTimeout, ReadTimeout) as e:
            self.response = None
            logging.error('请求超时:' + str(e))  # request timed out
            rem_proxy(self.ip)
        except ConnectionError as e:
            reput = True
            logging.error('网络中断连接错误:' + str(e))  # connection dropped
            rem_proxy(self.ip)
        except HTTPError as e:
            self.response = None
            logging.error('Http错误:' + str(e))  # HTTP status error
            rem_proxy(self.ip)

        return reput
Ejemplo n.º 3
0
    def prejudge(self):
        """Pre-processing before parsing the page.

        Fetches ``self.url`` through a proxy, validates the HTTP status,
        forces UTF-8 decoding and attaches an lxml ``xpath`` helper to the
        response object for later parsing.

        :return: True when the request should be re-queued after a dropped
            connection, False otherwise.
        """
        reput = False
        # Guarantee the attribute always exists: if requests.get() raises
        # before the assignment inside the try block, self.response would
        # otherwise be left unset on the ConnectionError path.
        self.response = None
        proxies, self.ip = set_proxy()
        try:
            self.response = requests.get(self.url, proxies=proxies, timeout=15)
            self.response.raise_for_status()
            self.response.encoding = 'utf-8'
            self.response.xpath = etree.HTML(self.response.text).xpath
        except (Timeout, ConnectTimeout, ReadTimeout) as e:
            self.response = None
            logging.error('请求超时:' + str(e))  # request timed out
            rem_proxy(self.ip)
        except ConnectionError as e:
            reput = True
            logging.error('网络中断连接错误:' + str(e))  # connection dropped
            rem_proxy(self.ip)
        except HTTPError as e:
            self.response = None
            logging.error('Http错误:' + str(e))  # HTTP status error
            rem_proxy(self.ip)

        return reput
Ejemplo n.º 4
0
def get_followers_count(id):
    """Print and return the follower count of a weibo user.

    :param id: weibo uid. (The name shadows the builtin ``id``; kept
        unchanged so existing callers are unaffected.)
    :return: follower count as reported by the mobile container API.
    """
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={}'.format(
        id)

    response = request.get(url, headers=headers)
    info = json.loads(response.text)

    # Return the value as well as printing it so callers can use it
    # programmatically (the original only printed and returned None).
    followers = info['data']['userInfo']['followers_count']
    print(followers)
    return followers
Ejemplo n.º 5
0
def get_followers_count(id):
    """Print and return the follower count of a bilibili user.

    :param id: bilibili uid (vmid). (The name shadows the builtin ``id``;
        kept unchanged so existing callers are unaffected.)
    :return: follower count from the relation/stat endpoint.
    """
    url = 'https://api.bilibili.com/x/relation/stat?jsonp=jsonp&vmid={}'.format(
        id)

    response = request.get(url, headers=headers)
    info = json.loads(response.text)

    # Return the value as well as printing it so callers can use it
    # programmatically (the original only printed and returned None).
    followers = info['data']['follower']
    print(followers)
    return followers
Ejemplo n.º 6
0
def get_id(url):
    """Resolve a bilibili page URL to its numeric av-id.

    Follows the page's canonical <link> element and strips the ``av``
    prefix from the fifth path segment.
    """
    response = request.get(url, headers=headers)
    canonical = ''.join(
        html.etree.HTML(response.text).xpath(
            '//head//link[@rel="canonical"]//@href'))
    video_id = canonical.split('/')[4].replace('av', '')
    print(video_id)
    return video_id
Ejemplo n.º 7
0
 def get_comment(self):
     """Collect five pages of comments for this article.

     Each raw comment from the 1sapp API is normalised into the result
     schema and the list is stored under ``self.result['comments']``.
     """
     comments = []
     proxies, self.ip = set_proxy()
     # The original while-True loop with a page counter always fetched
     # exactly five pages; an explicit range makes that intent obvious
     # and removes the unbounded-loop shape.
     for page in range(1, 6):
         url = 'https://api.1sapp.com/comment/?content_id={}&page={}'.format(
             self.id, page)
         response = requests.get(url, proxies=proxies, timeout=15)
         ob_json = json.loads(response.text)
         for data in ob_json['data']:
             # API timestamps are "YYYY-mm-dd HH:MM:SS" strings; convert
             # to a Unix epoch integer.
             create_time = time.strptime(
                 str(data['create_time']), "%Y-%m-%d %H:%M:%S")
             comments.append({
                 'originId': data['comment_id'],
                 'content': data['comment'],
                 'reply_count': data['reply_number'],
                 'publishTime': int(time.mktime(create_time)),
                 'likeNum': data['like_num'],
                 'username': data['nickname'],
                 'avatar': data['avatar'],
             })
     self.result['comments'] = comments
Ejemplo n.º 8
0
def get_data(id):
    """Print forward/comment/attitude totals for a weibo status.

    :param id: weibo status id. (The name shadows the builtin ``id``;
        kept unchanged so existing callers are unaffected.)
    """
    # The three endpoints differ only in path; drive them from a table
    # instead of triplicating the request/parse code.  Dict insertion
    # order preserves the original output ordering.
    endpoints = {
        'forward': 'https://m.weibo.cn/api/statuses/repostTimeline?id={}',
        'comments': 'https://m.weibo.cn/api/comments/show?id={}',
        'attitudes': 'https://m.weibo.cn/api/attitudes/show?id={}',
    }
    data = {}
    for name, template in endpoints.items():
        response = request.get(template.format(id), headers=headers)
        info = json.loads(response.text)
        data[name] = info['data']['total_number']
    print(data)
Ejemplo n.º 9
0
def get_commentupdate():
    """Fetch pending comment jobs and route them to the per-site queue.

    Jobs whose URL contains 'toutiao' go to ``toutiao_comment``;
    everything else goes to ``kuaibao_comment``.
    """
    response = requests.get(commentupdate_api)
    ob_json = json.loads(response.text)
    num = 0
    for comment in ob_json['data']:
        num += 1
        # Both branches printed the same two lines; only the target
        # queue differs, so select it and share the rest.
        target = toutiao_comment if 'toutiao' in comment['url'] else kuaibao_comment
        target.put(comment)
        print('已添加{}'.format(comment['url']))
        print(num)
Ejemplo n.º 10
0
def get_data(id):
    """Print coin/like/view/share/favorite stats for a bilibili video."""
    url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid={}'.format(
        id)
    info = json.loads(request.get(url, headers=headers).text)
    stats = info['data']
    data = {
        'coin': stats['coin'],
        'like': stats['like'],
        'view': stats['view'],
        'share': stats['share'],
        'favorite': stats['favorite'],  # 收藏 = bookmarks
    }
    print(data)
Ejemplo n.º 11
0
def get_data_article(id):
    """Print coin/like/view/comment/share/favorite stats for a bilibili article."""
    url = 'https://api.bilibili.com/x/article/viewinfo?id={}&mobi_app=h5&from=homepage_0&jsonp=jsonp'.format(
        id)
    info = json.loads(request.get(url, headers=headers).text)
    stats = info['data']['stats']
    # Output label -> API field (the API calls the comment count 'reply';
    # 'favorite' is 收藏 / bookmarks).
    field_map = (
        ('coin', 'coin'),
        ('like', 'like'),
        ('view', 'view'),
        ('comment', 'reply'),
        ('share', 'share'),
        ('favorite', 'favorite'),
    )
    data = {label: stats[field] for label, field in field_map}
    print(data)
Ejemplo n.º 12
0
 def get_comment(self):
     """Collect up to five comments for this toutiao article.

     Comments whose text matches the filter ``judge(text, key)`` are
     skipped; at most five surviving comments, normalised into the result
     schema, are stored under ``self.result['comments']``.
     """
     comments = []
     proxies, self.ip = set_proxy()
     response = requests.get(self.comment_url, proxies=proxies, timeout=9).text
     ob_json = json.loads(response)
     count = 0
     for comment in ob_json['data']:
         comment = comment.get('comment')
         comment_text = comment.get('text')
         if judge(comment_text, key):
             continue
         create_time = int(comment.get('create_time'))
         dongtai_id = comment.get('id')
         # dict.get() never raises, so the original try/except here was
         # dead code; use an explicit default of 0 instead (matching the
         # fallback the except clause intended to provide).
         digg_count = comment.get('digg_count', 0)
         user_name = comment.get('user_name')
         user_profile_image_url = comment.get('user_profile_image_url')
         info = {
             'originId': dongtai_id,
             'content': comment_text,
             'publishTime': create_time,
             'likeNum': digg_count,
             'username': user_name,
             'avatar': user_profile_image_url,
         }
         comments.append(info)
         count += 1
         if count >= 5:
             break
     self.result['comments'] = comments
Ejemplo n.º 13
0
from tools.myrequests import _requests as request
from tools.myrequests import Headers as headers
import json
from pyquery import PyQuery
import time
import queue
import threading
import logging
from lxml import html


def get_followers_count(id):
    """Print and return the follower count of a bilibili user.

    :param id: bilibili uid (vmid). (The name shadows the builtin ``id``;
        kept unchanged so existing callers are unaffected.)
    :return: follower count from the relation/stat endpoint.
    """
    url = 'https://api.bilibili.com/x/relation/stat?jsonp=jsonp&vmid={}'.format(
        id)

    response = request.get(url, headers=headers)
    info = json.loads(response.text)

    # Return the value as well as printing it so callers can use it
    # programmatically (the original only printed and returned None).
    followers = info['data']['follower']
    print(followers)
    return followers


if __name__ == '__main__':
    # Ad-hoc smoke test: fetch a b23.tv short-link page and dump its
    # raw HTML to stdout.
    # get_followers_count(321548603)
    short_url = 'https://b23.tv/8JESgv'
    print(request.get(short_url, headers=headers).text)