Example #1
import json
import time

import requests

from uitls import tool  # shared helper module, imported the same way inside Example #4; these imports are assumed by the later snippets as well


class sougou_1():

    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()

    def response(self, keyword):
        url = 'https://search.sohu.com/search/meta'
        data = self.tools.data(keyword)
        response = requests.get(url=url, params=data)
        datas = json.loads(response.text)['data']['media']
        return datas

    def parse(self, data, keyword):  # Parse the author's profile and store it in MySQL; only authors whose total page views and article count meet the thresholds below are stored
        try:
            totalPv = data['scoreMap']['totalPv']
            newsCount = data['scoreMap']['newsCount']
            if totalPv >= 100000 and newsCount >= 20:
                author = data['userName']  # user nickname
                home_url = data['weiboUrl']  # user homepage URL
                avatar = data['avatorUrl']  # user avatar URL
                if avatar.split('//')[0] == "http:":
                    avatar_url = avatar
                else:
                    avatar_url = 'http:' + avatar
                brief = data['description']  # author bio
                # fans_num = re.findall('fans_num.*?:\"(.*?)\",', html)[0]  # follower count (leftover; html is not defined here)
                create_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())  # creation time
                source_name = '搜狐号'  # source name
                biz = "souhu" + str(data['id'])
                # follow_num = re.findall('follow_num.*?:\"(.*?)\",', html)[0]  # following count (leftover; html is not defined here)
                tags = keyword  # the search keyword is stored as the tag

                sql1 = 'select * from spider_user_info where author="%s"' % author
                cursor = self.tools.sqll(sql1)
                result = cursor.fetchall()
                if not result:
                    # data['id'] supplies the id column (the same platform id used to build biz above); bare id would be the Python builtin
                    sql2 = 'insert into spider_user_info(id,author,biz,avatar_url,home_url, source_name, brief, create_time,tags) values("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        data['id'], author, biz, avatar_url, home_url, source_name,
                        brief, create_time, tags)
                    self.tools.sqll(sql2)
                    print(data['id'], biz, author, home_url, avatar_url, source_name,
                          brief, create_time, tags)

                else:
                    pass
            else:
                pass
        except Exception as e:
            print(e)
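
A minimal driver sketch for this class, assuming tool().list_() returns the search keywords exactly as in Example #5:

if __name__ == '__main__':
    spider = sougou_1()
    for keyword in tool().list_():              # keywords to search for
        for data in spider.response(keyword):   # one media entry per author found
            spider.parse(data, keyword)         # store qualifying authors, tagged with the keyword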
Example #2
class qiehao_1():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()

    def response(self, keyword):
        url = 'https://r.inews.qq.com/verticalSearch?chlid=_qqnews_custom_search_qiehao&search_from=click&uid=44ce3651532c37f9&omgid=e76c5bfab95b6547ffab46fb08c39bd795f60010213414&trueVersion=5.8.12&qimei=44ce3651532c37f9&appver=25_android_5.8.12&devid=44ce3651532c37f9&Cookie=lskey%3D;skey%3D;uin%3D;%20luin%3D;logintype%3D0;%20main_login%3D;%20&qn-sig=f50e2c8c758767a6bc87be6605573722&qn-rid=219c9f88-e74a-4670-bb7d-3497cec83c8a'
        data = self.tools.data(keyword)
        response = requests.post(url,
                                 data=data,
                                 headers=self.headers,
                                 timeout=15)
        html = json.loads(response.text)
        datas = html['secList']
        return datas

    def fan_num(self, url):
        try:
            fans_num = json.loads(
                requests.get(url).text)['channelInfo']['subCount']
        except:
            fans_num = 0
        return fans_num

    def parse(self, data, keyword):  # Parse the author's profile and store it in MySQL; only authors with at least one follower that are not already in the table are stored
        try:
            author = data['chlname']  # user nickname
            biz = "qiehao" + str(data['chlid'])
            home_url = 'https://r.inews.qq.com/getSubItem?chlid={}'.format(
                data['chlid'])  # user homepage URL
            avatar_url = data['imgurl']  # user avatar URL
            brief = data['abstract']  # author bio
            create_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime())  # creation time
            source_name = '企鹅号'  # source name
            fans_num = int(self.fan_num(home_url))  # follower count
            tags = keyword  # the search keyword is stored as the tag
            sql1 = 'select * from spider_user_info where author="%s"' % author
            cursor = self.tools.sqll(sql1)
            result = cursor.fetchall()
            if not result and fans_num >= 1:
                # data['chlid'] supplies the id column (the same channel id used to build biz above); bare id would be the Python builtin
                sql2 = 'insert into spider_user_info(id,author,biz,home_url,avatar_url,brief,create_time,source_name,fans_num,tags) values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                    data['chlid'], author, biz, home_url, avatar_url, brief, create_time,
                    source_name, fans_num, tags)
                self.tools.sqll(sql2)
                print(data['chlid'], author, biz, home_url, avatar_url, brief,
                      create_time, source_name, fans_num, tags)
                      create_time, source_name, fans_num, tags)
            else:
                pass
        except Exception as e:
            print(e)
Example #3
class baijia_2():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()
    browser = tools.browser()


    def checksql(self):  # Pick one Baijiahao author whose followed accounts have not been crawled yet (status = 0) and mark the record as in progress
        sql1='select biz from spider_user_info where status = 0 and source_name = "百家号" limit 0,1 '
        cursor = self.tools.sqll(sql1)
        result = cursor.fetchall()
        biz=result[0]['biz']
        uk=biz.split('/')[1]
        sql2='update spider_user_info set status=1 where biz="%s"' % biz
        self.tools.sqll(sql2)
        return uk


    def response(self, id):  # Fetch the profile page source for the given author id (app_id or uk)
        for i in self.dict_:
            self.browser.add_cookie({
                'name': i,
                'value': self.dict_[i],
            })
        if id.isdigit():
            url = 'https://author.baidu.com/profile?context={%22from%22:%22dusite_sresults%22,%22app_id%22:%22' + id + '%22}&cmdType=&pagelets[]=root&reqID=0&ispeed=1'
        else:
            url = 'https://author.baidu.com/profile?context={%22from%22:%22dusite_sresults%22,%22uk%22:%22' + id + '%22}&cmdType=&pagelets[]=root&reqID=0&ispeed=1'
        try:
            self.browser.get(url)
        except:
            pass
        html = self.browser.page_source
        return html


    def fans(self, uk):  # Get the ids of the accounts this author follows
        ids=[]
        try:
            url = 'https://mbd.baidu.com/webpage?action=personaljumpsublist&type=subscribe&uk={}'.format(uk)
            response = requests.get(url=url)
            response.encoding = 'utf-8'
            html = response.text
            datas = json.loads(html)
            follows = datas['data']['follow_list']['modify']
            for follow in follows:
                id = follow['third_id']
                ids.append(id)
            return ids
        except Exception as e:
            print(e)
            return ids


    def parse(self, id):  # Parse a followed account's profile and insert it into the database if it is not already there
        html=self.response(id)
        try:
            html = html.replace("\\", "")
            author = re.findall('display_name\":\"(.*?)\"', html)[0].encode("gb18030", "ignore").decode("utf8","ignore").replace("\\", "")  # user nickname
            third_id = re.findall('third_id\":\"(.*?)",', html)[0]
            home_url = 'https://author.baidu.com/home/' + third_id  # user homepage URL
            avatar_url = re.findall('avatar_raw\":\"(.*?)",', html)[0]  # user avatar URL
            brief = str(re.findall('sign\":\"(.*?)\",', html)[0]).encode("gb18030", "ignore").decode("utf8","ignore").replace("\\","")  # author bio
            fans_num = re.findall('fans_num.*?:\"(.*?)\",', html)[0]  # follower count
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # creation time
            source_name = '百家号'  # source name
            follow_num = re.findall('follow_num.*?:\"(.*?)\",', html)[0]  # following count
            uk = re.findall('uk.*?:\"(.*?)\",', html)[0]  # uk code
            biz = third_id + '/' + uk
            if int(fans_num) >= 10000:
                sql3='select biz from spider_user_info where biz="%s"' % biz
                cursor = self.tools.sqll(sql3)
                result = cursor.fetchall()
                if not result:
                    sql4='insert into spider_user_info(author,home_url,fans_num,avatar_url,source_name,brief,biz,create_time) values("%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        author, home_url, fans_num, avatar_url, source_name, brief, biz, create_time)
                    self.tools.sqll(sql4)
                    print(author, home_url, third_id, fans_num, avatar_url, source_name, brief, biz, create_time,follow_num, uk)
                else:
                    pass

        except Exception as e:
            print(e)
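
A minimal crawl loop sketch chaining the three methods above: pick one stored author, list who they follow, then parse and store each followed account:

if __name__ == '__main__':
    spider = baijia_2()
    uk = spider.checksql()                  # one Baijiahao author not yet processed (status = 0)
    for followed_id in spider.fans(uk):     # ids of the accounts that author follows
        spider.parse(followed_id)           # insert each followed account if it is new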
Example #4
class baijia_1():
    from uitls import tool
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()
    browser = tools.browser()

    def get_author(self, i, j):  # Get the homepage URLs of the authors on result page j for keyword i
        datas = {
            "word": "{}+百家号".format(i),
            "pd": "cambrian_list",
            "atn": "index",
            "title": "{}+百家号".format(i),
            "lid": "8219726775998088715",
            "ms": "1",
            "frsrcid": "206",
            "frorder": "1",
            "sig": "593303",
            "pn": 10 * j,
            "mod": "1",
        }
        url = 'https://m.baidu.com/sf'
        response = requests.get(url, params=datas, headers=self.headers)
        response.encoding = 'utf-8'
        html = response.text
        tree = etree.HTML(html)
        datas = tree.xpath('//div[@class="sfc-cambrian-list-subscribe"]')
        urls = []
        for data in datas:
            url = data.xpath('./div/a/@href')[0]
            urls.append(url)
        return urls

    def get_id(self, url):  # Extract the author's app_id from the homepage URL
        response = requests.get(url=url)
        response.encoding = 'utf-8'
        html = response.text
        try:
            app_id = re.findall('home/(.*)\?from=dusite_sresults"', html)[0]
            return app_id
        except:
            pass

    def homepage(self, app_id):  # Fetch the author's profile page source by app_id
        for i in self.dict_:
            self.browser.add_cookie({
                'name': i,
                'value': self.dict_[i],
            })
        url = 'https://author.baidu.com/profile?context={%22from%22:%22dusite_sresults%22,%22app_id%22:%22' + str(
            app_id) + '%22}&cmdType=&pagelets[]=root&reqID=0&ispeed=1'
        self.browser.get(url)
        html = self.browser.page_source
        return html

    def parse(self, html, app_id):  # Parse the author's profile and store it in MySQL; only authors with at least 10000 followers are stored (app_id is needed for home_url and biz)
        html = html.replace("\\", "")
        author = re.findall('display_name\":\"(.*?)\"', html)[0].encode(
            "gb18030", "ignore").decode("utf8", "ignore").replace("\\", "")  # user nickname
        home_url = 'https://author.baidu.com/home/' + app_id  # user homepage URL
        avatar_url = re.findall('avatar_raw\":\"(.*?)",', html)[0]  # user avatar URL
        brief = str(re.findall('sign\":\"(.*?)\",', html)[0]).encode(
            "gb18030", "ignore").decode("utf8", "ignore").replace("\\", "")  # author bio
        fans_num = re.findall('fans_num.*?:\"(.*?)\",', html)[0]  # follower count
        create_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime())  # creation time
        source_name = '百家号'  # source name
        follow_num = re.findall('follow_num.*?:\"(.*?)\",', html)[0]  # following count
        uk = re.findall('uk.*?:\"(.*?)\",', html)[0]  # uk code
        biz = app_id + '/' + uk
        if int(fans_num) >= 10000:
            sql1 = 'select biz from spider_user_info where biz="%s"' % biz
            cursor = self.tools.sqll(sql1)
            result = cursor.fetchall()
            if not result:
                sql2 = 'insert into spider_user_info(author,home_url,fans_num,avatar_url,source_name,brief,biz,create_time) values("%s","%s","%s","%s","%s","%s","%s","%s")' % (
                    author, home_url, fans_num, avatar_url, source_name, brief,
                    biz, create_time)
                self.tools.sqll(sql2)
                print(author, home_url, fans_num, avatar_url, source_name,
                      brief, biz, create_time, follow_num, uk)
            else:
                pass
Example #5


if __name__ == '__main__':
    spider = baijia_1()                    # instance of the Baijiahao search spider (Example #4)
    tools = tool()
    list_ = tools.list_()                  # search keywords
    for i in list_:
        for j in range(0, 10):             # first 10 result pages per keyword
            urls = spider.get_author(i, j)
            for url in urls:
                app_id = spider.get_id(url)
                if app_id is None:         # get_id returns None when the pattern is not found
                    continue
                html = spider.homepage(app_id)
                spider.parse(html, app_id)  # app_id is passed through to parse (see Example #4)
Example #6
class souhu_2():
    tools = tool()
    dict_ = tools.dict_()

    def checksql(self):  # Fetch one author record whose articles have not been crawled yet (status = 0)
        sql1 = 'select * from spider_user_info where status = 0 and source_name = "搜狐号" limit 0,1 '
        cursor = self.tools.sqll(sql1)
        result = cursor.fetchall()
        info = result[0]
        return info

    def article(self, info):  # Fetch the listing pages for roughly 100 of the author's articles (5 pages)
        author_id = info['biz'].replace('souhu', "")
        datas = []
        try:
            for i in range(1, 6):
                url = 'https://v2.sohu.com/author-page-api/author-articles/wap/{}?pNo={}'.format(
                    author_id, i)
                response = requests.get(url=url)
                data = json.loads(response.text)['data']['wapArticleVOS']
                datas.extend(data)
            return datas
        except:
            return datas

    def read_num(self, article_id):  # Get the article's read count (page views)
        url = 'https://v2.sohu.com/author-page-api/articles/pv?articleIds={}'.format(
            article_id)
        response = requests.get(url=url)
        num = json.loads(response.text)[str(article_id)]
        return num

    def content(self, article_id):  # Get the first page of top-level comments for the article
        content_replys = []
        url = 'https://api.interaction.sohu.com/api/comments/maincomments?source_id=mp_{}&page_no=1&page_size=10&reply_count=10&type=0'.format(
            article_id)
        try:
            response = requests.get(url=url)
            comments = json.loads(response.text)
            for comment in comments:
                content_reply = {
                    'thumb_num': comment['displayStatus'],
                    'content': comment['content']
                }
                content_replys.append(content_reply)
            return content_replys
        except:
            content_replys = None
            return content_replys

    def parse(self, info):
        datas = self.article(info)
        if len(datas) == 0:
            pass
        else:
            return datas

    def parse_2(self, data, info):  # Build one enriched article record; info is the author row returned by checksql
        article = {}
        try:
            article_id = data['id']
            num = self.read_num(article_id)
            article['read_num'] = num
            article['author'] = info['author']
            article['avatar_url'] = info['avatar_url']
            article['title'] = data['title']
            article['source_url'] = "https://" + data['link']
            article['source_name'] = '搜狐号'
            img_url = data['cover']
            if img_url.split('//')[0] == "http:":
                article['img_url'] = img_url
            else:
                article['img_url'] = 'http:' + img_url
            article['published_time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(int(str(data['publicTime'])[:-3])))
            article["author_info"] = {
                "biz": info['biz'].replace('souhu', ''),  # bizID
                "brief": self.utils.filter_emoji(info['brief'])  # 摘要信息
            }
            content_replys = self.content(str(article_id))
            article['content_reply'] = content_replys
            print(article)

        except:
            pass
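
A minimal pipeline sketch wiring checksql, parse and parse_2 together (parse_2 takes the author row explicitly):

if __name__ == '__main__':
    spider = souhu_2()
    info = spider.checksql()            # one Sohu author with status = 0
    datas = spider.parse(info)          # article listings, or None if nothing was fetched
    if datas:
        for data in datas:
            spider.parse_2(data, info)  # add read count, comments and author info to each article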
Example #7
class baijia_3():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()
    browser = tools.browser()

    def checksql(self):  # Fetch one author record whose articles have not been crawled yet (status = 1)
        sql1 = 'select * from spider_user_info where status = 1 and source_name = "百家号" limit 0,1 '
        cursor = self.tools.sqll(sql1)
        result = cursor.fetchall()
        info = result[0]
        return info

    def article(self, info):  # Fetch the page listing up to 100 of the author's articles
        uk = info['biz'].split('/')[1]
        for i in self.dict_:
            self.browser.add_cookie({
                'name': i,
                'value': self.dict_[i],
            })
        url = 'https://author.baidu.com/list?type=article&tab=2&uk={}&num=100'.format(
            uk)
        try:
            self.browser.get(url)
        except:
            pass
        html = self.browser.page_source
        return html, uk

    def read_point(self, dynamic, thread, uk):  # Get read count, like count and comment count for an article
        bian = '%5b%7b%22user_type%22%3a%223%22%2c%22dynamic_id%22%3a%22{}%22%2c%22dynamic_type%22%3a%222%22%2c%22dynamic_sub_type%22%3a%222001%22%2c%22thread_id%22%3a%22{}%22%2c%22feed_id%22%3a%22{}%22%7d%5d'.format(
            dynamic, thread, dynamic)
        response = requests.get(
            'https://mbd.baidu.com/webpage?type=homepage&action=interact&format=jsonp&params={}&uk={}'
            .format(bian, uk),
            timeout=10)
        nums = json.loads(
            response.text.replace('callback(',
                                  '').replace(')', ''))['data']['user_list']
        return nums

    def content(self, thread):  # Get up to 10 comments (text and like count) for the article
        comment_url = 'https://ext.baidu.com/api/comment/v1/comment/getlist?appid=101&start=0&num=10&thread_id={}'.format(
            thread)
        try:
            r1 = requests.get(url=comment_url, timeout=10)
            comments = json.loads(r1.text)['ret']['list']
            content_replys = []
            if len(comments) >= 10:
                for i in range(10):
                    content_reply = {
                        'thumb_num': comments[i]['like_count'],
                        'content': comments[i]['content']
                    }
                    content_replys.append(content_reply)
                return content_replys
            else:
                for comment in comments:
                    content_reply = {
                        'thumb_num': comment['like_count'],
                        'content': comment['content']
                    }
                    content_replys.append(content_reply)
                return content_replys
        except:
            content_replys = None
            return content_replys

    def parse(self, info):
        html, uk = self.article(info)
        articles = []
        try:
            new = json.loads(re.findall('.*">(.*?)<', html)[0])
            datas = new['data']['list']
            for data in datas:
                article = {}
                try:
                    dynamic = data['dataAttrs']['dynamic-id']
                    thread = data['dataAttrs']['thread-id']
                    nums = self.read_point(dynamic, thread, uk)
                    article['reply_num'] = tuple(
                        nums.values())[0]['comment_num']  # comment count
                    article['read_num'] = tuple(
                        nums.values())[0]['read_num']  # read count
                    article['thumb_num'] = tuple(
                        nums.values())[0]['praise_num']  # like count
                    article['author'] = info['author']  # author
                    article['avatar_url'] = info['avatar_url']  # avatar
                    article['title'] = data['title']  # article title
                    article['source_url'] = data['url']  # article URL
                    article['source_name'] = '百家号'
                    article['img_url'] = data['cover_images'][0][
                        'src']  # article image (first of the three covers)
                    article['published_time'] = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.localtime(data['publish_at']))  # publish time
                    article["author_info"] = {
                        "biz": info['biz'],  # biz id
                        "fans_num": info['fans_num'],  # follower count
                        "brief": (info['brief'])  # author bio
                    }
                    content_replys = self.content(thread)
                    article['content_reply'] = content_replys  # comment details
                    print(article)
                    articles.append(article)
                except:
                    pass
            biz = info['biz']
            sql2 = 'update spider_user_info set status=2 where biz="%s"' % biz
            self.tools.sqll(sql2)
            return articles
        except Exception as e:
            print(e)
            biz = info['biz']
            sql3 = 'update spider_user_info set status=3 where biz="%s"' % biz
            self.tools.sqll(sql3)
            return articles
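
A minimal driver sketch for this class; parse itself moves the author's status to 2 on success or 3 on failure:

if __name__ == '__main__':
    spider = baijia_3()
    info = spider.checksql()           # one Baijiahao author with status = 1
    articles = spider.parse(info)      # fetch and enrich up to 100 of that author's articles
    print(len(articles), 'articles collected for', info['author'])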