Example #1
 def get(self, url, headers=None, proxies=None, timeout=5, timeoutRetry=5):
     '''
     :param url: target URL
     :param headers: request headers
     :param proxies: proxy IP
     :param timeout: timeout in seconds
     :param timeoutRetry: number of retries on failure
     :return: response HTML text, or None if all retries fail
     '''
     if not headers:
         headers={'User-Agent':random.choice(UA),
                  'Cookie': '__DAYU_PP=6NEQzBVeevByUuMjrEzEffffffff8a15a08bae39; q_c1=9c46a0fb9939422881967decd788e462|1521676280000|1521676280000; _zap=fd45d673-aa0f-4999-ae2d-57dce5ceae92; l_cap_id="MTVhZTgwMmE2NjJmNDU3ZjliYWNmYzMyNjdiZTAzODE=|1524095802|97edbda534c1a22c5d8db4392a9179c1d21b400e"; r_cap_id="MDVjMDY3MDk0YjkwNGFhNWI2NDkwM2FiNzdlOThmN2U=|1524095802|0b58fcce4bf7e66bf1184ac9e18e0ee5bb697c93"; cap_id="YTljOWU3ZDQ2YWZjNDBlZGFiMzBkMWZkYTIzNDljZjQ=|1524095802|c914e7f3996b7dac4464d21b80c67f46ef35b4fb"; capsion_ticket="2|1:0|10:1524095932|14:capsion_ticket|44:ZGU5NzQwZjE2Mjk0NDJmNDhiMjIyNTRiZmU5ZWVmNmM=|be383318d21315d6c3b0b662368773b0223a71e1bf5e6426a12aa94e5a8e0221"; __utma=155987696.1438037459.1523587187.1523587187.1523955550.2; __utmz=155987696.1523587187.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAANWDWRwK9AEAjDIUOoWFTi0g+Pvf; _xsrf=c860fc1b-7a7d-46c8-ae16-3ef95d1dbe1f; d_c0="ALAvfaeYdA2PTopEc437l1BsFz92c33MUBE=|1523942945"; l_n_c=1; n_c=1; __utmc=155987696; z_c0="2|1:0|10:1524095942|4:z_c0|92:Mi4xbWJaS0JnQUFBQUFBc0M5OXA1aDBEU1lBQUFCZ0FsVk54aW5GV3dDUGY2Z3F1R29xNTZwekVaSXJ3WG90Y2RnVmNR|0227ba78d786d9e305d14d06d65e7a232bc6c03555df0ffcc52a4206a1f9b5f4"'
         }
     if not proxies:
         proxylist=getProxy(1)
         # requests expects a scheme-keyed proxies dict, e.g. {'http': ...}
         proxies={'http': random.choice(proxylist)}
     # issue the requests.get request
     try:
         res=requests.get(url,headers=headers,proxies=proxies,timeout=timeout)
         res.raise_for_status()
         htmlCode=res.text
     except Exception as e:
         logging.error('getEXCEPT:{}'.format(e))
         if timeoutRetry>0:
             proxylist=getProxy(1)
             # retry with a fresh proxy; again a scheme-keyed dict for requests
             htmlCode=self.get(url=url, headers=headers,
                               proxies={'http': random.choice(proxylist)},
                               timeoutRetry=timeoutRetry-1)
         else:
             logging.error('getTIMEOUT:{}'.format(url))
             htmlCode=None
     return htmlCode
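For context, a minimal self-contained sketch of the same retry-with-fresh-proxy idea, trimmed down from the method above (the proxy address is a placeholder; a real pool would come from getProxy()):

import logging
import random
import requests

def fetch(url, proxylist, timeout=5, retries=3):
    # try each attempt with a randomly chosen proxy, as get() does
    for _ in range(retries + 1):
        proxies = {'http': random.choice(proxylist)}
        try:
            res = requests.get(url, proxies=proxies, timeout=timeout)
            res.raise_for_status()
            return res.text
        except Exception as e:
            logging.error('fetch failed: %s', e)
    return None

# placeholder proxy address, for illustration only
html = fetch('http://example.com', ['http://127.0.0.1:8080'])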
Example #2
def parseMessage(processID):
    '''
    Parse all crawled tokens
    :param processID: process number, just for testing
    :return: None
    '''
    while True:
        #check the remaining proxy count; replenish and take a long sleep
        if r.scard('proxy') < 10:
            if processID == 0:
                getProxy.getProxy()
            time.sleep(100)

        #check whether the crawl has finished
        if r.scard('oldID') == 0:
            time.sleep(20)
            if r.scard('oldID') == 0:
                print('process (%s): parsing finished' % processID)
                break
            else:
                continue

        #pop a random token and parse it
        parseToken = r.spop('oldID')
        messageUrl = 'https://www.zhihu.com/api/v4/members/{token}?include=name,url_token,educations,business,locations,employments,gender,following_count,follower_count,voteup_count,thanked_count,favorited_count,answer_count,articles_count,question_count.topics'
        proxy = r.srandmember('proxy')
        htmlCode = httpClass.get(messageUrl.format(token=parseToken),
                                 headers={
                                     'User-Agent': random.choice(Setting.UA),
                                     'Cookie': random.choice(Setting.Cookies)
                                 },
                                 proxies={'http': proxy})  #scheme-keyed for requests
        #on failure, drop the proxy and requeue the token for a later retry
        if htmlCode is None:
            r.srem('proxy', proxy)
            r.sadd('oldID', parseToken)
            continue

        #store to MySQL DB: zhihuSpider
        parseClass = Json(htmlCode)
        parseClass.parseMessage()
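parseMessage treats the Redis sets as a shared work queue: spop atomically claims a random pending token, and failed tokens are pushed back with sadd so a later pass can retry them. A minimal self-contained sketch of that pattern, assuming redis-py and a local Redis server (process is a hypothetical worker function):

import redis

r = redis.Redis(host='localhost', port=6379, decode_responses=True)
r.sadd('oldID', 'token-a', 'token-b', 'token-c')  # seed the pending set

def process(token):
    # hypothetical worker: would parse and store the profile
    print('processing', token)

while r.scard('oldID') > 0:
    token = r.spop('oldID')     # atomically claim a random pending token
    try:
        process(token)
    except Exception:
        r.sadd('oldID', token)  # requeue on failure, as parseMessage does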
Example #3
def main(kind):
    #master list used to accumulate results
    totalList = []
    #set the Referer header to tell the site where the request came from
    headers['Referer'] = 'https://%s.douban.com/chart' % kind
    #starting offset; 0 means start from page 1
    Num = 0
    #build the proxy IP pool
    proxys = getProxy(1)
    #250 entries, 25 per page, so loop 10 times
    for i in range(10):
        #build the page URL, e.g. kind='movie'
        homeUrl = 'http://' + kind + '.douban.com/top250?start=' + str(Num)
        #dispatch on kind: parse Movie
        if kind == 'movie':
            #parse the Top250 index page, returning the detail-page URLs
            pageUrlList = parseMovieHome(homeUrl, proxys)
            #progress message before crawling the detail pages
            print('\nFetched douban {} TOP250 page {} (pages left: {})'.format(kind, i + 1, 9 - i))
            #parse each detail page, returning the needed fields (title/year/rating/rating count/tags/synopsis)
            totalList = totalList + parseMovie(pageUrlList, proxys)
        #dispatch on kind: parse Music
        elif kind == 'music':
            #parse the Top250 index page, returning the detail-page URLs
            pageUrlList = parseMusicHome(homeUrl, proxys)
            #progress message before crawling the detail pages
            print('\nFetched douban {} TOP250 page {} (pages left: {})'.format(kind, i + 1, 9 - i))
            #parse each detail page, returning the needed fields (album/performer/rating/rating count/genre/album type/medium/release date/publisher/barcode/ISRC/intro/track list)
            totalList = totalList + parseMusic(pageUrlList, proxys)
        #dispatch on kind: parse Book
        elif kind == 'book':
            #parse the Top250 index page, returning the detail-page URLs
            pageUrlList = parseBookHome(homeUrl, proxys)
            #progress message before crawling the detail pages
            print('\nFetched douban {} TOP250 page {} (pages left: {})'.format(kind, i + 1, 9 - i))
            #parse each detail page, returning the needed fields (title/author/rating/rating count/publication year/page count/ISBN/synopsis)
            totalList = totalList + parseBook(pageUrlList, proxys)
        #advance the start offset for the next homeUrl
        Num += 25
    #save the results to Excel
    saveMessage(totalList, kind)
    #tell the caller the crawl loop finished
    return True
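Since the three branches differ only in which parser pair they call, a table-driven dispatch is a compact alternative. A sketch assuming the parse*Home/parse* helpers referenced in main() above:

# illustrative rewrite of the if/elif chain as a lookup table; the helper
# functions are the ones main() already uses
PARSERS = {
    'movie': (parseMovieHome, parseMovie),
    'music': (parseMusicHome, parseMusic),
    'book':  (parseBookHome, parseBook),
}
parseHome, parseDetail = PARSERS[kind]
pageUrlList = parseHome(homeUrl, proxys)
totalList += parseDetail(pageUrlList, proxys)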
Example #4
                print('%s------%s' % (id, message))
                print('tokens crawled: ' + str(len(totalToken)))
                totalToken.append(token)
                time.sleep(2)
                ot(token, proxylist)
            else:
                pass
        judge = pageParse.parsePage()
        offset += 20


if __name__ == '__main__':
    id = 'jixin'
    totalToken = []
    '''
    Get url_token values from a user's followees page / use is_end to tell whether there is a next page / use next to fetch the next followees page
    Initial followees page (only needs user):
    https://www.zhihu.com/api/v4/members/{user}/followees?include=data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics&limit=20&offset=0
    '''
    proxylist = getProxy(1)
    startHttp = Http()
    startCode, proxylist = startHttp.get(
        url.format(user=id),
        headers=headers,
        proxies={'proxy': 'http://111.170.82.89:61234'},
        proxylist=proxylist)
    startParse = jsonParse(startCode)
    message, token = startParse.parseMessage()
    totalToken.append(token)
    ot(token, proxylist)
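A minimal sketch of the is_end/next pagination the docstring describes, assuming both fields sit under a paging key in the JSON response (fetch_json is a hypothetical stand-in for the project's Http/jsonParse wrappers):

import requests

def fetch_json(url):
    # hypothetical helper: plain GET, no proxy/retry handling
    return requests.get(url, timeout=5).json()

next_url = ('https://www.zhihu.com/api/v4/members/jixin/followees'
            '?include=data[*].answer_count&limit=20&offset=0')
tokens = []
while True:
    page = fetch_json(next_url)
    tokens += [member['url_token'] for member in page['data']]
    if page['paging']['is_end']:       # no further pages
        break
    next_url = page['paging']['next']  # server-provided next-page URL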
Example #5
def allFollowID(processID):
    '''
    Crawl the tokens of all followers and their descendant followers
    :param processID: process number, just for testing
    :return: None
    '''
    print(1)  #debug marker
    while True:
        print('googingggg')  #debug heartbeat
        #check the remaining proxy count; replenish and take a long sleep
        if r.scard('proxy') < 10:
            if processID == 0:
                getProxy.getProxy()
            time.sleep(100)
        proxy = r.srandmember('proxy')

        #check whether the crawl has finished
        if r.scard('newID') == 0:
            time.sleep(20)
            if r.scard('newID') == 0:
                break
            else:
                continue

        #pop a random token from newID and check whether it was already parsed
        A = r.spop('newID')
        if r.sismember('oldID', A):
            continue
        n = 0  #page offset
        print(A)
        htmlCode = httpClass.get(Setting.followUrl.format(id=A, page=n),
                                 headers={
                                     'User-Agent': random.choice(Setting.UA),
                                     'Cookie': random.choice(Setting.Cookies)
                                 },
                                 proxies={'http': proxy})  #scheme-keyed for requests
        print(htmlCode)
        while True:

            #on a failed request, skip this token
            if htmlCode is None:
                r.sadd('oldID', A)
                r.srem('proxy', proxy)
                break

            parseClass = Json(htmlCode)
            if parseClass.isZeroFollow():
                r.sadd('oldID', A)
                break

            if parseClass.isEnd():
                #last page: do the final parse
                followList = parseClass.parseFollow()
                for i in followList:
                    r.sadd('newID', i)
                r.sadd('oldID', A)

                momentTime = time.time()
                time.sleep(3)
                print('crawled %s, pending %s' % (r.scard('oldID'), r.scard('newID')))
                print('(%s) has run for %s seconds' % (processID, (momentTime - startTime)))
                break  #move on to the next token
            else:
                #another page remains, keep parsing
                followList = parseClass.parseFollow()
                for i in followList:
                    r.sadd('newID', i)
                n += 20
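The newID/oldID pair is a frontier/visited split for a breadth-first crawl; Redis sets are used so several processes can share the state. The same dedup idea with in-memory Python sets, using a hypothetical expand() in place of the follow-list parsing:

frontier = {'seed-token'}   # plays the role of the newID set
visited = set()             # plays the role of the oldID set

def expand(token):
    # hypothetical: would return the followee tokens parsed from the API
    return []

while frontier:
    token = frontier.pop()
    if token in visited:    # same check as r.sismember('oldID', A)
        continue
    visited.add(token)
    for child in expand(token):
        if child not in visited:
            frontier.add(child)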
Example #6
 def proxyCount(self, processID):
     #check the remaining proxies and replenish if low
     if processID==0:
         if r.scard('proxy') < 30:
             getProxy.getProxy()