import logging
import random

import requests


def get(self, url, headers=None, proxies=None, timeout=5, timeoutRetry=5):
    '''
    :param url: target URL
    :param headers: request headers
    :param proxies: proxy IP
    :param timeout: timeout in seconds
    :param timeoutRetry: number of retries after a failure
    :return: response body as text, or None on failure
    '''
    if not headers:
        headers = {'User-Agent': random.choice(UA),
                   'Cookie': '__DAYU_PP=6NEQzBVeevByUuMjrEzEffffffff8a15a08bae39; q_c1=9c46a0fb9939422881967decd788e462|1521676280000|1521676280000; _zap=fd45d673-aa0f-4999-ae2d-57dce5ceae92; l_cap_id="MTVhZTgwMmE2NjJmNDU3ZjliYWNmYzMyNjdiZTAzODE=|1524095802|97edbda534c1a22c5d8db4392a9179c1d21b400e"; r_cap_id="MDVjMDY3MDk0YjkwNGFhNWI2NDkwM2FiNzdlOThmN2U=|1524095802|0b58fcce4bf7e66bf1184ac9e18e0ee5bb697c93"; cap_id="YTljOWU3ZDQ2YWZjNDBlZGFiMzBkMWZkYTIzNDljZjQ=|1524095802|c914e7f3996b7dac4464d21b80c67f46ef35b4fb"; capsion_ticket="2|1:0|10:1524095932|14:capsion_ticket|44:ZGU5NzQwZjE2Mjk0NDJmNDhiMjIyNTRiZmU5ZWVmNmM=|be383318d21315d6c3b0b662368773b0223a71e1bf5e6426a12aa94e5a8e0221"; __utma=155987696.1438037459.1523587187.1523587187.1523955550.2; __utmz=155987696.1523587187.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAANWDWRwK9AEAjDIUOoWFTi0g+Pvf; _xsrf=c860fc1b-7a7d-46c8-ae16-3ef95d1dbe1f; d_c0="ALAvfaeYdA2PTopEc437l1BsFz92c33MUBE=|1523942945"; l_n_c=1; n_c=1; __utmc=155987696; z_c0="2|1:0|10:1524095942|4:z_c0|92:Mi4xbWJaS0JnQUFBQUFBc0M5OXA1aDBEU1lBQUFCZ0FsVk54aW5GV3dDUGY2Z3F1R29xNTZwekVaSXJ3WG90Y2RnVmNR|0227ba78d786d9e305d14d06d65e7a232bc6c03555df0ffcc52a4206a1f9b5f4"'
                   }
    if not proxies:
        proxylist = getProxy(1)
        proxy = random.choice(proxylist)
        # requests expects the scheme ('http'/'https') as the proxies key
        proxies = {'http': proxy, 'https': proxy}
    # requests.get request
    try:
        res = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        res.raise_for_status()
        htmlCode = res.text
    except Exception as e:
        logging.error('getEXCEPT:{}'.format(e))
        if timeoutRetry > 0:
            # retry with a fresh proxy and one fewer attempt left
            proxylist = getProxy(1)
            proxy = random.choice(proxylist)
            htmlCode = self.get(url=url, headers=headers,
                                proxies={'http': proxy, 'https': proxy},
                                timeoutRetry=timeoutRetry - 1)
        else:
            logging.error('getTIMEOUT:{}'.format(url))
            htmlCode = None
    return htmlCode
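# A minimal usage sketch of the wrapper above (hedged: the Http class name is
# borrowed from the __main__ block later in this file, and UA/getProxy must be
# importable for the header and proxy defaults to work):
client = Http()
html = client.get('https://www.zhihu.com/api/v4/members/jixin')
if html is None:
    logging.error('all retries exhausted')
else:
    print(html[:200])  # first 200 characters of the response body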
def parseMessage(processID):
    '''
    Parse every already-crawled token.
    :param processID: process number, just for test
    :return: None
    '''
    while True:
        # Check the remaining proxies; replenish and take a long sleep
        if r.scard('proxy') < 10:
            if processID == 0:
                getProxy.getProxy()
            time.sleep(100)
        # Check whether the crawl has finished
        if r.scard('oldID') == 0:
            time.sleep(20)
            if r.scard('oldID') == 0:
                print('(%s) finished parsing messages' % processID)
                break
            else:
                continue
        # Pop a random token and parse it
        parseToken = r.spop('oldID')
        messageUrl = 'https://www.zhihu.com/api/v4/members/{token}?include=name,url_token,educations,business,locations,employments,gender,following_count,follower_count,voteup_count,thanked_count,favorited_count,answer_count,articles_count,question_count.topics'
        proxy = r.srandmember('proxy')
        htmlCode = httpClass.get(messageUrl.format(token=parseToken),
                                 headers={
                                     'User-Agent': random.choice(Setting.UA),
                                     'Cookie': random.choice(Setting.Cookies)
                                 },
                                 # requests expects scheme keys for proxies
                                 proxies={'http': proxy, 'https': proxy})
        # On failure, drop the proxy, requeue the token and try again later
        if htmlCode is None:
            r.srem('proxy', proxy)
            r.sadd('oldID', parseToken)
            continue
        # Store to MySQL, DB: zhihuSpider
        parseClass = Json(htmlCode)
        parseClass.parseMessage()
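# Hedged sketch: the processID parameter suggests parseMessage is meant to run
# in several processes at once (process 0 doubles as the proxy replenisher).
# One way to launch it; the worker count of 4 is an arbitrary assumption:
from multiprocessing import Process

if __name__ == '__main__':
    workers = [Process(target=parseMessage, args=(pid,)) for pid in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()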
def main(kind):
    # Master list for everything scraped
    totalList = []
    # Set the Referer so the site sees a plausible origin page
    headers['Referer'] = 'https://%s.douban.com/chart' % kind
    # Start offset; 0 means page 1
    Num = 0
    # Build the proxy IP pool
    proxys = getProxy(1)
    # 250 entries at 25 per page, so loop 10 times
    for i in range(10):
        # Build the list-page URL, e.g. for movie
        homeUrl = 'http://' + kind + '.douban.com/top250?start=' + str(Num)
        # Dispatch on kind: Movie
        if kind == 'movie':
            # Parse the Top250 list page and return the detail-page URLs
            pageUrlList = parseMovieHome(homeUrl, proxys)
            print('\nFetched douban {} TOP250 page {} ({} pages left)'.format(kind, i + 1, 9 - i))
            # Parse each detail page (title/year/rating/vote count/tags/synopsis)
            totalList = totalList + parseMovie(pageUrlList, proxys)
        # Dispatch on kind: Music
        elif kind == 'music':
            pageUrlList = parseMusicHome(homeUrl, proxys)
            print('\nFetched douban {} TOP250 page {} ({} pages left)'.format(kind, i + 1, 9 - i))
            # Parse each detail page (album/performer/rating/vote count/genre/album type/medium/release date/publisher/barcode/ISRC/intro/track list)
            totalList = totalList + parseMusic(pageUrlList, proxys)
        # Dispatch on kind: Book
        elif kind == 'book':
            pageUrlList = parseBookHome(homeUrl, proxys)
            print('\nFetched douban {} TOP250 page {} ({} pages left)'.format(kind, i + 1, 9 - i))
            # Parse each detail page (title/author/rating/vote count/publication year/pages/ISBN/summary)
            totalList = totalList + parseBook(pageUrlList, proxys)
        # Advance to the next list page
        Num += 25
    # Save everything to Excel
    saveMessage(totalList, kind)
    # Done
    return True
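# Usage sketch for main(): per the branches above, kind must be one of
# 'movie', 'music' or 'book'; any other value loops through the pages
# without parsing anything.
if __name__ == '__main__':
    for kind in ('movie', 'music', 'book'):
        main(kind)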
            print('%s------%s' % (id, message))
            print('Tokens crawled: ' + str(len(totalToken)))
            totalToken.append(token)
            time.sleep(2)
            ot(token, proxylist)
        else:
            pass
        judge = pageParse.parsePage()
        offset += 20


if __name__ == '__main__':
    id = 'jixin'
    totalToken = []
    '''
    Collect url_token values from a user's followees page; is_end tells
    whether there is another page, and next gives the next followees page.
    Initial followees page (only needs user):
    https://www.zhihu.com/api/v4/members/{user}/followees?include=data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics&limit=20&offset=0
    '''
    proxylist = getProxy(1)
    startHttp = Http()
    startCode, proxylist = startHttp.get(
        url.format(user=id),
        headers=headers,
        # requests expects scheme keys for proxies
        proxies={'http': 'http://111.170.82.89:61234'},
        proxylist=proxylist)
    startParse = jsonParse(startCode)
    message, token = startParse.parseMessage()
    totalToken.append(token)
    ot(token, proxylist)
def allFollowID(processID):
    '''
    Crawl the tokens of all followees and their descendants' followees.
    :param processID: process number, just for test
    :return: None
    '''
    while True:
        # Check the remaining proxies; replenish and take a long sleep
        if r.scard('proxy') < 10:
            if processID == 0:
                getProxy.getProxy()
            time.sleep(100)
        proxy = r.srandmember('proxy')
        # Check whether the crawl has finished
        if r.scard('newID') == 0:
            time.sleep(20)
            if r.scard('newID') == 0:
                break
            else:
                continue
        # Pop a random token from newID; skip it if already parsed
        A = r.spop('newID')
        if r.sismember('oldID', A):
            continue
        n = 0  # page offset
        htmlCode = httpClass.get(Setting.followUrl.format(id=A, page=n),
                                 headers={
                                     'User-Agent': random.choice(Setting.UA),
                                     'Cookie': random.choice(Setting.Cookies)
                                 },
                                 proxies={'http': proxy, 'https': proxy})
        while True:
            # On failure, drop this token and the bad proxy
            if htmlCode is None:
                r.sadd('oldID', A)
                r.srem('proxy', proxy)
                break
            parseClass = Json(htmlCode)
            if parseClass.isZeroFollow():
                r.sadd('oldID', A)
                break
            if parseClass.isEnd():
                # Last page: final parse, then move on to the next token
                followList = parseClass.parseFollow()
                for i in followList:
                    r.sadd('newID', i)
                r.sadd('oldID', A)
                momentTime = time.time()
                time.sleep(3)
                print('%s crawled, %s pending' % (r.scard('oldID'), r.scard('newID')))
                print('(%s) has been running for %s seconds' % (processID, momentTime - startTime))
                break
            else:
                # More pages left: queue what we have, then fetch the next
                # followees page (without this the loop never advances)
                followList = parseClass.parseFollow()
                for i in followList:
                    r.sadd('newID', i)
                n += 20
                htmlCode = httpClass.get(Setting.followUrl.format(id=A, page=n),
                                         headers={
                                             'User-Agent': random.choice(Setting.UA),
                                             'Cookie': random.choice(Setting.Cookies)
                                         },
                                         proxies={'http': proxy, 'https': proxy})
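# Hedged sketch of the Redis bookkeeping the crawler above relies on (inferred
# from the calls, not documented in the source): 'newID' holds tokens still to
# visit, 'oldID' tokens already handled, 'proxy' the live proxy pool.
# Seeding the queue and starting a single worker might look like:
import time
import redis

r = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)
r.sadd('newID', 'jixin')      # seed token; 'jixin' is the seed user used elsewhere here
startTime = time.time()       # allFollowID's progress log reads this global
allFollowID(0)                # process 0 also replenishes the proxy pool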
def proxyCount(self, processID):
    # Check the remaining proxies; only process 0 replenishes the pool
    if processID == 0 and r.scard('proxy') < 30:
        getProxy.getProxy()