def indexgeter(qi):
    TargetTag = False
    indexlist = []
    # Read the position reached last time from the log
    lastpage, lastgid, lasttoken = getlastindex()
    lastpage = int(lastpage)
    print('Fetching a proxy for the API crawler')
    APIProxy = getIP()
    # Use the API crawler's proxy to locate the page holding the last processed entry
    if lasttoken is not None:
        lastpage = findPage(lastpage, lastgid, lasttoken, APIProxy)
    print("Starting the index crawler")
    # Resume from the page and entry reached last time
    geter = getindex(lastpage=lastpage, token=lasttoken)
    while True:
        # Fetch the next page once the index queue holds fewer than 5 items
        if qi.qsize() < 5:
            # Only fetch when the local index list is empty
            if len(indexlist) == 0:
                try:
                    # Fetch the index
                    indexlist = geter.getlist()
                except GetIndexError as e:
                    print(e)
                    break
            try:
                # Call the API crawler
                APIdata = getAPIdata(indexlist, APIProxy)
                # Build the list of value objects
                dataOVlist = datadump(APIdata)
                # Put the value objects into the queue
                for dataOV in dataOVlist:
                    if dataOV.getindex()[1] == TARGET[1]:
                        TargetTag = True
                        break
                    qi.put(dataOV)
                # Reset the local lists
                indexlist = []
                dataOVlist = []
            except BanIPError:
                print("The API crawler's proxy has been banned; switching proxy")
                APIProxy = getIP()
            except APIError as e:
                print(e)
                break
        if TargetTag:
            print("Target position reached; index generator stopping")
            break
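# getlastindex() is referenced above but not shown in this section. A minimal
# sketch, assuming it parses the "lastpage,gid,token" line that __read_html
# below writes to lastpage&index.txt (the file name and format come from the
# index crawler; the fallback return values are an assumption):
import os
import config

def getlastindex():
    # The resume file sits next to config.py, as written by the index crawler
    path = os.path.join(os.path.dirname(os.path.abspath(config.__file__)),
                        'lastpage&index.txt')
    try:
        with open(path) as f:
            lastpage, gid, token = f.read().strip().split(',')
        return lastpage, gid, token
    except FileNotFoundError:
        # No resume file yet: start from the first page
        return '0', None, None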
def findPage(lastpage, lastgid, lasttoken, proxy):
    # lastgid is accepted for call-site compatibility; only the token is matched
    excookies = requests.utils.cookiejar_from_dict(COOKIE_DICT, cookiejar=None, overwrite=True)
    ehheaders = {
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.7, ja; q=0.3',
        'Connection': 'Keep-Alive',
        'Host': 'exhentai.org',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
    }
    print("Locating the position reached last time")
    while True:
        try:
            html = requests.get('https://exhentai.org/?page=' + str(lastpage),
                                headers=ehheaders, cookies=excookies,
                                proxies=proxy).text
            if 'Archive Download' not in html and 'IP' in html:
                # The proxy has been banned; fetch a new one
                proxy = getIP()
            elif lasttoken not in html:
                # The last entry is not on this page; try the next one
                lastpage += 1
                continue
            else:
                return lastpage
        except BaseException:
            # Network hiccup: retry
            pass
def go(no=1):
    IPandport = [
        '104.196.177.247',
        80,
    ]
    indexs = getindex(no).geter()
    proxycount = 0
    count = 0
    try:
        for index in indexs:
            while True:
                try:
                    id = index[2]
                    data = getfav_rat(index[0], index[1], IPandport)
                    writedata(data[0], data[1], data[2], id)
                    sleep(0.5)
                    count += 1
                    no += 1
                    proxycount = 0
                    break
                except ProxyError:
                    proxycount += 1
                    sleep(5)
                except ExPandaError:
                    raise
                except BanIPError:
                    IPandport = getIP()
                    proxycount = 0
                if proxycount >= 5:
                    IPandport = getIP()
                    proxycount = 0
            # Pause for ten seconds after every 100 records
            if count == 100:
                sleep(10)
                count = 0
    except BaseException as e:
        print('An unknown error occurred')
        # On error, log the traceback and print the current no, id, and proxy
        logging.exception(e)
        print(no)
        print(id)
        print(IPandport)
        remind('Program terminated abnormally')
def changeproxies():
    try:
        IPandport = getIP()
    except IPProxyPoolRunError:
        IPandport = MyIPandport
    except IPPoolEmpError:
        IPandport = MyIPandport
    if IPandport[0] in banedIPlist and IPandport[0] == '144.168.63.75':
        print('No usable IP left')
        raise NOIPError
    return IPandport
def __init__(self, lastpage=0, token=None):
    self.__lastpage = lastpage  # Current page number
    self.__token = token  # Last entry of the previous page
    self.__excookies = requests.utils.cookiejar_from_dict(
        config.COOKIE_DICT, cookiejar=None, overwrite=True)  # Load cookies
    self.__IPandport = getIP()
    self.__proxies = {
        "https": "http://%s:%s" % (self.__IPandport[0], str(self.__IPandport[1]))
    }
    print("Index crawler started")
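# For reference, the cookie jar and proxies mapping built above are meant to
# be passed straight to requests. A minimal standalone sketch of such a call
# (the cookie value and proxy address are placeholders, not real credentials):
import requests

cookies = requests.utils.cookiejar_from_dict({'ipb_member_id': '0'})
proxies = {"https": "http://104.196.177.247:80"}
# requests routes all https:// traffic through the proxy under the "https"
# key, which is why only that scheme is mapped in __init__ above
html = requests.get('https://exhentai.org/?page=0',
                    cookies=cookies, proxies=proxies).text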
def __open_next(self):
    ErrorCount = 0
    while True:
        try:
            return self.__open_Ex(self.__excookies,
                                  exurl='https://exhentai.org/?page=' + str(self.__lastpage))
        except ExOpenError:
            print("The proxy or the ExHentai server is down; retrying in 5 seconds")
            ErrorCount += 1
            sleep(5)
        except BanIPError:
            print()
            print("The index crawler's IP has been banned; switching IP")
            self.__IPandport = getIP()
            self.__proxies = {
                "https": "http://%s:%s" % (self.__IPandport[0], str(self.__IPandport[1]))
            }
        except BaseException as e:
            print("An unknown error occurred; retrying in 5 seconds")
            print("Error output:", e.__str__())
            ErrorCount += 1
            sleep(5)
        if ErrorCount >= 10:
            print("Index fetch failed more than 10 times; switching IP")
            self.__IPandport = getIP()
            self.__proxies = {
                "https": "http://%s:%s" % (self.__IPandport[0], str(self.__IPandport[1]))
            }
def reProcess(qi, qd, qe, n):
    if n < 3:
        try:
            IPandport = getIP()
            print("Got a new usable IP; starting a process")
            Process(target=dataget, args=(qi, qd, qe, IPandport)).start()
            # n+1 only when a new process is started successfully
            n += 1
        except IPPoolEmpError:
            pass
        except IPProxyPoolRunError:
            print("IPProxyPoolRunError")
    return n
def changeip(IPandport):
    i = 0
    while i < 5:
        try:
            # If the old proxy still works, keep using it
            testIP(IPandport)
            return IPandport
        except ProxyInvaError:
            i += 1
    try:
        # The old proxy no longer works; switch IP
        return getIP()
    except IPPoolEmpError:
        print("The IP pool is empty; falling back to the backup IP to finish the assigned tasks")
        raise
def reProcess(qip, n):
    if n < 20:
        while True:
            print("Trying to restart a crawler thread")
            try:
                IPandport = getIP()
                print("Got a new usable IP")
                n += 1
                qip.put(IPandport)
                if n >= 10:
                    break
            except IPPoolEmpError:
                print("The IP pool is empty")
                break
            except IPProxyPoolRunError:
                print("IPProxyPoolRunError")
                break
    return n
def reProcess(qip, n):
    # If the number of threads actually running drops below the configured value,
    # the error-handling process will keep trying to fetch new IPs, so don't set
    # the thread count too high
    if n < THREAD_MAX:
        while True:
            print("Trying to restart a crawler thread")
            try:
                IPandport = getIP()
                print("Got a new usable IP")
                n += 1
                qip.put(IPandport)
                if n >= THREAD_MAX:
                    break
            except IPPoolEmpError:
                print("The IP pool is empty")
                break
            except IPProxyPoolRunError:
                print("IPProxyPoolRunError")
                break
    return n
def __dataget(qi, qd, qe, px, n):
    # NOTE: this lock is created locally in each thread, so it provides no real
    # mutual exclusion; Queue.get()/put() are already thread-safe on their own
    lock = threading.Lock()
    count = 0
    try:
        ProxyErrorCount = 0
        while True:
            if qi.empty():
                sleep(1)
            else:
                while True:
                    lock.acquire()
                    try:
                        index = qi.get()
                    finally:
                        lock.release()
                    id = index[2]
                    try:
                        data = getfav_rat(index[0], index[1], px)
                        sleep(0.5)
                        data.append(id)
                        lock.acquire()
                        try:
                            qd.put(data)
                        finally:
                            lock.release()
                        count += 1
                        ProxyErrorCount = 0
                    except ProxyError:
                        ProxyErrorCount += 1
                        sleep(10)
                        if ProxyErrorCount >= 10:
                            # Switch IP after ten consecutive proxy errors
                            print("Switching IP")
                            px = getIP()
                            ProxyErrorCount = 0
                    # Pause ten seconds after every 100 records
                    if count >= 100:
                        sleep(10)
                        count = 0
    # On any unhandled error (no proxy left, the Panda page, etc.), push the
    # current proxy and current item to the error queue
    except BaseException as e:
        print(e.__str__())
        n -= 1
        errordata = [px, index, id]
        qe.put(errordata)
def dataget(qi, qd, qe, px):
    count = 0
    try:
        ProxyErrorCount = 0
        while True:
            if qi.empty():
                sleep(1)
            else:
                while True:
                    index = qi.get()
                    id = index[2]
                    try:
                        data = getfav_rat(index[0], index[1], px)
                        sleep(0.5)
                        data.append(id)
                        qd.put(data)
                        print(data)
                        count += 1
                        ProxyErrorCount = 0
                    except IsExHon:
                        print("This gallery is EX-only; skipping")
                        break
                    except ProxyError:
                        ProxyErrorCount += 1
                        sleep(10)
                        if ProxyErrorCount >= 10:
                            # Switch IP after ten consecutive proxy errors
                            print("Switching IP")
                            px = getIP()
                            ProxyErrorCount = 0
                    # Pause ten seconds after every 100 records
                    if count >= 100:
                        sleep(10)
                        count = 0
    # On any unhandled error (no proxy left, the Panda page, etc.), push the
    # current proxy, current item, and id to the error queue
    except BaseException as e:
        print(e.__str__())
        errordata = [px, index, id]
        qe.put(errordata)
def __dataget(qi, qd, qe, px, n):
    count = 0
    try:
        ProxyErrorCount = 0
        while True:
            if qi.empty():
                sleep(1)
            else:
                while True:
                    data = qi.get()
                    try:
                        index = data.getindex()
                        webdata = getfav_rat(index[0], index[1], px)
                        data.update(favorited=webdata['favorited'],
                                    ratings=webdata['ratings'],
                                    elanguage=webdata['elanguage'],
                                    title_jpn=webdata['title_jpn'])
                        sleep(0.5)
                        qd.put(data)
                        count += 1
                        ProxyErrorCount = 0
                    # When crawling EX galleries, 404 errors are always raised by
                    # the proxy server
                    except ProxyError:
                        ProxyErrorCount += 1
                        sleep(10)
                        if ProxyErrorCount >= 10:
                            # Switch IP after ten consecutive proxy errors
                            print("Switching crawler IP")
                            px = getIP()
                            ProxyErrorCount = 0
                    # Pause ten seconds after every 100 records
                    if count >= 100:
                        sleep(10)
                        count = 0
    # On any unhandled error (no proxy left, the Panda page, etc.), push the
    # current proxy and current item to the error queue
    except BaseException as e:
        print("Unknown error in crawler process:", e.__str__())
        n -= 1
        errordata = [px, data]
        qe.put(errordata)
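# The value objects flowing through the queues above are produced by datadump()
# and consumed via getindex()/update(). Their class is not shown in this
# section; a minimal sketch consistent with that usage (the class name and
# attribute names are assumptions):
class GalleryOV:
    def __init__(self, gid, token):
        self.__gid = gid        # gallery id
        self.__token = token    # gallery token
        self.favorited = None
        self.ratings = None
        self.elanguage = None
        self.title_jpn = None

    def getindex(self):
        # (gid, token), matching index[0]/index[1] as passed to getfav_rat()
        return self.__gid, self.__token

    def update(self, favorited, ratings, elanguage, title_jpn):
        # Attach the fields scraped from the gallery page
        self.favorited = favorited
        self.ratings = ratings
        self.elanguage = elanguage
        self.title_jpn = title_jpn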
from multiprocessing import Process, Queue
from Proxy.IPPool import getIP
import threading
# index_get, read, dataget, and error_handing come from sibling modules of
# this project (not shown in this section)
'''
Postscript: the index, data-processing, and error-handling jobs run as three
threads inside the main process, while the crawlers run as child processes.
In practice the crawler processes are not managed at all, so outside of
PyCharm this may leave zombie processes behind.
Also, each crawler process is still single-process, single-threaded, which is
very inefficient for IO-bound work like crawling.
Honestly, I only wrote the multithreaded version because my computer's fan
was about to explode.
'''
if __name__ == '__main__':
    # Index queue
    qindex = Queue()
    # Data queue
    qdata = Queue()
    # Error queue
    qerror = Queue()
    threading.Thread(target=index_get, args=(qindex, 197078)).start()
    threading.Thread(target=read, args=(qdata, )).start()
    n = 0
    # Start at most four crawler processes
    while n < 4:
        try:
            Process(target=dataget, args=(qindex, qdata, qerror, getIP())).start()
            n += 1
        except BaseException:
            break
    threading.Thread(target=error_handing, args=(qindex, qdata, qerror, n)).start()
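# The postscript above notes that the crawler processes are never managed and
# may end up as zombies. A minimal sketch of one way to address this, keeping
# handles to the processes and joining them on shutdown (this helper and its
# parameters are not part of the original code):
from multiprocessing import Process, Queue

def run_crawlers(qindex, qdata, qerror, worker, get_proxy, limit=4):
    procs = []
    for _ in range(limit):
        try:
            p = Process(target=worker, args=(qindex, qdata, qerror, get_proxy()))
            p.daemon = True  # die with the main process instead of lingering
            p.start()
            procs.append(p)
        except BaseException:
            break
    return procs

# ... later, on shutdown:
# for p in procs:
#     p.join()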
def __read_html(self):
    hlist = []
    ErrorCount = 0
    while True:
        try:
            bsobj = BeautifulSoup(self.__open_next(), 'html.parser')
            table = bsobj.find('table', {'class': 'itg'})
            for link in table.findAll(
                    'a',
                    href=re.compile(r'https://exhentai\.org/g/[0-9]{1,8}/[A-Za-z0-9]{10}/')):
                if 'href' in link.attrs:
                    hlist.append(self.__parse_html(link.attrs['href']))
            # Check whether the last entry of the previous page has been pushed
            # onto this page; if so, trim the index list
            if self.__token in hlist:
                lastindex = hlist.index(self.__token)
                hlist = hlist[lastindex + 1:]
            # Update the last-entry marker
            self.__token = hlist[-1]
            # Persist the last-entry marker next to config.py
            lastindexpath = os.path.join(
                os.path.dirname(os.path.abspath(config.__file__)),
                "lastpage&index.txt")
            with open(lastindexpath, 'w') as f:
                f.write(str(self.__lastpage) + "," + str(self.__token[0]) + ',' + self.__token[1])
            self.__lastpage += 1
            return hlist
        except BaseException as e:
            print("Unknown exception in the list crawler; retrying the list fetch")
            print("Exception info:", e.__str__())
            logpath = os.path.join(
                os.path.dirname(os.path.abspath(config.__file__)), "Log.txt")
            with open(logpath, 'a') as f:
                f.write("List crawler exception: " + e.__str__())
            ErrorCount += 1
            if ErrorCount > 5:
                self.__IPandport = getIP()
                self.__proxies = {
                    "https": "http://%s:%s" % (self.__IPandport[0], str(self.__IPandport[1]))
                }
def go(id=1, IPandport=MyIPandport):
    i = 0
    # Index counter
    indexcount = 0
    # JSON payload variable
    indexlist = []
    # Proxy error counter
    proxycount = 0
    IPandport = getIP()
    jsonStr = {"method": "gdata", "gidlist": indexlist, "namespace": 1}
    geter = getindex(id).geter()
    try:
        for index in geter:
            indexlist.append([index[0], index[1]])
            indexcount += 1
            while indexcount == 25:
                try:
                    datajson = getdata(jsonStr, IPandport)
                    id = writedata(datajson, id)
                    print('Written %s records' % (id - 1))
                    i += 1
                    indexcount = 0  # On success, reset the index counter
                    indexlist = []  # On success, reset the index list
                    jsonStr = {
                        "method": "gdata",
                        "gidlist": indexlist,
                        "namespace": 1
                    }
                    proxycount = 0  # On success, reset the proxy error counter
                except requests.exceptions.ProxyError as e:
                    proxycount += 1
                    sleep(10)
                    print(e.__str__())
                except BanIPError:
                    print('Proxy %s has been banned; switching proxy' % (IPandport[0]))
                    banedIPlist.append(IPandport[0])
                    IPandport = changeproxies()
                    proxycount = 0  # Reset the proxy error counter
                except ConnectionResetError:
                    print('Proxy %s has been banned; switching proxy' % (IPandport[0]))
                    banedIPlist.append(IPandport[0])
                    IPandport = changeproxies()
                    proxycount = 0  # Reset the proxy error counter
                if proxycount == 3:
                    print('The current proxy has failed three times in a row; switching proxy')
                    IPandport = changeproxies()
                    proxycount = 0  # Reset the proxy error counter
                # Pause briefly after every 5 batches
                if i == 5:
                    sleep(6)
                    i = 0
        # Flush any remaining partial batch (fewer than 25 entries)
        if len(indexlist) != 0 and len(indexlist) != 25:
            try:
                datajson = getdata(jsonStr, IPandport)
                id = writedata(datajson, id)
                indexcount = 0  # On success, reset the index counter
                indexlist = []  # On success, reset the index list
                jsonStr = {
                    "method": "gdata",
                    "gidlist": indexlist,
                    "namespace": 1
                }
                proxycount = 0  # On success, reset the proxy error counter
            except requests.exceptions.ProxyError as e:
                proxycount += 1
                print(e.__str__())
            except BanIPError:
                banedIPlist.append(IPandport[0])
                IPandport = changeproxies()
                proxycount = 0  # Reset the proxy error counter
            if proxycount == 3:
                print('The current proxy has failed three times in a row; switching proxy')
                IPandport = changeproxies()
                proxycount = 0  # Reset the proxy error counter
    except BaseException as e:
        print('An unknown error occurred')
        logging.exception(e)
        print(id)
        print(jsonStr)
        remind('Program terminated abnormally')
if __name__ == "__main__": # 目录队列 qindex = Queue() # 数据队列 qdata = Queue() # 错误队列 qerror = Queue() # 代理队列 qip = Queue() # 启动目录发生器进程 Process(target=indexgeter, args=(qindex, )).start() # 开始启动爬虫进程 n = 0 while n < PROCESS_MAX: Process(target=webdatageter, args=(qindex, qdata, qerror, qip)).start() n += 1 # 数据写入器线程 threading.Thread(target=data_writer, args=(qdata, )).start() # 错误处理线程 threading.Thread(target=error_handing, args=(qdata, qerror, qip, n)).start() # 获取THREAD_COUNT个代理用于开启爬虫线程 n = 0 while n < THREAD_MAX * THREAD_MAX: try: qip.put(getIP()) n += 1 except BaseException: break