def getListHtml(self, url, repeat_count=0):
    """Fetch a huaban-style list page through a proxy and extract image URLs.

    Retries up to 3 times on failure, then records the url in the
    failure table with kind "list".
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            # Proxy worked — return it to the pool.
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            imgurls = re.findall(
                r'"file":{"bucket":"hbimg", "key":"([\S\s\\s\r\n]{0,}?)",',
                response.text, re.I)
            for imgurl in imgurls:
                src = "http://img.hb.aicdn.com/" + imgurl
                # NOTE(review): this records every extracted src as a failed
                # "image" instead of downloading it — looks like a bug, but
                # the intended download call is not visible here; confirm.
                self.sqlInsertFailedUrl(url, "image")
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        # Bug fix: removed validip_que.get(validip) — Queue.get() takes a
        # `block` flag, not an item, so the call blocked and discarded a
        # second, still-valid proxy (the failed one was already dequeued).
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def downloadImage(self, src, file_path, repeat_count=0):
    """Download one image through a proxy and save it to file_path.

    Retries up to 3 times on failure, then records src in the failure
    table with kind "image".
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        start_time = time.time()
        response = requests.get(src, proxies=proxy)
        if response.status_code == 200:
            # Consistency fix: siblings return a working proxy to the pool
            # on success; this method leaked it.
            self.validip_que.put(validip)
            img_content = response.content
            with open(file_path, "wb") as f:
                f.write(img_content)
            end_time = time.time()
            inter = end_time - start_time
            print("%s成功下载图片%s,共花费%f秒" % (self.threadName, file_path, inter))
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s图片下载异常,错误信息为%s" % (self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip) — it blocked and threw
        # away an extra good proxy (the bad one was already dequeued).
        repeat_count += 1
        if repeat_count < 4:
            print("%s图片下载抛出异常,正在进行第%d次重新下载!" % (src, repeat_count))
            self.downloadImage(src, file_path, repeat_count)
        else:
            print("%s图片下载失败,将添加下载失败信息到数据表" % (src))
            self.sqlInsertFailedUrl(src, "image")
def downloadImage(self, src, file_path, repeat_count=0):
    """Download one image (no proxy) and save it to file_path.

    Retries up to 3 times on failure, then records src in the failure
    table with kind "image".
    """
    try:
        start_time = time.time()
        response = requests.get(src)
        if response.status_code == 200:
            img_content = response.content
            with open(file_path, "wb") as f:
                f.write(img_content)
            end_time = time.time()
            inter = end_time - start_time
            print("{}成功下载图片{},共花费{}秒".format(self.threadName, file_path, inter))
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        # Bug fix: the message lacked a "{}" placeholder, so str.format
        # silently dropped the line-number argument.
        print("{}图片下载异常,错误信息为{},行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("{}图片{}下载抛出异常,正在进行第{}次重新下载!".format(
                self.threadName, src, repeat_count))
            self.downloadImage(src, file_path, repeat_count)
        else:
            print("{}图片{}下载失败,将添加下载失败信息到数据表".format(self.threadName, src))
            self.sqlInsertFailedUrl(src, "image")
def getListHtml(self, url, repeat_count=0):
    """Fetch one cndesign search-result page and crawl every work link on it."""
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # Local folder name comes from the page number embedded in the URL.
        page_no = url.replace(
            "http://www.cndesign.com/Query/Works?key=%E6%8F%92%E7%94%BB&page=",
            "")
        page_dir = "C:/cdndesign/page" + page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "utf-8"
        for anchor in BeautifulSoup(resp.text, "lxml").select(".pl_img_box a"):
            self.getArcHtml("http://www.cndesign.com" + anchor.get("href"),
                            page_dir)
    except BaseException as e:
        print("列表页请求异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, arc_url, page_dir, repeat_count=0):
    """Fetch an article page via proxy and download every image in it.

    Skips images whose target file already exists; retries up to 3 times
    on failure, then records arc_url as a failed "article".
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(arc_url, proxies=proxy, timeout=15)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select('.pic-content img')
            for img in img_list:
                src = img.get("src")
                if src:
                    alt = img.get("alt")
                    # Strip characters Windows forbids in file names.
                    invalid_str_arr = ["/", ".", "\\", "\r\n", "。", "*", '"',
                                       "<", ">", "|", "?", "?", ":"]
                    for invalid_str in invalid_str_arr:
                        alt = alt.replace(invalid_str, "")
                    file_extension_name = src.split("/")[-1].split(".")[-1]
                    file_name = "{}.{}".format(alt, file_extension_name)
                    file_path = page_dir + "/" + file_name
                    if not os.path.exists(file_path):
                        self.downloadImage(src, file_path)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("{}详情页下载异常,错误信息为{},所在行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        # Bug fix: removed validip_que.get(validip), which blocked and threw
        # away a second good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("%s详情页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, arc_url, repeat_count))
            self.getArcHtml(arc_url, page_dir, repeat_count)
        else:
            print("%s详情页%s下载失败" % (self.threadName, arc_url))
            self.sqlInsertFailedUrl(arc_url, "article")
def getListHtml(self, url, repeat_count=0):
    """Fetch one huiyi8 list page via proxy and crawl its article links."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            page_no = url.replace("https://www.huiyi8.com/qqbq/", "").replace(".html", "")
            page_dir = "C:/huiyi8/page{}".format(page_no)
            self.makeDir(page_dir, "列表文件夹")
            # Proxy worked — return it to the pool.
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            a_list = soup.select(".works-img-box")
            for a in a_list:
                arc_url = a.get("href")
                self.getArcHtml(arc_url, page_dir)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip) — it consumed an extra
        # good proxy; the failed one was already out of the queue.
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one lanrentuku list page via proxy and crawl its article links."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            self.validip_que.put(validip)
            page_no = url.split("%B1%ED%C7%E9%B0%FC/")[-1].replace("/", "")
            page_dir = "C:/lanrentuku/page" + page_no
            self.makeDir(page_dir, "列表文件夹")
            response.encoding = "gb2312"  # site serves GB2312
            soup = BeautifulSoup(response.text, "lxml")
            a_list = soup.select(".list-qq dl dd a")
            for a in a_list:
                arc_url = "http://www.lanrentuku.com/%s" % (a.get("href"))
                self.getArcHtml(arc_url, page_dir)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        # Consistency fix: every sibling logs the exception before retrying;
        # this method silently swallowed it.
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip), which discarded an
        # extra good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one zcool search-result page and crawl every work card on it."""
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # Page number = URL minus the fixed search prefix and "#tab_anchor".
        page_no = url.replace(
            "https://www.zcool.com.cn/search/content?type=0&field=0&other=0&sort=5&word=%E6%8F%92%E7%94%BB&recommend=0&requestId=requestId_1550647688974&p=",
            "").replace("#tab_anchor", "")
        page_dir = "C:/zcool/page" + page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "utf-8"
        for card in BeautifulSoup(resp.text, "lxml").select(".card-img-hover"):
            self.getArcHtml(card.get("href"), page_dir)
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, arc_url, repeat_count=0):
    """Fetch a QQ-expression article via proxy and download all its images."""
    start_time = time.time()
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        resp = requests.get(arc_url, proxies=proxy, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        self.validip_que.put(validip)
        resp.encoding = "gb2312"
        soup = BeautifulSoup(resp.text, "lxml")
        images = soup.select('.content_word img')
        # Sanitize the article title for use in Windows file names.
        title = soup.select('h3')[0].text
        for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                    "?", "?", ":"]:
            title = title.replace(bad, "")
        for index, img in enumerate(images, start=1):
            src = img.get("src")
            alt = img.get("alt")
            ext = src.split("!")[0].split("/")[-1].split(".")[-1]
            file_name = "{}-{}.{}".format(title, index, ext)
            file_path = "C:/QQ/qq" + "/" + file_name
            self.downloadImage(src, file_path)
    except BaseException as e:
        print("%s详情页下载异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s详情页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, arc_url, repeat_count))
            self.getArcHtml(arc_url, repeat_count)
        else:
            print("%s详情页%s下载失败" % (self.threadName, arc_url))
            self.sqlInsertFailedUrl(arc_url, "article")
def getListHtml(self, url, repeat_count=0):
    """Fetch one jiuwa face-list page and crawl each article on it."""
    try:
        resp = requests.get(url, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        page_no = url.replace("https://www.jiuwa.net/face/p-", "")
        page_dir = "C:/jiuwa/index_" + page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "utf-8"
        for link in BeautifulSoup(resp.text, "lxml").select(".title a"):
            self.getArcHtml("https://www.jiuwa.net" + link.get("href"),
                            page_dir)
    except BaseException as e:
        print("{}列表页下载异常,错误信息为{},错误行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("{}列表页{}下载失败,正在进行第{}次重新下载!".format(
                self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("{}列表页{}下载失败,添加至数据库".format(self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one ooopic search page and crawl every result link."""
    try:
        resp = requests.get(url, timeout=8)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # The first page has a special URL; every other page embeds its
        # number between two fixed URL fragments.
        if url != "http://so.ooopic.com/sousuo/2000346/":
            page_no = url.replace(
                "http://so.ooopic.com/search-b1edc7e9b0fc-0-0_0__0__0_ooo_0_",
                "").replace("_0_0_0_0_0_0_0___.html", "")
        else:
            page_no = 1
        page_dir = "C:/oopic/page%s" % page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "gbk"
        for link in BeautifulSoup(resp.text, "lxml").select(".datapic"):
            self.getArcHtml(link.get("href"), page_dir)
    except BaseException as e:
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one qqjia list page via proxy and print each article URL.

    The article crawl itself is currently disabled (commented out).
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "gb2312"
            soup = BeautifulSoup(response.text, "lxml")
            a_list = soup.select(".txt")
            for a in a_list:
                # Links already absolute on qq.qqjia.com are kept as-is.
                arc_url = a.get("href") if "qq.qqjia.com" in a.get("href") \
                    else "http://www.qqjia.com/" + a.get("href")
                print(arc_url)
                # self.getArcHtml(arc_url)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("{}请求列表页异常,错误信息为{},行号为{}".format(self.threadName, str(e), e.__traceback__.tb_lineno))
        # Bug fix: removed validip_que.get(validip) — it blocked and
        # discarded an extra good proxy (the failed one was never re-queued).
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one doutula article-list page via proxy and crawl its items."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        resp = requests.get(url, proxies=proxy, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,状态码为%d" % (resp.status_code))
        self.validip_que.put(validip)
        page_no = url.replace("https://www.doutula.com/article/list/?page=", "")
        page_dir = "C:/doutula/page" + page_no
        self.makeDir(page_dir, "列表页文件夹")
        resp.encoding = "utf-8"
        for item in BeautifulSoup(resp.text, "lxml").select(".list-group-item"):
            arc_url = item.get("href")
            if arc_url:
                self.getArcHtml(arc_url, page_dir)
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one chinaz biaoqing list page and crawl each article on it."""
    try:
        resp = requests.get(url, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # The front page has no "index_N" part; map it to a fixed folder.
        if url != "http://sc.chinaz.com/biaoqing/index.html":
            page_dir = "C:/chinaz/index_%s" % (
                url.split("index_")[-1].replace(".html", ""))
        else:
            page_dir = "C:/chinaz/index"
        self.makeDir(page_dir, "列表目录")
        resp.encoding = "utf-8"
        for link in BeautifulSoup(resp.text, "lxml").select(".num_2 a"):
            self.getArcHtml(link.get("href"), url)
    except BaseException as e:
        print("%s列表页请求异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def sendAjaxRequest(self, repeat_count=0):
    """POST the translation AJAX request and store the result on self.cn_word.

    Returns True on success, False after 3 failed attempts. Pauses 20s and
    retries when the service answers with an ACCESS LIMIT payload.
    """
    self.param = {
        "from": 'en',
        "to": 'zh',
        "query": self.en_word,
        "transtype": "translang",
        "simple_means_flag": "3",
        "sign": self.sign,
        "token": self.token
    }
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.post(self.ajaxUrl, proxies=proxy,
                                 headers=self.ajaxRequestHeaders,
                                 data=self.param)
        if response.status_code == 200 or response.status_code == 304:
            self.validip_que.put(validip)
            self.data = response.text
            result = json.loads(self.data)  # renamed: don't shadow builtin dict
            self.cn_word = result['trans_result']['data'][0]['dst']
            return True
        else:
            # Bug fix: added the missing "{}" so the status code actually
            # appears in the message (format() was a no-op before).
            raise StatusCodeError("翻译模块sendAjaxRequest状态码错误,错误状态码为{}".format(response.status_code))
    except Exception as e:
        repeat_count += 1
        if repeat_count < 3:
            if "ACCESS LIMIT" in self.data:
                print("出现ACCESS LIMIT,暂停20秒")
                time.sleep(20)
                # Bug fix: propagate the retry's result — the original
                # dropped it, so callers saw None even on a successful retry.
                return self.sendAjaxRequest(repeat_count)
            else:
                print("翻译异常,异常内容为{}正在进行第{}次翻译,data为{}".format(str(e), repeat_count, str(self.data)))
                return self.sendAjaxRequest(repeat_count)
        else:
            # Bug fix: missing "{}" placeholder here as well.
            print("翻译异常次数超过上限,即将添加到翻译失败数据表,data为{}".format(self.data))
            return False
def getHtmlCode(self):
    """Fetch the translation page source via proxy and cache it on self.html."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(self.htmlUrl, proxies=proxy, headers=self.htmlRequestHeaders)
        if response.status_code == 200 or response.status_code == 304:
            self.validip_que.put(validip)
            response.encoding = 'utf-8'
            self.html = response
        else:
            # Bug fix: added the missing "{}" so the status code actually
            # appears in the message (format() was a no-op before).
            raise StatusCodeError("翻译模块获取htmlcode状态码错误,错误状态码为{}".format(response.status_code))
    except Exception as e:
        print("获取html出现异常,异常内容为{}".format(str(e)))
def getListHtml(self, url, repeat_count=0):
    """Fetch one duitang blob-list page and download every image in it.

    Image path/description pairs are pulled out of the embedded JSON with
    a regex rather than a JSON parser.
    """
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,状态码为{}".format(
                resp.status_code))
        resp.encoding = "utf-8"
        # Page number is derived from the "start" offset (120 items/page).
        start_no = int(url.split("&start=")[-1].replace("&_=1550562108374", ""))
        page_no = math.floor(start_no / 120) + 1
        page_dir = "C:/duitang/page" + str(page_no)
        self.makeDir(page_dir, "列表页根目录")
        imgInfos = re.findall(
            r'[\S\s\\s\r\n]{0,}?"path":"([\S\s]*?)","id"[\S\s\\s\r\n]{0,}?',
            resp.text, re.I)
        for info in imgInfos:
            start_time = time.time()
            src = info.split('"},"msg":"')[0]
            alt = info.split('"},"msg":"')[-1]
            for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">",
                        "|", "?", "?", ":"]:
                alt = alt.replace(bad, "")
            if not alt:
                # Fall back to the file's base name when there's no caption.
                alt = src.split("/")[-1].split(".")[0]
            alt = alt[0:16]
            ext = src.split("/")[-1].split(".")[-1].replace("jpeg", "jpg")
            file_path = page_dir + "/" + alt + "." + ext
            if not os.path.exists(file_path):
                self.downloadImage(src, file_path)
    except BaseException as e:
        print("列表页下载异常,错误信息为{},错误行号为{}".format(str(e),
                                                e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, arc_url, page_dir, repeat_count=0, flag=0):
    """Fetch an article page, download its images, and follow pagination.

    flag=1 marks a pagination sub-page so we don't recurse into paging
    again; retries up to 3 times on failure.
    """
    try:
        response = requests.get(arc_url, timeout=5)
        if response.status_code == 200:
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select('.works_img')
            index = 0
            for img in img_list:
                index += 1
                src = img.get("src")
                alt = img.get("alt")
                invalid_str_arr = [
                    "/", ".", "\\", "\r\n", "。", ":", "*", ":", '"', "<",
                    ">", "|", "?", " | "
                ]
                for invalid_str in invalid_str_arr:
                    # Bug fix: str.replace returns a new string; the original
                    # discarded the result, so alt was never sanitized.
                    alt = alt.replace(invalid_str, "")
                file_extension_name = src.split("/")[-1].split(".")[-1]
                file_name = "{}-{}.{}".format(alt, index, file_extension_name)
                file_path = page_dir + "/" + file_name
                self.downloadImage(src, file_path)
            # First page only: walk the remaining pagination pages.
            if flag == 0 and len(soup.select(".paging_lists")) != 0:
                for i in range(2, len(soup.select(".paging_lists")) + 1):
                    pageurl = arc_url.replace(".html", "_%d.html" % (i))
                    # Bug fix: the original recursed on arc_url, re-fetching
                    # page 1 instead of the computed pagination URL.
                    self.getArcHtml(pageurl, page_dir, repeat_count=0, flag=1)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("详情页请求异常,错误信息为{},所在行号为{}".format(str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("%s表情详情页下载失败,正在进行第%d次重新下载!" % (arc_url, repeat_count))
            self.getArcHtml(arc_url, page_dir, repeat_count, flag=flag)
        else:
            print("%s表情详情页下载失败" % (arc_url))
            # Consistency fix: record the failed article URL, not the local
            # directory, matching every other sqlInsertFailedUrl call site.
            self.sqlInsertFailedUrl(arc_url, "article")
def getApiText(self, url, tag, repeat_count=0):
    """Query the GIF search API and push new image records onto the queues."""
    try:
        resp = requests.get(url, timeout=50)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,错误状态码为{}".format(
                resp.status_code))
        image_arr = json.loads(resp.text)["data"]
        print("成功获取{}个image对象".format(len(image_arr)))
        num = 0
        failed_num = 0
        for image in image_arr:
            src = image["images"]["downsized"]["url"]
            existed = self.img_src_check_que.put_unique(src)
            self.redis.sadd("img_src_check", src)
            if existed == 0:
                title = image["title"].split("GIF")[0]
                en_img_info = "{}********{}********{}".format(src, title, tag)
                if self.en_img_info_que.put_unique(en_img_info) == 0:
                    self.redis.sadd("en_img_info", en_img_info)
                    num += 1
                else:
                    failed_num += 1
                    print("图片地址检测程序出现异常")
            else:
                failed_num += 1
                self.failed_count += 1
                print("插入失败,记录{}已存在!累计失败次数为{}".format(
                    src, self.failed_count))
        print("{}成功插入{}条图片到英文队列,插入失败条数{},当前英文队列总量{},{}类目累计重复次数为{}".
              format(self.threadName, num, failed_num,
                     self.en_img_info_que.qsize(), tag, self.failed_count))
    except Exception as e:
        print("{}api接口{}请求异常,错误信息为{}".format(self.threadName, url, str(e)))
        repeat_count += 1
        if repeat_count < 3:
            print("api接口{}请求失败,正在进行第{}次重新下载!".format(url, repeat_count))
            self.getApiText(url, tag, repeat_count)
        else:
            print("api接口{}请求失败,添加至数据库".format(url))
            self.redis.sadd("failed_api_url", url)
def getListHtml(self, url, repeat_count=0):
    """Fetch one fabiaoqing list page with a rotating UA and download images."""
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.fabiaoqing.com",
        "Upgrade-Insecure-Requests": "1",
        # Rotate through the 17 configured user agents.
        "User-Agent": "%s" % (self.userAgents[self.count % 17])
    }
    try:
        response = requests.get(url, headers=headers, timeout=9)
        if response.status_code == 200:
            page_no = url.split("/")[-1].replace(".html", "")
            page_dir = "C:/fabiaoqing/page" + page_no
            self.makeDir(page_dir, "列表页文件夹")
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select(".lazy")
            for img in img_list:
                src = img.get("data-original")
                alt = img.get("alt")
                invalid_str_arr = ["/", ".", "\\", "\r\n", "。", ":", "*",
                                   ":", '"', "<", ">", "|", "?", "?"]
                for invalid_str in invalid_str_arr:
                    # Bug fix: the original discarded str.replace's return
                    # value, so alt was never actually sanitized.
                    alt = alt.replace(invalid_str, "")
                file_extension_name = src.split("/")[-1].split(".")[-1]
                file_name = "{}.{}".format(alt, file_extension_name)
                file_path = page_dir + "/" + file_name
                self.downloadImage(src, file_path)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one sj33 illustration list page and crawl its article links.

    Pages before P219 and from P219 on use different list markup.
    """
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        page_no = url.replace("http://www.sj33.cn/cg/chys/", "").replace(".html", "")
        page_dir = "C:/sj33/page{}".format(page_no)
        self.makeDir("C:/sj33/page{}".format(page_no), "列表文件夹")
        resp.encoding = "utf-8"
        html = etree.HTML(resp.text)
        page_no = int(url.replace("http://www.sj33.cn/cg/chys/P",
                                  "").replace(".html", ""))
        if page_no < 219:
            anchors = list(set(html.xpath('//div[@id="typelink3"]/a[1]')))
        else:
            anchors = list(set(html.xpath('//ul[@class="imglist"]/li/a[1]')))
        for a in anchors:
            self.getArcHtml("http://www.sj33.cn" + a.get("href"), page_dir)
    except BaseException as e:
        print("{}请求列表页异常,错误信息为{},行号为{}".format(self.threadName, str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("{}列表页{}下载失败,正在进行第{}次重新下载!".format(
                self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("{}列表页{}下载失败".format(self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, url, page_url, repeat_count=0):
    """Fetch a chinaz expression article and download its images."""
    start_time = time.time()
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")
        title = soup.select(".text_wrap h2")[0].text
        # Rebuild the page directory name the list crawler created.
        if page_url != "http://sc.chinaz.com/biaoqing/index.html":
            page_dir = "C:/chinaz/index_%s" % (
                page_url.split("index_")[-1].replace(".html", ""))
        else:
            page_dir = "C:/chinaz/index"
        for index, img in enumerate(soup.select('.down_img img'), start=1):
            src = img.get("src")
            for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">",
                        "|", "?", "?", ":"]:
                title = title.replace(bad, "")
            ext = src.split("/")[-1].split(".")[-1]
            file_name = title + "-%d%s" % (index, ".") + ext
            self.downloadImage(src, page_dir + "/" + file_name)
    except BaseException as e:
        print("%s详情页请求异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s表情详情页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getArcHtml(url, page_url, repeat_count)
        else:
            print("%s表情详情页%s下载失败" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "article")
def getListHtml(self, url, repeat_count=0):
    """Fetch one 588ku list page and download each lazy-loaded image."""
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            page_no = url.replace(
                "http://588ku.com/sucai/0-default-0-0-renwubiaoqingbao-0-",
                "").replace("/", "")
            page_dir = "C:/588ku/page" + page_no
            self.makeDir(page_dir, "列表文件夹")
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select('.img-show .lazy')
            for img in img_list:
                src = "http:" + img.get("data-original")
                alt = img.get("alt")
                invalid_str_arr = [
                    "/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                    "?", "?", ":"
                ]
                for invalid_str in invalid_str_arr:
                    alt = alt.replace(invalid_str, "")
                file_extension_name = src.split("!")[0].split(
                    "/")[-1].split(".")[-1]
                file_name = alt + "." + file_extension_name
                file_path = page_dir + "/" + file_name
                self.downloadImage(src, file_path)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s列表页请求异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            # Bug fix: arguments were swapped — ("list", url) — while every
            # other call site in this file passes (url, kind).
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self,url,page_dir,pagination_no=1,repeat_count=0,pagination=False): start_time=time.time() # validip = self.validip_que.get() # proxy = {'http': validip} try: response = requests.get(url,timeout=7) if response.status_code == 200: # self.validip_que.put(validip) response.encoding = "utf-8" soup = BeautifulSoup(response.text, "lxml") title = soup.select(".title h1")[0].text.replace("|","").replace("/","").replace(r":","").replace("*","").replace("?","").replace("<","").replace(">","").replace('"',"").replace('\\',"") img_list = soup.select('.face-list img') invalid_str_arr = ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|", "?", "?", ":"] for invalid_str in invalid_str_arr: title = title.replace(invalid_str, "") index=1 for img in img_list: src=img.get("src") file_extension_name = src.split("!")[0].split("/")[-1].split(".")[-1] file_name = "{}-{}-{}.{}".format(title,pagination_no,index,file_extension_name) file_path = page_dir + "/" + file_name index+=1 self.downloadImage(src,file_path) # 如果存在分页 pagination_list=soup.select('.am-pagination li a') if not pagination: for index in range(1,len(pagination_list)): pagination_no+=1 pagination_url="https://www.jiuwa.net"+pagination_list[index].get("href") self.getArcHtml(pagination_url,page_dir,pagination_no,repeat_count=0,pagination=True) else: raise StatusCodeError("状态码错误") except BaseException as e: print("{}详情页{}下载异常,错误信息为{},错误行号为" .format(self.threadName,url, str(e)),e.__traceback__.tb_lineno) # self.validip_que.get(validip) repeat_count += 1 if repeat_count < 4: print("{}表情详情页下载失败,正在进行第{}次重新下载!" .format(url, repeat_count)) self.getArcHtml(url, page_dir,pagination_no,repeat_count,pagination) else: print("{}表情详情页下载失败" .format(url)) self.sqlInsertFailedUrl(url,"article")
def getPaginationHtml(self, pageUrl, repeat_count=0):
    """Fetch a youmeitu category pagination page and crawl its articles."""
    validip = self.validip_que.get()
    # NOTE(review): proxy is built but never passed to requests.get below —
    # possibly disabled on purpose; confirm before re-enabling.
    proxy = {'http': validip}
    try:
        response = requests.get(pageUrl, timeout=8)
        if response.status_code == 200:
            self.validip_que.put(validip)
            category_list_str = pageUrl.replace("http://www.youmeitu.com/", "").replace(".html", "")
            pathArr = category_list_str.split("/")
            category = pathArr[0]
            # Renamed from `list` to stop shadowing the builtin.
            list_no = pathArr[1] if len(pathArr) == 2 else "list_1"
            list_dir = "C:/youmeitu/{}_{}".format(category, list_no)
            self.makeDir(list_dir, "栏目页文件夹")
            response.encoding = "utf-8"
            html = etree.HTML(response.text)
            # Two alternative list layouts; use whichever matches.
            arcLinkTags1 = html.xpath(
                "//div[@class='TypeList'][1]//a[@class='TypeBigPics']")
            arcLinkTags2 = html.xpath(
                "//ul[@class='g-gxlist-imgbox'][1]//a")
            arcLinkTags = []
            if len(arcLinkTags1) != 0:
                arcLinkTags = arcLinkTags1
            elif len(arcLinkTags2) != 0:
                arcLinkTags = arcLinkTags2
            for arcLinkTag in arcLinkTags:
                arcUrl = "http://www.youmeitu.com" + arcLinkTag.get("href")
                self.getArcHtml(arcUrl, list_dir)
            self.sqlInsert(pageUrl, "complete_categoryList")
        else:
            raise StatusCodeError("状态码错误,状态码为{}".format(
                response.status_code))
    except BaseException as e:
        print("{}列表页下载异常,错误信息为{}".format(self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip) — it blocked and threw
        # away an extra good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("{}开始第{}次重新下载!".format(self.name, repeat_count))
            self.getPaginationHtml(pageUrl, repeat_count)
        else:
            print("{}列表页\r\n{}添加到失败数据表".format(self.threadName, pageUrl))
            self.sqlInsert(pageUrl, "failed_categoryList")
def getListHtml(self, url, repeat_count=0):
    """Fetch one ibaotu list page and download every preview image on it."""
    try:
        resp = requests.get(url, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,状态码为{}".format(
                resp.status_code))
        page_no = url.split("/")[-1].replace(".html", "")
        page_dir = "C:/ibaotu/page" + page_no
        self.makeDir(page_dir, "列表页文件夹")
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")
        for img in soup.select(".pic-box dt img"):
            start_time = time.time()
            src = "https:" + img.get('data-url').split("-0.jpg")[0]
            alt = img.get("alt")
            for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">",
                        "|", "?", "?", ":"]:
                alt = alt.replace(bad, "")
            ext = src.split("!")[0].split("/")[-1].split(".")[-1]
            self.downloadImage(src, page_dir + "/" + alt + "." + ext)
    except BaseException as e:
        print("%s列表页%s下载异常,错误信息为%s" % (self.threadName, url, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one dribbble list page and download every shot image found."""
    try:
        resp = requests.get(url, timeout=20)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        resp.encoding = "utf-8"
        # Pull the first srcset URL out of each <picture> element.
        srcset_urls = re.findall(
            r'<picture>[\S\s\\s\r\n]{0,}?<source srcset="([\S\s\\s\r\n]{0,}?)" media=',
            resp.text, re.I)
        for img_url in srcset_urls:
            self.downloadImage(img_url,
                               "D:/dribbble/" + img_url.split("/")[-1])
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, url, repeat_count=0):
    """Fetch a bee-ji detail page via proxy and download its single image.

    Pages whose image resolves to the sentinel "undefined" URL are recorded
    as failed articles instead of downloaded.
    """
    start_time = time.time()
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=16)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img = soup.select('.jss5 img')[0]
            src = img.get("src")
            if (src != "http://image.bee-ji.com/undefined"):
                alt = img.get("alt")
                invalid_str_arr = [
                    "/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                    "?", "?", ":"
                ]
                for invalid_str in invalid_str_arr:
                    alt = alt.replace(invalid_str, "")
                self.downloadImage(src, alt)
            else:
                print("详情页图片失效%s" % (url))
                self.sqlInsertFailedUrl(url, "article")
        else:
            raise StatusCodeError("%s状态码错误,返回状态码为%d" % (url, response.status_code))
    except BaseException as e:
        print("%s详情页请求异常,错误信息为%s" % (self.name, str(e)))
        # Bug fix: removed validip_que.get(validip) — Queue.get() takes a
        # `block` flag, not an item, so this discarded a second good proxy.
        repeat_count += 1
        if repeat_count < 4:
            print("%s表情详情页%s下载失败,正在进行第%d次重新下载!" % (self.name, url, repeat_count))
            self.getArcHtml(url, repeat_count)
        else:
            print("%s表情详情页%s下载失败" % (self.name, url))
            self.sqlInsertFailedUrl(url, "article")
def downloadImage(self, url, alt, repeat_count=0):
    """Download one bee-ji image via proxy into an hour-stamped folder.

    The extension is inferred from the Content-Type header; unsupported
    types are silently skipped.
    """
    start_time = time.time()
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=15)
        if response.status_code == 200:
            self.validip_que.put(validip)
            dir_name = time.strftime('%Y%m%d%H', time.localtime(time.time()))
            dir_path = "C:/bee-ji/%s" % (dir_name)
            self.makeDir(dir_path, "列表目录")
            extension = response.headers.get("Content-Type").replace(
                "image/", "").replace("jpeg", "jpg")
            if extension in ["jpg", "gif", "png", "bmp", "webp"]:
                file_path = "{}/{}.{}".format(dir_path, alt, extension)
                with open(file_path, "wb") as f:
                    f.write(response.content)
                end_time = time.time()
                inter = end_time - start_time
                print("%s成功下载图片%s,共花费%f秒" % (self.name, file_path, inter))
        else:
            raise StatusCodeError("%s状态码错误,返回状态码为%d" % (url, response.status_code))
    except BaseException as e:
        print("%s图片下载异常,错误信息为%s" % (self.name, str(e)))
        # Bug fix: removed the stray validip_que.get(validip) call that
        # discarded an extra good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("%s图片%s下载失败,正在进行第%d次重新下载!" % (self.name, url, repeat_count))
            # Bug fix: the original retried with downloadImage(url, repeat_count),
            # passing the retry counter as `alt` and losing the file name.
            self.downloadImage(url, alt, repeat_count)
        else:
            print("%s图片%s下载失败,添加至数据库" % (self.name, url))
            self.sqlInsertFailedUrl(url, "image")
def getArcHtml(self, arc_url, list_dir, repeat_count=0):
    """Fetch a youmeitu article page, download its images, follow "下一页".

    404 responses are silently skipped; other bad statuses trigger up to
    three retries before the URL is recorded as a failed article.
    """
    start_time = time.time()
    validip = self.validip_que.get()
    # NOTE(review): proxy is built but not passed to requests.get below —
    # confirm whether proxying was disabled on purpose.
    proxy = {'http': validip}
    try:
        response = requests.get(arc_url, timeout=8)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            html = etree.HTML(response.text)
            img_list = html.xpath("//p[@align='center'][1]/img")
            title = html.xpath("//title/text()")[0]
            invalid_str_arr = [
                "/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                "?", "?", ":"
            ]
            for invalid_str in invalid_str_arr:
                title = title.replace(invalid_str, "")
            title = title.replace("_优美图", "")
            index = 0
            for img in img_list:
                index += 1
                src = img.get("src")
                file_extension_name = ""
                # Truncate anything after the real extension.
                # Bug fix: the original used independent "if" statements with
                # ".jpg" tested first, so ".jpeg" URLs matched ".jpg" and were
                # truncated to a wrong URL; check ".jpeg" first and use elif.
                if ".jpeg" in src:
                    src = src.split(".jpeg")[0] + ".jpeg"
                    file_extension_name = "jpg"
                elif ".jpg" in src:
                    src = src.split(".jpg")[0] + ".jpg"
                    file_extension_name = "jpg"
                elif ".png" in src:
                    src = src.split(".png")[0] + ".png"
                    file_extension_name = "png"
                elif ".gif" in src:
                    src = src.split(".gif")[0] + ".gif"
                    file_extension_name = "gif"
                elif ".bmp" in src:
                    src = src.split(".bmp")[0] + ".bmp"
                    file_extension_name = "bmp"
                if "http" in src and "|||" not in src:
                    # Bug fix: include the image index — the old title-only
                    # name made every image in a page overwrite the previous.
                    file_name = title + "-" + str(index) + "." + file_extension_name
                    file_path = list_dir + "/" + file_name
                    self.downloadImage(src, file_path)
            # Follow the "next page" link of a paginated article.
            if len(
                    html.xpath(
                        "//div[@class='NewPages']/ul/li/a[text()='下一页']")
            ) != 0:
                arc_file = arc_url.split("/")[-1]
                next_link = arc_url.replace(arc_file, "") + \
                    html.xpath("//div[@class='NewPages']/ul/li/a[text()='下一页']")[0].get("href")
                self.getArcHtml(next_link, list_dir, repeat_count=0)
        else:
            if response.status_code == 404:
                pass  # dead article; nothing to download
            else:
                raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("{}详情页下载异常,错误信息为{},错误行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        # Bug fix: removed validip_que.get(validip) — it blocked and threw
        # away a second good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("{}{}\r\n图片详情页开始第{}次重新下载!".format(
                self.threadName, arc_url, repeat_count))
            self.getArcHtml(arc_url, list_dir, repeat_count)
        else:
            print("{}{}\r\n图片详情页下载失败".format(self.threadName, arc_url))
            self.sqlInsert(arc_url, "failed_article")