def getListHtml(self, url, repeat_count=0):
    """Fetch a huaban-style list page through a proxy and extract image URLs.

    Retries up to 3 times on failure, then records the url in the
    failure table with kind "list".
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            # Proxy worked — return it to the pool.
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            imgurls = re.findall(
                r'"file":{"bucket":"hbimg", "key":"([\S\s\\s\r\n]{0,}?)",',
                response.text, re.I)
            for imgurl in imgurls:
                src = "http://img.hb.aicdn.com/" + imgurl
                # NOTE(review): this records every extracted src as a failed
                # "image" instead of downloading it — looks like a bug, but
                # the intended download call is not visible here; confirm.
                self.sqlInsertFailedUrl(url, "image")
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        # Bug fix: removed validip_que.get(validip) — Queue.get() takes a
        # `block` flag, not an item, so the call blocked and discarded a
        # second, still-valid proxy (the failed one was already dequeued).
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def downloadImage(self, src, file_path, repeat_count=0):
    """Download one image through a proxy and save it to file_path.

    Retries up to 3 times on failure, then records src in the failure
    table with kind "image".
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        start_time = time.time()
        response = requests.get(src, proxies=proxy)
        if response.status_code == 200:
            # Consistency fix: siblings return a working proxy to the pool
            # on success; this method leaked it.
            self.validip_que.put(validip)
            img_content = response.content
            with open(file_path, "wb") as f:
                f.write(img_content)
            end_time = time.time()
            inter = end_time - start_time
            print("%s成功下载图片%s,共花费%f秒" % (self.threadName, file_path, inter))
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s图片下载异常,错误信息为%s" % (self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip) — it blocked and threw
        # away an extra good proxy (the bad one was already dequeued).
        repeat_count += 1
        if repeat_count < 4:
            print("%s图片下载抛出异常,正在进行第%d次重新下载!" % (src, repeat_count))
            self.downloadImage(src, file_path, repeat_count)
        else:
            print("%s图片下载失败,将添加下载失败信息到数据表" % (src))
            self.sqlInsertFailedUrl(src, "image")
def downloadImage(self, src, file_path, repeat_count=0):
    """Download one image (no proxy) and save it to file_path.

    Retries up to 3 times on failure, then records src in the failure
    table with kind "image".
    """
    try:
        start_time = time.time()
        response = requests.get(src)
        if response.status_code == 200:
            img_content = response.content
            with open(file_path, "wb") as f:
                f.write(img_content)
            end_time = time.time()
            inter = end_time - start_time
            print("{}成功下载图片{},共花费{}秒".format(self.threadName, file_path, inter))
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        # Bug fix: the message lacked a "{}" placeholder, so str.format
        # silently dropped the line-number argument.
        print("{}图片下载异常,错误信息为{},行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("{}图片{}下载抛出异常,正在进行第{}次重新下载!".format(
                self.threadName, src, repeat_count))
            self.downloadImage(src, file_path, repeat_count)
        else:
            print("{}图片{}下载失败,将添加下载失败信息到数据表".format(self.threadName, src))
            self.sqlInsertFailedUrl(src, "image")
def getListHtml(self, url, repeat_count=0):
    """Fetch one cndesign search-result page and crawl every work link on it."""
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # Local folder name comes from the page number embedded in the URL.
        page_no = url.replace(
            "http://www.cndesign.com/Query/Works?key=%E6%8F%92%E7%94%BB&page=",
            "")
        page_dir = "C:/cdndesign/page" + page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "utf-8"
        for anchor in BeautifulSoup(resp.text, "lxml").select(".pl_img_box a"):
            self.getArcHtml("http://www.cndesign.com" + anchor.get("href"),
                            page_dir)
    except BaseException as e:
        print("列表页请求异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, arc_url, page_dir, repeat_count=0):
    """Fetch an article page via proxy and download every image in it.

    Skips images whose target file already exists; retries up to 3 times
    on failure, then records arc_url as a failed "article".
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(arc_url, proxies=proxy, timeout=15)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select('.pic-content img')
            for img in img_list:
                src = img.get("src")
                if src:
                    alt = img.get("alt")
                    # Strip characters Windows forbids in file names.
                    invalid_str_arr = ["/", ".", "\\", "\r\n", "。", "*", '"',
                                       "<", ">", "|", "?", "?", ":"]
                    for invalid_str in invalid_str_arr:
                        alt = alt.replace(invalid_str, "")
                    file_extension_name = src.split("/")[-1].split(".")[-1]
                    file_name = "{}.{}".format(alt, file_extension_name)
                    file_path = page_dir + "/" + file_name
                    if not os.path.exists(file_path):
                        self.downloadImage(src, file_path)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("{}详情页下载异常,错误信息为{},所在行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        # Bug fix: removed validip_que.get(validip), which blocked and threw
        # away a second good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("%s详情页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, arc_url, repeat_count))
            self.getArcHtml(arc_url, page_dir, repeat_count)
        else:
            print("%s详情页%s下载失败" % (self.threadName, arc_url))
            self.sqlInsertFailedUrl(arc_url, "article")
def getListHtml(self, url, repeat_count=0):
    """Fetch one huiyi8 list page via proxy and crawl its article links."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            page_no = url.replace("https://www.huiyi8.com/qqbq/", "").replace(".html", "")
            page_dir = "C:/huiyi8/page{}".format(page_no)
            self.makeDir(page_dir, "列表文件夹")
            # Proxy worked — return it to the pool.
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            a_list = soup.select(".works-img-box")
            for a in a_list:
                arc_url = a.get("href")
                self.getArcHtml(arc_url, page_dir)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip) — it consumed an extra
        # good proxy; the failed one was already out of the queue.
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one lanrentuku list page via proxy and crawl its article links."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            self.validip_que.put(validip)
            page_no = url.split("%B1%ED%C7%E9%B0%FC/")[-1].replace("/", "")
            page_dir = "C:/lanrentuku/page" + page_no
            self.makeDir(page_dir, "列表文件夹")
            response.encoding = "gb2312"  # site serves GB2312
            soup = BeautifulSoup(response.text, "lxml")
            a_list = soup.select(".list-qq dl dd a")
            for a in a_list:
                arc_url = "http://www.lanrentuku.com/%s" % (a.get("href"))
                self.getArcHtml(arc_url, page_dir)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        # Consistency fix: every sibling logs the exception before retrying;
        # this method silently swallowed it.
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip), which discarded an
        # extra good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one zcool search-result page and crawl every work card on it."""
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # Page number = URL minus the fixed search prefix and "#tab_anchor".
        page_no = url.replace(
            "https://www.zcool.com.cn/search/content?type=0&field=0&other=0&sort=5&word=%E6%8F%92%E7%94%BB&recommend=0&requestId=requestId_1550647688974&p=",
            "").replace("#tab_anchor", "")
        page_dir = "C:/zcool/page" + page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "utf-8"
        for card in BeautifulSoup(resp.text, "lxml").select(".card-img-hover"):
            self.getArcHtml(card.get("href"), page_dir)
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, arc_url, repeat_count=0):
    """Fetch a QQ-expression article via proxy and download all its images."""
    start_time = time.time()
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        resp = requests.get(arc_url, proxies=proxy, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        self.validip_que.put(validip)
        resp.encoding = "gb2312"
        soup = BeautifulSoup(resp.text, "lxml")
        images = soup.select('.content_word img')
        # Sanitize the article title for use in Windows file names.
        title = soup.select('h3')[0].text
        for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                    "?", "?", ":"]:
            title = title.replace(bad, "")
        for index, img in enumerate(images, start=1):
            src = img.get("src")
            alt = img.get("alt")
            ext = src.split("!")[0].split("/")[-1].split(".")[-1]
            file_name = "{}-{}.{}".format(title, index, ext)
            file_path = "C:/QQ/qq" + "/" + file_name
            self.downloadImage(src, file_path)
    except BaseException as e:
        print("%s详情页下载异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s详情页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, arc_url, repeat_count))
            self.getArcHtml(arc_url, repeat_count)
        else:
            print("%s详情页%s下载失败" % (self.threadName, arc_url))
            self.sqlInsertFailedUrl(arc_url, "article")
def getListHtml(self, url, repeat_count=0):
    """Fetch one jiuwa face-list page and crawl each article on it."""
    try:
        resp = requests.get(url, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        page_no = url.replace("https://www.jiuwa.net/face/p-", "")
        page_dir = "C:/jiuwa/index_" + page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "utf-8"
        for link in BeautifulSoup(resp.text, "lxml").select(".title a"):
            self.getArcHtml("https://www.jiuwa.net" + link.get("href"),
                            page_dir)
    except BaseException as e:
        print("{}列表页下载异常,错误信息为{},错误行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("{}列表页{}下载失败,正在进行第{}次重新下载!".format(
                self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("{}列表页{}下载失败,添加至数据库".format(self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one ooopic search page and crawl every result link."""
    try:
        resp = requests.get(url, timeout=8)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # The first page has a special URL; every other page embeds its
        # number between two fixed URL fragments.
        if url != "http://so.ooopic.com/sousuo/2000346/":
            page_no = url.replace(
                "http://so.ooopic.com/search-b1edc7e9b0fc-0-0_0__0__0_ooo_0_",
                "").replace("_0_0_0_0_0_0_0___.html", "")
        else:
            page_no = 1
        page_dir = "C:/oopic/page%s" % page_no
        self.makeDir(page_dir, "列表文件夹")
        resp.encoding = "gbk"
        for link in BeautifulSoup(resp.text, "lxml").select(".datapic"):
            self.getArcHtml(link.get("href"), page_dir)
    except BaseException as e:
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one qqjia list page via proxy and print each article URL.

    The article crawl itself is currently disabled (commented out).
    """
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "gb2312"
            soup = BeautifulSoup(response.text, "lxml")
            a_list = soup.select(".txt")
            for a in a_list:
                # Links already absolute on qq.qqjia.com are kept as-is.
                arc_url = a.get("href") if "qq.qqjia.com" in a.get("href") \
                    else "http://www.qqjia.com/" + a.get("href")
                print(arc_url)
                # self.getArcHtml(arc_url)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("{}请求列表页异常,错误信息为{},行号为{}".format(self.threadName, str(e), e.__traceback__.tb_lineno))
        # Bug fix: removed validip_que.get(validip) — it blocked and
        # discarded an extra good proxy (the failed one was never re-queued).
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one doutula article-list page via proxy and crawl its items."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        resp = requests.get(url, proxies=proxy, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,状态码为%d" % (resp.status_code))
        self.validip_que.put(validip)
        page_no = url.replace("https://www.doutula.com/article/list/?page=", "")
        page_dir = "C:/doutula/page" + page_no
        self.makeDir(page_dir, "列表页文件夹")
        resp.encoding = "utf-8"
        for item in BeautifulSoup(resp.text, "lxml").select(".list-group-item"):
            arc_url = item.get("href")
            if arc_url:
                self.getArcHtml(arc_url, page_dir)
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one chinaz biaoqing list page and crawl each article on it."""
    try:
        resp = requests.get(url, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        # The front page has no "index_N" part; map it to a fixed folder.
        if url != "http://sc.chinaz.com/biaoqing/index.html":
            page_dir = "C:/chinaz/index_%s" % (
                url.split("index_")[-1].replace(".html", ""))
        else:
            page_dir = "C:/chinaz/index"
        self.makeDir(page_dir, "列表目录")
        resp.encoding = "utf-8"
        for link in BeautifulSoup(resp.text, "lxml").select(".num_2 a"):
            self.getArcHtml(link.get("href"), url)
    except BaseException as e:
        print("%s列表页请求异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def sendAjaxRequest(self, repeat_count=0):
    """POST the translation AJAX request and store the result on self.cn_word.

    Returns True on success, False after 3 failed attempts. Pauses 20s and
    retries when the service answers with an ACCESS LIMIT payload.
    """
    self.param = {
        "from": 'en',
        "to": 'zh',
        "query": self.en_word,
        "transtype": "translang",
        "simple_means_flag": "3",
        "sign": self.sign,
        "token": self.token
    }
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.post(self.ajaxUrl, proxies=proxy,
                                 headers=self.ajaxRequestHeaders,
                                 data=self.param)
        if response.status_code == 200 or response.status_code == 304:
            self.validip_que.put(validip)
            self.data = response.text
            result = json.loads(self.data)  # renamed: don't shadow builtin dict
            self.cn_word = result['trans_result']['data'][0]['dst']
            return True
        else:
            # Bug fix: added the missing "{}" so the status code actually
            # appears in the message (format() was a no-op before).
            raise StatusCodeError("翻译模块sendAjaxRequest状态码错误,错误状态码为{}".format(response.status_code))
    except Exception as e:
        repeat_count += 1
        if repeat_count < 3:
            if "ACCESS LIMIT" in self.data:
                print("出现ACCESS LIMIT,暂停20秒")
                time.sleep(20)
                # Bug fix: propagate the retry's result — the original
                # dropped it, so callers saw None even on a successful retry.
                return self.sendAjaxRequest(repeat_count)
            else:
                print("翻译异常,异常内容为{}正在进行第{}次翻译,data为{}".format(str(e), repeat_count, str(self.data)))
                return self.sendAjaxRequest(repeat_count)
        else:
            # Bug fix: missing "{}" placeholder here as well.
            print("翻译异常次数超过上限,即将添加到翻译失败数据表,data为{}".format(self.data))
            return False
def getHtmlCode(self):
    """Fetch the translation page source via proxy and cache it on self.html."""
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(self.htmlUrl, proxies=proxy, headers=self.htmlRequestHeaders)
        if response.status_code == 200 or response.status_code == 304:
            self.validip_que.put(validip)
            response.encoding = 'utf-8'
            self.html = response
        else:
            # Bug fix: added the missing "{}" so the status code actually
            # appears in the message (format() was a no-op before).
            raise StatusCodeError("翻译模块获取htmlcode状态码错误,错误状态码为{}".format(response.status_code))
    except Exception as e:
        print("获取html出现异常,异常内容为{}".format(str(e)))
def getListHtml(self, url, repeat_count=0):
    """Fetch one duitang blob-list page and download every image in it.

    Image path/description pairs are pulled out of the embedded JSON with
    a regex rather than a JSON parser.
    """
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,状态码为{}".format(
                resp.status_code))
        resp.encoding = "utf-8"
        # Page number is derived from the "start" offset (120 items/page).
        start_no = int(url.split("&start=")[-1].replace("&_=1550562108374", ""))
        page_no = math.floor(start_no / 120) + 1
        page_dir = "C:/duitang/page" + str(page_no)
        self.makeDir(page_dir, "列表页根目录")
        imgInfos = re.findall(
            r'[\S\s\\s\r\n]{0,}?"path":"([\S\s]*?)","id"[\S\s\\s\r\n]{0,}?',
            resp.text, re.I)
        for info in imgInfos:
            start_time = time.time()
            src = info.split('"},"msg":"')[0]
            alt = info.split('"},"msg":"')[-1]
            for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">",
                        "|", "?", "?", ":"]:
                alt = alt.replace(bad, "")
            if not alt:
                # Fall back to the file's base name when there's no caption.
                alt = src.split("/")[-1].split(".")[0]
            alt = alt[0:16]
            ext = src.split("/")[-1].split(".")[-1].replace("jpeg", "jpg")
            file_path = page_dir + "/" + alt + "." + ext
            if not os.path.exists(file_path):
                self.downloadImage(src, file_path)
    except BaseException as e:
        print("列表页下载异常,错误信息为{},错误行号为{}".format(str(e),
                                                e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, arc_url, page_dir, repeat_count=0, flag=0):
    """Fetch an article page, download its images, and follow pagination.

    flag=1 marks a pagination sub-page so we don't recurse into paging
    again; retries up to 3 times on failure.
    """
    try:
        response = requests.get(arc_url, timeout=5)
        if response.status_code == 200:
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select('.works_img')
            index = 0
            for img in img_list:
                index += 1
                src = img.get("src")
                alt = img.get("alt")
                invalid_str_arr = [
                    "/", ".", "\\", "\r\n", "。", ":", "*", ":", '"', "<",
                    ">", "|", "?", " | "
                ]
                for invalid_str in invalid_str_arr:
                    # Bug fix: str.replace returns a new string; the original
                    # discarded the result, so alt was never sanitized.
                    alt = alt.replace(invalid_str, "")
                file_extension_name = src.split("/")[-1].split(".")[-1]
                file_name = "{}-{}.{}".format(alt, index, file_extension_name)
                file_path = page_dir + "/" + file_name
                self.downloadImage(src, file_path)
            # First page only: walk the remaining pagination pages.
            if flag == 0 and len(soup.select(".paging_lists")) != 0:
                for i in range(2, len(soup.select(".paging_lists")) + 1):
                    pageurl = arc_url.replace(".html", "_%d.html" % (i))
                    # Bug fix: the original recursed on arc_url, re-fetching
                    # page 1 instead of the computed pagination URL.
                    self.getArcHtml(pageurl, page_dir, repeat_count=0, flag=1)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("详情页请求异常,错误信息为{},所在行号为{}".format(str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("%s表情详情页下载失败,正在进行第%d次重新下载!" % (arc_url, repeat_count))
            self.getArcHtml(arc_url, page_dir, repeat_count, flag=flag)
        else:
            print("%s表情详情页下载失败" % (arc_url))
            # Consistency fix: record the failed article URL, not the local
            # directory, matching every other sqlInsertFailedUrl call site.
            self.sqlInsertFailedUrl(arc_url, "article")
def getApiText(self, url, tag, repeat_count=0):
    """Query the GIF search API and push new image records onto the queues."""
    try:
        resp = requests.get(url, timeout=50)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,错误状态码为{}".format(
                resp.status_code))
        image_arr = json.loads(resp.text)["data"]
        print("成功获取{}个image对象".format(len(image_arr)))
        num = 0
        failed_num = 0
        for image in image_arr:
            src = image["images"]["downsized"]["url"]
            existed = self.img_src_check_que.put_unique(src)
            self.redis.sadd("img_src_check", src)
            if existed == 0:
                title = image["title"].split("GIF")[0]
                en_img_info = "{}********{}********{}".format(src, title, tag)
                if self.en_img_info_que.put_unique(en_img_info) == 0:
                    self.redis.sadd("en_img_info", en_img_info)
                    num += 1
                else:
                    failed_num += 1
                    print("图片地址检测程序出现异常")
            else:
                failed_num += 1
                self.failed_count += 1
                print("插入失败,记录{}已存在!累计失败次数为{}".format(
                    src, self.failed_count))
        print("{}成功插入{}条图片到英文队列,插入失败条数{},当前英文队列总量{},{}类目累计重复次数为{}".
              format(self.threadName, num, failed_num,
                     self.en_img_info_que.qsize(), tag, self.failed_count))
    except Exception as e:
        print("{}api接口{}请求异常,错误信息为{}".format(self.threadName, url, str(e)))
        repeat_count += 1
        if repeat_count < 3:
            print("api接口{}请求失败,正在进行第{}次重新下载!".format(url, repeat_count))
            self.getApiText(url, tag, repeat_count)
        else:
            print("api接口{}请求失败,添加至数据库".format(url))
            self.redis.sadd("failed_api_url", url)
def getListHtml(self, url, repeat_count=0):
    """Fetch one fabiaoqing list page with a rotating UA and download images."""
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.fabiaoqing.com",
        "Upgrade-Insecure-Requests": "1",
        # Rotate through the 17 configured user agents.
        "User-Agent": "%s" % (self.userAgents[self.count % 17])
    }
    try:
        response = requests.get(url, headers=headers, timeout=9)
        if response.status_code == 200:
            page_no = url.split("/")[-1].replace(".html", "")
            page_dir = "C:/fabiaoqing/page" + page_no
            self.makeDir(page_dir, "列表页文件夹")
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select(".lazy")
            for img in img_list:
                src = img.get("data-original")
                alt = img.get("alt")
                invalid_str_arr = ["/", ".", "\\", "\r\n", "。", ":", "*",
                                   ":", '"', "<", ">", "|", "?", "?"]
                for invalid_str in invalid_str_arr:
                    # Bug fix: the original discarded str.replace's return
                    # value, so alt was never actually sanitized.
                    alt = alt.replace(invalid_str, "")
                file_extension_name = src.split("/")[-1].split(".")[-1]
                file_name = "{}.{}".format(alt, file_extension_name)
                file_path = page_dir + "/" + file_name
                self.downloadImage(src, file_path)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s列表页下载异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one sj33 illustration list page and crawl its article links.

    Pages before P219 and from P219 on use different list markup.
    """
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        page_no = url.replace("http://www.sj33.cn/cg/chys/", "").replace(".html", "")
        page_dir = "C:/sj33/page{}".format(page_no)
        self.makeDir("C:/sj33/page{}".format(page_no), "列表文件夹")
        resp.encoding = "utf-8"
        html = etree.HTML(resp.text)
        page_no = int(url.replace("http://www.sj33.cn/cg/chys/P",
                                  "").replace(".html", ""))
        if page_no < 219:
            anchors = list(set(html.xpath('//div[@id="typelink3"]/a[1]')))
        else:
            anchors = list(set(html.xpath('//ul[@class="imglist"]/li/a[1]')))
        for a in anchors:
            self.getArcHtml("http://www.sj33.cn" + a.get("href"), page_dir)
    except BaseException as e:
        print("{}请求列表页异常,错误信息为{},行号为{}".format(self.threadName, str(e), e.__traceback__.tb_lineno))
        repeat_count += 1
        if repeat_count < 4:
            print("{}列表页{}下载失败,正在进行第{}次重新下载!".format(
                self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("{}列表页{}下载失败".format(self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, url, page_url, repeat_count=0):
    """Fetch a chinaz expression article and download its images."""
    start_time = time.time()
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")
        title = soup.select(".text_wrap h2")[0].text
        # Rebuild the page directory name the list crawler created.
        if page_url != "http://sc.chinaz.com/biaoqing/index.html":
            page_dir = "C:/chinaz/index_%s" % (
                page_url.split("index_")[-1].replace(".html", ""))
        else:
            page_dir = "C:/chinaz/index"
        for index, img in enumerate(soup.select('.down_img img'), start=1):
            src = img.get("src")
            for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">",
                        "|", "?", "?", ":"]:
                title = title.replace(bad, "")
            ext = src.split("/")[-1].split(".")[-1]
            file_name = title + "-%d%s" % (index, ".") + ext
            self.downloadImage(src, page_dir + "/" + file_name)
    except BaseException as e:
        print("%s详情页请求异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s表情详情页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getArcHtml(url, page_url, repeat_count)
        else:
            print("%s表情详情页%s下载失败" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "article")
def getListHtml(self, url, repeat_count=0):
    """Fetch one 588ku list page and download each lazy-loaded image."""
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            page_no = url.replace(
                "http://588ku.com/sucai/0-default-0-0-renwubiaoqingbao-0-",
                "").replace("/", "")
            page_dir = "C:/588ku/page" + page_no
            self.makeDir(page_dir, "列表文件夹")
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img_list = soup.select('.img-show .lazy')
            for img in img_list:
                src = "http:" + img.get("data-original")
                alt = img.get("alt")
                invalid_str_arr = [
                    "/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                    "?", "?", ":"
                ]
                for invalid_str in invalid_str_arr:
                    alt = alt.replace(invalid_str, "")
                file_extension_name = src.split("!")[0].split(
                    "/")[-1].split(".")[-1]
                file_name = alt + "." + file_extension_name
                file_path = page_dir + "/" + file_name
                self.downloadImage(src, file_path)
        else:
            raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("%s列表页请求异常,错误信息为%s" % (self.threadName, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败,添加至数据库" % (self.threadName, url))
            # Bug fix: arguments were swapped — ("list", url) — while every
            # other call site in this file passes (url, kind).
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self,url,page_dir,pagination_no=1,repeat_count=0,pagination=False): start_time=time.time() # validip = self.validip_que.get() # proxy = {'http': validip} try: response = requests.get(url,timeout=7) if response.status_code == 200: # self.validip_que.put(validip) response.encoding = "utf-8" soup = BeautifulSoup(response.text, "lxml") title = soup.select(".title h1")[0].text.replace("|","").replace("/","").replace(r":","").replace("*","").replace("?","").replace("<","").replace(">","").replace('"',"").replace('\\',"") img_list = soup.select('.face-list img') invalid_str_arr = ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|", "?", "?", ":"] for invalid_str in invalid_str_arr: title = title.replace(invalid_str, "") index=1 for img in img_list: src=img.get("src") file_extension_name = src.split("!")[0].split("/")[-1].split(".")[-1] file_name = "{}-{}-{}.{}".format(title,pagination_no,index,file_extension_name) file_path = page_dir + "/" + file_name index+=1 self.downloadImage(src,file_path) # 如果存在分页 pagination_list=soup.select('.am-pagination li a') if not pagination: for index in range(1,len(pagination_list)): pagination_no+=1 pagination_url="https://www.jiuwa.net"+pagination_list[index].get("href") self.getArcHtml(pagination_url,page_dir,pagination_no,repeat_count=0,pagination=True) else: raise StatusCodeError("状态码错误") except BaseException as e: print("{}详情页{}下载异常,错误信息为{},错误行号为" .format(self.threadName,url, str(e)),e.__traceback__.tb_lineno) # self.validip_que.get(validip) repeat_count += 1 if repeat_count < 4: print("{}表情详情页下载失败,正在进行第{}次重新下载!" .format(url, repeat_count)) self.getArcHtml(url, page_dir,pagination_no,repeat_count,pagination) else: print("{}表情详情页下载失败" .format(url)) self.sqlInsertFailedUrl(url,"article")
def getPaginationHtml(self, pageUrl, repeat_count=0):
    """Fetch a youmeitu category pagination page and crawl its articles."""
    validip = self.validip_que.get()
    # NOTE(review): proxy is built but never passed to requests.get below —
    # possibly disabled on purpose; confirm before re-enabling.
    proxy = {'http': validip}
    try:
        response = requests.get(pageUrl, timeout=8)
        if response.status_code == 200:
            self.validip_que.put(validip)
            category_list_str = pageUrl.replace("http://www.youmeitu.com/", "").replace(".html", "")
            pathArr = category_list_str.split("/")
            category = pathArr[0]
            # Renamed from `list` to stop shadowing the builtin.
            list_no = pathArr[1] if len(pathArr) == 2 else "list_1"
            list_dir = "C:/youmeitu/{}_{}".format(category, list_no)
            self.makeDir(list_dir, "栏目页文件夹")
            response.encoding = "utf-8"
            html = etree.HTML(response.text)
            # Two alternative list layouts; use whichever matches.
            arcLinkTags1 = html.xpath(
                "//div[@class='TypeList'][1]//a[@class='TypeBigPics']")
            arcLinkTags2 = html.xpath(
                "//ul[@class='g-gxlist-imgbox'][1]//a")
            arcLinkTags = []
            if len(arcLinkTags1) != 0:
                arcLinkTags = arcLinkTags1
            elif len(arcLinkTags2) != 0:
                arcLinkTags = arcLinkTags2
            for arcLinkTag in arcLinkTags:
                arcUrl = "http://www.youmeitu.com" + arcLinkTag.get("href")
                self.getArcHtml(arcUrl, list_dir)
            self.sqlInsert(pageUrl, "complete_categoryList")
        else:
            raise StatusCodeError("状态码错误,状态码为{}".format(
                response.status_code))
    except BaseException as e:
        print("{}列表页下载异常,错误信息为{}".format(self.threadName, str(e)))
        # Bug fix: removed validip_que.get(validip) — it blocked and threw
        # away an extra good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("{}开始第{}次重新下载!".format(self.name, repeat_count))
            self.getPaginationHtml(pageUrl, repeat_count)
        else:
            print("{}列表页\r\n{}添加到失败数据表".format(self.threadName, pageUrl))
            self.sqlInsert(pageUrl, "failed_categoryList")
def getListHtml(self, url, repeat_count=0):
    """Fetch one ibaotu list page and download every preview image on it."""
    try:
        resp = requests.get(url, timeout=7)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误,状态码为{}".format(
                resp.status_code))
        page_no = url.split("/")[-1].replace(".html", "")
        page_dir = "C:/ibaotu/page" + page_no
        self.makeDir(page_dir, "列表页文件夹")
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")
        for img in soup.select(".pic-box dt img"):
            start_time = time.time()
            src = "https:" + img.get('data-url').split("-0.jpg")[0]
            alt = img.get("alt")
            for bad in ["/", ".", "\\", "\r\n", "。", "*", '"', "<", ">",
                        "|", "?", "?", ":"]:
                alt = alt.replace(bad, "")
            ext = src.split("!")[0].split("/")[-1].split(".")[-1]
            self.downloadImage(src, page_dir + "/" + alt + "." + ext)
    except BaseException as e:
        print("%s列表页%s下载异常,错误信息为%s" % (self.threadName, url, str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页%s下载失败,正在进行第%d次重新下载!" % (self.threadName, url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页%s下载失败" % (self.threadName, url))
            self.sqlInsertFailedUrl(url, "list")
def getListHtml(self, url, repeat_count=0):
    """Fetch one dribbble list page and download every shot image found."""
    try:
        resp = requests.get(url, timeout=20)
        if resp.status_code != 200:
            raise StatusCodeError("状态码错误")
        resp.encoding = "utf-8"
        # Pull the first srcset URL out of each <picture> element.
        srcset_urls = re.findall(
            r'<picture>[\S\s\\s\r\n]{0,}?<source srcset="([\S\s\\s\r\n]{0,}?)" media=',
            resp.text, re.I)
        for img_url in srcset_urls:
            self.downloadImage(img_url,
                               "D:/dribbble/" + img_url.split("/")[-1])
    except BaseException as e:
        print("列表页下载异常,错误信息为%s" % (str(e)))
        repeat_count += 1
        if repeat_count < 4:
            print("%s列表页下载失败,正在进行第%d次重新下载!" % (url, repeat_count))
            self.getListHtml(url, repeat_count)
        else:
            print("%s列表页下载失败,添加至数据库" % (url))
            self.sqlInsertFailedUrl(url, "list")
def getArcHtml(self, url, repeat_count=0):
    """Fetch a bee-ji detail page via proxy and download its single image.

    Pages whose image resolves to the sentinel "undefined" URL are recorded
    as failed articles instead of downloaded.
    """
    start_time = time.time()
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=16)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            img = soup.select('.jss5 img')[0]
            src = img.get("src")
            if (src != "http://image.bee-ji.com/undefined"):
                alt = img.get("alt")
                invalid_str_arr = [
                    "/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                    "?", "?", ":"
                ]
                for invalid_str in invalid_str_arr:
                    alt = alt.replace(invalid_str, "")
                self.downloadImage(src, alt)
            else:
                print("详情页图片失效%s" % (url))
                self.sqlInsertFailedUrl(url, "article")
        else:
            raise StatusCodeError("%s状态码错误,返回状态码为%d" % (url, response.status_code))
    except BaseException as e:
        print("%s详情页请求异常,错误信息为%s" % (self.name, str(e)))
        # Bug fix: removed validip_que.get(validip) — Queue.get() takes a
        # `block` flag, not an item, so this discarded a second good proxy.
        repeat_count += 1
        if repeat_count < 4:
            print("%s表情详情页%s下载失败,正在进行第%d次重新下载!" % (self.name, url, repeat_count))
            self.getArcHtml(url, repeat_count)
        else:
            print("%s表情详情页%s下载失败" % (self.name, url))
            self.sqlInsertFailedUrl(url, "article")
def downloadImage(self, url, alt, repeat_count=0):
    """Download one bee-ji image via proxy into an hour-stamped folder.

    The extension is inferred from the Content-Type header; unsupported
    types are silently skipped.
    """
    start_time = time.time()
    validip = self.validip_que.get()
    proxy = {'http': validip}
    try:
        response = requests.get(url, proxies=proxy, timeout=15)
        if response.status_code == 200:
            self.validip_que.put(validip)
            dir_name = time.strftime('%Y%m%d%H', time.localtime(time.time()))
            dir_path = "C:/bee-ji/%s" % (dir_name)
            self.makeDir(dir_path, "列表目录")
            extension = response.headers.get("Content-Type").replace(
                "image/", "").replace("jpeg", "jpg")
            if extension in ["jpg", "gif", "png", "bmp", "webp"]:
                file_path = "{}/{}.{}".format(dir_path, alt, extension)
                with open(file_path, "wb") as f:
                    f.write(response.content)
                end_time = time.time()
                inter = end_time - start_time
                print("%s成功下载图片%s,共花费%f秒" % (self.name, file_path, inter))
        else:
            raise StatusCodeError("%s状态码错误,返回状态码为%d" % (url, response.status_code))
    except BaseException as e:
        print("%s图片下载异常,错误信息为%s" % (self.name, str(e)))
        # Bug fix: removed the stray validip_que.get(validip) call that
        # discarded an extra good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("%s图片%s下载失败,正在进行第%d次重新下载!" % (self.name, url, repeat_count))
            # Bug fix: the original retried with downloadImage(url, repeat_count),
            # passing the retry counter as `alt` and losing the file name.
            self.downloadImage(url, alt, repeat_count)
        else:
            print("%s图片%s下载失败,添加至数据库" % (self.name, url))
            self.sqlInsertFailedUrl(url, "image")
def getArcHtml(self, arc_url, list_dir, repeat_count=0):
    """Fetch a youmeitu article page, download its images, follow "下一页".

    404 responses are silently skipped; other bad statuses trigger up to
    three retries before the URL is recorded as a failed article.
    """
    start_time = time.time()
    validip = self.validip_que.get()
    # NOTE(review): proxy is built but not passed to requests.get below —
    # confirm whether proxying was disabled on purpose.
    proxy = {'http': validip}
    try:
        response = requests.get(arc_url, timeout=8)
        if response.status_code == 200:
            self.validip_que.put(validip)
            response.encoding = "utf-8"
            html = etree.HTML(response.text)
            img_list = html.xpath("//p[@align='center'][1]/img")
            title = html.xpath("//title/text()")[0]
            invalid_str_arr = [
                "/", ".", "\\", "\r\n", "。", "*", '"', "<", ">", "|",
                "?", "?", ":"
            ]
            for invalid_str in invalid_str_arr:
                title = title.replace(invalid_str, "")
            title = title.replace("_优美图", "")
            index = 0
            for img in img_list:
                index += 1
                src = img.get("src")
                file_extension_name = ""
                # Truncate anything after the real extension.
                # Bug fix: the original used independent "if" statements with
                # ".jpg" tested first, so ".jpeg" URLs matched ".jpg" and were
                # truncated to a wrong URL; check ".jpeg" first and use elif.
                if ".jpeg" in src:
                    src = src.split(".jpeg")[0] + ".jpeg"
                    file_extension_name = "jpg"
                elif ".jpg" in src:
                    src = src.split(".jpg")[0] + ".jpg"
                    file_extension_name = "jpg"
                elif ".png" in src:
                    src = src.split(".png")[0] + ".png"
                    file_extension_name = "png"
                elif ".gif" in src:
                    src = src.split(".gif")[0] + ".gif"
                    file_extension_name = "gif"
                elif ".bmp" in src:
                    src = src.split(".bmp")[0] + ".bmp"
                    file_extension_name = "bmp"
                if "http" in src and "|||" not in src:
                    # Bug fix: include the image index — the old title-only
                    # name made every image in a page overwrite the previous.
                    file_name = title + "-" + str(index) + "." + file_extension_name
                    file_path = list_dir + "/" + file_name
                    self.downloadImage(src, file_path)
            # Follow the "next page" link of a paginated article.
            if len(
                    html.xpath(
                        "//div[@class='NewPages']/ul/li/a[text()='下一页']")
            ) != 0:
                arc_file = arc_url.split("/")[-1]
                next_link = arc_url.replace(arc_file, "") + \
                    html.xpath("//div[@class='NewPages']/ul/li/a[text()='下一页']")[0].get("href")
                self.getArcHtml(next_link, list_dir, repeat_count=0)
        else:
            if response.status_code == 404:
                pass  # dead article; nothing to download
            else:
                raise StatusCodeError("状态码错误")
    except BaseException as e:
        print("{}详情页下载异常,错误信息为{},错误行号为{}".format(
            self.threadName, str(e), e.__traceback__.tb_lineno))
        # Bug fix: removed validip_que.get(validip) — it blocked and threw
        # away a second good proxy on every failure.
        repeat_count += 1
        if repeat_count < 4:
            print("{}{}\r\n图片详情页开始第{}次重新下载!".format(
                self.threadName, arc_url, repeat_count))
            self.getArcHtml(arc_url, list_dir, repeat_count)
        else:
            print("{}{}\r\n图片详情页下载失败".format(self.threadName, arc_url))
            self.sqlInsert(arc_url, "failed_article")