def _save_oneimage(self, index, url, basepages, img_savepath, logpath):
    """Download one target image; worker threads call this method.

    Requests the default .png URL first; on HTTP 404 it retries the same
    URL with a .jpg extension. If both requests fail, the image is
    skipped silently (the retry decorator on the caller side re-attempts).

    :param index: image index into basepages
    :param url: one image url
    :param basepages: referer basic pages list
    :param img_savepath: image save directory
    :param logpath: log save path (currently unused, kept for interface)
    :return: none
    """
    img_datatype = 'png'
    image_name = dl.FROM_URL_GET_IMG_NAME(url)
    timeout = 30

    # install an opener carrying a referer header for this image's page
    headers = dl.build_original_headers(basepages[index])
    self.opener.addheaders = dl.dict2list(headers)
    urllib.request.install_opener(self.opener)  # update install opener

    response = None
    # first try default png format, if failed change to jpg
    try:
        response = self.opener.open(fullurl=url, timeout=timeout)
    except urllib.error.HTTPError as e:
        # http error 404: the artwork may be stored as jpg instead of png
        if e.code == dl.HTTP_REP_404_CODE:
            img_datatype = 'jpg'
            jpg_img_url = url[:-3] + img_datatype
            try:
                response = self.opener.open(fullurl=jpg_img_url,
                                            timeout=timeout)
            except urllib.error.HTTPError:
                response = None  # both attempts failed, skip this image

    # guard: response is None when every request raised
    if response is None or response.getcode() != dl.HTTP_REP_OK_CODE:
        return

    img_bindata = response.read()
    source_size = round(float(len(img_bindata) / 1024), 2)
    # multi-thread, no resource lock: accumulate via the class attribute
    WkvCwApi._datastream_pool += source_size
    # build the save path only now, after the real extension is known;
    # building it earlier saved jpg data under a .png name
    img_save_path = img_savepath + '/' + image_name + '.' + img_datatype
    with open(img_save_path, 'wb') as img:
        img.write(img_bindata)
def _save_oneimage(self, index, url, basepages, img_savepath, log_path):
    """Download one target image; worker threads call this method.

    Requests the .png URL first; on HTTP 404 retries with a .jpg
    extension. On any other HTTP error (usually the server throttling
    the crawler) it switches to a proxy server — fetched at most once
    per session — and retries the request.

    :param index: image index into basepages
    :param url: one image url
    :param basepages: referer basic pages list
    :param img_savepath: image save directory
    :param log_path: log save path
    :return: none
    """
    img_datatype = 'png'
    image_name = url[57:-4]  # name artwork_id + _px, fixed-layout url slice
    timeout = 30

    # use opener method with a referer header for this image's page
    headers = dataload.build_original_headers(basepages[index])
    self.opener.addheaders = dataload.dict_transto_list(headers)
    urllib.request.install_opener(self.opener)  # update install opener

    response = None
    # this request image step will delay much time
    try:
        response = self.opener.open(fullurl=url, timeout=timeout)
    except urllib.error.HTTPError as e:
        # http error 404, change image type
        if e.code == dataload.HTTP_NOTFOUND_CODE_404:
            img_datatype = 'jpg'
            jpg_img_url = url[0:-3] + img_datatype  # replace url suffix
            # after changing image type word, try again
            try:
                response = self.opener.open(fullurl=jpg_img_url,
                                            timeout=timeout)
            except urllib.error.HTTPError as retry_err:
                # not 404: request forbidden by server, go through a proxy
                if retry_err.code != dataload.HTTP_NOTFOUND_CODE_404:
                    self._enable_proxy_opener(log_path)
                    response = self.opener.open(fullurl=jpg_img_url,
                                                timeout=timeout)
                else:
                    response = None  # jpg also missing, skip this image
        else:
            # non-404 failure on first request: retry through a proxy
            self._enable_proxy_opener(log_path)
            response = self.opener.open(fullurl=url, timeout=timeout)

    # guard: response stays None when every request raised
    if response is None or response.getcode() != dataload.HTTP_OK_CODE_200:
        return

    img_bindata = response.read()
    # calcus target source data stream size
    # multi-thread, no resource lock: accumulate via the class attribute
    source_size = round(float(len(img_bindata) / 1024), 2)
    PixivAPILib._datastream_pool += source_size
    # build the save path only now, after the real extension is known;
    # building it earlier saved jpg data under a .png name
    img_save_path = (img_savepath + dataload.fs_operation[1]
                     + image_name + '.' + img_datatype)
    with open(img_save_path, 'wb') as img:
        img.write(img_bindata)

def _enable_proxy_opener(self, log_path):
    """Rebuild self.opener behind a proxy server; fetch the proxy once.

    Once the proxy has been installed (flag already set) the current
    opener is reused — the original code called build_opener(None)
    here, which crashes because None is not a handler.

    :param log_path: log save path
    :return: none
    """
    log_context = "Add proxy server in request"
    self.logprowork(log_path, log_context)
    # preload a proxy handler, just run once per session
    if self.proxy_hasrun_flag == False:
        self.proxy_hasrun_flag = True
        proxy = self._getproxyserver(log_path)
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # with proxy, rebuild the opener used for all later requests
        self.opener = urllib.request.build_opener(proxy_handler)
def _save_oneimage(self, index, url, basepages, img_savepath, log_path):
    """Download one target image; worker processes call this method.

    Requests the .png URL first; on HTTP 404 retries with a .jpg
    extension. On any other HTTP error (server throttling, timeout
    response) it switches to a proxy server — fetched at most once per
    run — and retries the request. Logs a summary line on success.

    :param index: image index
    :param url: one image url
    :param basepages: referer basic pages list
    :param img_savepath: image save path
    :param log_path: log save path
    :return: none
    """
    timeout = 30
    img_datatype = 'png'
    image_name = url[57:-4]  # name artwork_id + _px, fixed-layout url slice

    # setting new headers and refresh the installed opener
    headers = dataload.build_original_headers(basepages[index])
    self.opener.addheaders = dataload.dict_transto_list(headers)
    urllib.request.install_opener(self.opener)  # update install opener

    response = None
    # this request image step will delay much time
    try:
        response = self.opener.open(fullurl=url, timeout=timeout)
    except urllib.error.HTTPError as e:
        # http error 404, change image type
        if e.code == dataload.HTTP_NOTFOUND_CODE_404:
            img_datatype = 'jpg'  # change data type
            jpg_img_url = url[0:-3] + img_datatype
            try:
                response = self.opener.open(fullurl=jpg_img_url,
                                            timeout=timeout)
            except urllib.error.HTTPError as retry_err:
                # not 404: server forbade the request, go through a proxy
                if retry_err.code != dataload.HTTP_NOTFOUND_CODE_404:
                    self._switch_to_proxy_opener(log_path)
                    response = self.opener.open(fullurl=jpg_img_url,
                                                timeout=timeout)
                else:
                    response = None  # jpg also missing, skip this image
        else:
            # non-404 failure (e.g. timeout): retry through a proxy
            self._switch_to_proxy_opener(log_path)
            response = self.opener.open(fullurl=url, timeout=timeout)

    # save image bin data to files; response stays None when all failed
    if response is None or response.getcode() != dataload.HTTP_OK_CODE_200:
        return

    img_bindata = response.read()
    # calcus download source whole size; class attr shared by workers
    source_size = float(len(img_bindata) / 1024)
    Matrix._datastream_pool += source_size
    img_save_path = (img_savepath + dataload.fs_operation[1]
                     + image_name + '.' + img_datatype)
    with open(img_save_path, 'wb') as img:
        img.write(img_bindata)
    log_context = 'target no.%d image download finished,' \
        ' image size: %dKB' % (index + 1, source_size)
    self.logprowork(log_path, log_context)

def _switch_to_proxy_opener(self, log_path):
    """Rebuild self.opener behind a proxy server; fetch the proxy once.

    Once the proxy has been installed (class flag already set) the
    current opener is reused — the original code passed a None
    proxy_handler to build_opener here, which crashes because None is
    not a handler.

    :param log_path: log save path
    :return: none
    """
    log_context = "change proxy server"
    self.logprowork(log_path, log_context)
    # preload a proxy handler, just run once per program run
    if Matrix._proxy_hasrun_flag is False:
        Matrix._proxy_hasrun_flag = True
        proxy = self._getproxyserver(log_path)
        # with proxy, rebuild the opener used for all later requests
        self.opener = urllib.request.build_opener(
            urllib.request.ProxyHandler(proxy))