Example #1
0
    def _save_oneimage(self, index, url, basepages, img_savepath, logpath):
        """Download one target image and write it to disk.

        The url is requested as given first (saved with a ``png`` suffix).
        When the server answers HTTP 404 the last three characters of the url
        are replaced with ``jpg`` and the request is repeated once.  Any other
        HTTP error, or a failure of the jpg request, propagates to the caller
        instead of being swallowed.

        NOTE(review): the original docstring mentions a retry decorator and
        multi-thread callers; neither is visible in this block - confirm.

        :param index:           image index, selects the referer in basepages
        :param url:             one image url
        :param basepages:       referer basic pages list
        :param img_savepath:    image save path
        :param logpath:         log save path (not used by this method)
        :return:                none
        :raises urllib.error.HTTPError: when the image cannot be fetched
        """
        timeout = 30
        img_datatype = 'png'
        image_name = dl.FROM_URL_GET_IMG_NAME(url)

        # use opener method, the referer header depends on the image index
        headers = dl.build_original_headers(basepages[index])
        self.opener.addheaders = dl.dict2list(headers)
        urllib.request.install_opener(self.opener)  # update install opener

        # first try default png format, if 404 change to jpg
        try:
            response = self.opener.open(fullurl=url, timeout=timeout)
        except urllib.error.HTTPError as e:
            if e.code != dl.HTTP_REP_404_CODE:
                # no fallback exists for other codes; surface the real error
                # rather than failing later on a missing response object
                raise
            img_datatype = 'jpg'
            jpg_img_url = url[:-3] + img_datatype
            # a failure here propagates: there is no further fallback
            response = self.opener.open(fullurl=jpg_img_url, timeout=timeout)

        # the context manager closes the connection once the data is read
        with response:
            if response.getcode() != dl.HTTP_REP_OK_CODE:
                return
            img_bindata = response.read()

        # downloaded size in KB, accumulated on the class-level counter
        source_size = round(len(img_bindata) / 1024, 2)
        # multi-thread, no resource lock, it must use class name to call
        WkvCwApi._datastream_pool += source_size

        img_save_path = img_savepath + '/' + image_name + '.' + img_datatype
        with open(img_save_path, 'wb') as img:
            img.write(img_bindata)
    def _save_oneimage(self, index, url, basepages, img_savepath, log_path):
        """Download one target image and write it to disk.

        Request order:

        1. the url as given (saved with a ``png`` suffix);
        2. on HTTP 404, the same url with its last three characters replaced
           by ``jpg``;
        3. on any other HTTP error (from step 1 or step 2), the same request
           again through a proxy opener.

        The proxy opener is built once (guarded by ``self.proxy_hasrun_flag``)
        and stored in ``self.opener``; later fallbacks reuse that opener.  A
        404 on the jpg url, or an error from the proxied request, propagates
        to the caller.

        NOTE(review): the original docstring mentions a retry decorator and
        multi-thread callers; neither is visible in this block - confirm.

        :param index:           image index, selects the referer in basepages
        :param url:             one image url
        :param basepages:       referer basic pages list
        :param img_savepath:    image save path
        :param log_path:        log save path
        :return:                none
        :raises urllib.error.HTTPError: when the image cannot be fetched
        """
        # setting image save info
        timeout = 30
        img_datatype = 'png'
        # name artwork_id + _px; assumes a fixed 57-character url prefix and
        # a 4-character ".ext" suffix - TODO confirm against the url source
        image_name = url[57:-4]

        # use opener method, the referer header depends on the image index
        headers = dataload.build_original_headers(basepages[index])
        list_headers = dataload.dict_transto_list(headers)
        self.opener.addheaders = list_headers
        urllib.request.install_opener(self.opener)  # update install opener

        def open_with_proxy(target_url):
            """Log the proxy fallback, then request target_url again."""
            self.logprowork(log_path, "Add proxy server in request")
            # preload a proxy handler, just run once.  build_opener() rejects
            # None, so the opener is only rebuilt when a handler is created.
            if not self.proxy_hasrun_flag:
                self.proxy_hasrun_flag = True
                proxy = self._getproxyserver(log_path)
                self.opener = urllib.request.build_opener(
                    urllib.request.ProxyHandler(proxy))
                # a freshly built opener only carries the default headers
                self.opener.addheaders = list_headers
            return self.opener.open(fullurl=target_url, timeout=timeout)

        # this request image step will delay much time
        try:
            response = self.opener.open(fullurl=url, timeout=timeout)
        except urllib.error.HTTPError as first_err:
            if first_err.code == dataload.HTTP_NOTFOUND_CODE_404:
                # http error 404, change image type and try again
                img_datatype = 'jpg'
                jpg_img_url = url[0:-3] + img_datatype
                try:
                    response = self.opener.open(fullurl=jpg_img_url,
                                                timeout=timeout)
                except urllib.error.HTTPError as second_err:
                    if second_err.code == dataload.HTTP_NOTFOUND_CODE_404:
                        # neither png nor jpg exists; report the real error
                        # instead of failing on a missing response object
                        raise
                    # not 404 change proxy, cause request server forbidden
                    response = open_with_proxy(jpg_img_url)
            else:
                # first request refused for another reason, retry via proxy
                response = open_with_proxy(url)

        # the context manager closes the connection once the data is read
        with response:
            if response.getcode() != dataload.HTTP_OK_CODE_200:
                return
            img_bindata = response.read()

        # calcus target source data stream size (KB)
        # multi-thread, no resource lock, it must use class name to call
        source_size = round(len(img_bindata) / 1024, 2)
        PixivAPILib._datastream_pool += source_size

        # save image bin data to files
        img_save_path = (img_savepath + dataload.fs_operation[1] + image_name +
                         '.' + img_datatype)
        with open(img_save_path, 'wb') as img:
            img.write(img_bindata)
Example #3
0
    def _save_oneimage(self, index, url, basepages, img_savepath, log_path):
        """Download one target image, write it to disk and log the result.

        Request order:

        1. the url as given (saved with a ``png`` suffix);
        2. on HTTP 404, the same url with its last three characters replaced
           by ``jpg``;
        3. on any other HTTP error (from step 1 or step 2), the same request
           again through a proxy opener.

        The proxy opener is built once (guarded by the class-level
        ``Matrix._proxy_hasrun_flag``) and stored in ``self.opener``; later
        fallbacks reuse that opener.  A 404 on the jpg url, or an error from
        the proxied request, propagates to the caller.

        NOTE(review): the original docstring mentions a retry decorator and
        multi-process callers; neither is visible in this block - confirm.

        :param index:           image index, selects the referer in basepages
        :param url:             one image url
        :param basepages:       referer basic pages list
        :param img_savepath:    image save path
        :param log_path:        log save path
        :return:                none
        :raises urllib.error.HTTPError: when the image cannot be fetched
        """
        timeout = 30
        img_datatype = 'png'
        # name artwork_id + _px; assumes a fixed 57-character url prefix and
        # a 4-character ".ext" suffix - TODO confirm against the url source
        image_name = url[57:-4]

        # setting new headers, the referer depends on the image index
        headers = dataload.build_original_headers(basepages[index])
        list_headers = dataload.dict_transto_list(headers)
        self.opener.addheaders = list_headers
        # update install opener
        urllib.request.install_opener(self.opener)

        def open_with_proxy(target_url):
            """Log the proxy fallback, then request target_url again."""
            self.logprowork(log_path, "change proxy server")
            # preload a proxy handler, just run once.  build_opener() rejects
            # None, so the opener is only rebuilt when a handler is created.
            if not Matrix._proxy_hasrun_flag:
                Matrix._proxy_hasrun_flag = True
                proxy = self._getproxyserver(log_path)
                self.opener = urllib.request.build_opener(
                    urllib.request.ProxyHandler(proxy))
                # a freshly built opener only carries the default headers
                self.opener.addheaders = list_headers
            # re-request
            return self.opener.open(fullurl=target_url, timeout=timeout)

        # this request image step will delay much time
        try:
            response = self.opener.open(fullurl=url, timeout=timeout)
        except urllib.error.HTTPError as first_err:
            if first_err.code == dataload.HTTP_NOTFOUND_CODE_404:
                # http error 404, change image type and try again
                img_datatype = 'jpg'
                jpg_img_url = url[0:-3] + img_datatype
                try:
                    response = self.opener.open(fullurl=jpg_img_url,
                                                timeout=timeout)
                except urllib.error.HTTPError as second_err:
                    if second_err.code == dataload.HTTP_NOTFOUND_CODE_404:
                        # neither png nor jpg exists; report the real error
                        # instead of failing on a missing response object
                        raise
                    # not 404 change proxy, cause request server forbidden
                    response = open_with_proxy(jpg_img_url)
            else:
                # first request refused for another reason, retry via proxy
                response = open_with_proxy(url)

        # the context manager closes the connection once the data is read
        with response:
            if response.getcode() != dataload.HTTP_OK_CODE_200:
                return
            img_bindata = response.read()

        # calcus download source whole size (KB)
        source_size = len(img_bindata) / 1024
        Matrix._datastream_pool += source_size

        # save image bin data to files
        img_save_path = (img_savepath + dataload.fs_operation[1] + image_name +
                         '.' + img_datatype)
        with open(img_save_path, 'wb') as img:
            img.write(img_bindata)

        log_context = 'target no.%d image download finished,' \
                      ' image size: %dKB' % (index + 1, source_size)
        self.logprowork(log_path, log_context)