def check_useful_task(self):
    """Pull proxy records off the temp buffer, re-check them over http/https,
    and either promote them back into the pools or retire them."""
    check_buffer = []
    count = 0
    while True:
        count += 1
        data = self.db.pop_temp_buffer()
        if data:
            # Records are stored as their repr; parse back into a Python object.
            # (eval assumes the buffer only ever holds data we wrote ourselves.)
            eval_data = eval(data)
            if eval_data[4] < self.MAX_ERROR_NUM and eval_data[5] < self.MAX_ERROR_NUM:
                # Both protocols still viable: alternate between them.
                # (The original tested `count // 2`, which is truthy for every
                # count >= 2; `count % 2` gives the intended alternation.)
                if count % 2:
                    msgs = self.start_one_check(check_buffer, eval_data, 'http')
                else:
                    msgs = self.start_one_check(check_buffer, eval_data, 'https')
            else:
                if eval_data[4] < self.MAX_ERROR_NUM:
                    msgs = self.start_one_check(check_buffer, eval_data, 'http')
                elif eval_data[5] < self.MAX_ERROR_NUM:
                    msgs = self.start_one_check(check_buffer, eval_data, 'https')
                else:
                    # Both error counters exhausted: drop the record.
                    continue
            downloader(msgs)
        else:
            # Nothing new to schedule: harvest finished checks. Iterate over a
            # copy so removing items doesn't skip their neighbours.
            for item in check_buffer[:]:
                if not item[1].empty():
                    res = item[1].get()
                    if res:
                        res.encoding = res.apparent_encoding
                        if item[2] == 'http':
                            # A genuine response carries the expected page title.
                            if re.findall("<title>腾讯首页</title>", res.text):
                                item[0][4] = self.MAX_ERROR_NUM
                                self.db.adds_http_pool(((*item[0][:4], 0, item[0][6]),))
                            else:
                                item[0][4] += 1
                        elif item[2] == 'https':
                            if re.findall("<title>百度一下,你就知道</title>", res.text):
                                item[0][5] = self.MAX_ERROR_NUM
                                self.db.adds_https_pool(((*item[0][:4], 0, item[0][6]),))
                            else:
                                item[0][5] += 1
                    else:
                        # Request failed outright: bump that protocol's error count.
                        if item[2] == 'http':
                            item[0][4] += 1
                        elif item[2] == 'https':
                            item[0][5] += 1
                    if item[0][4] < self.MAX_ERROR_NUM or item[0][5] < self.MAX_ERROR_NUM:
                        # Still has attempts left on some protocol: requeue it.
                        self.db.adds_temp_buffer((item[0],))
                    check_buffer.remove(item)
        time.sleep(0.01)
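The indexing above implies a seven-field proxy record. A hedged sketch of that layout follows; only the roles of indices 4, 5 and 6 are pinned down by the code, the names for fields 0-3 are guesses (at runtime the record travels as a plain list so the error counters can be mutated in place):

from collections import namedtuple

# Hypothetical field names -- only http_errors (index 4), https_errors
# (index 5) and the value carried over at index 6 are fixed by the code above.
ProxyRecord = namedtuple(
    'ProxyRecord',
    ['ip', 'port', 'anonymity', 'location',  # indices 0-3: guessed names
     'http_errors',                          # index 4: http failure counter
     'https_errors',                         # index 5: https failure counter
     'score'])                               # index 6: kept when re-pooled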
def engine(initUrls):
    total_pages = manager(initUrls)
    downloader()
    success_page = int(redis.get('success'))
    success_rate = success_page / total_pages
    print('success_page:', success_page)
    print('total_pages:', total_pages)
    print('success_rate: %0.2f%%' % (success_rate * 100))
def start(self):
    self.wait_proxy_ready()
    # Materialise the generator first; printing list(urls) directly would
    # exhaust it before the request messages were built.
    urls = list(self.get_page_urls())
    print(urls)
    # Copy the headers so the shared self.headers dict isn't mutated.
    _headers = dict(self.headers)
    _headers['User-Agent'] = self.ua.random
    msgs = [(requests.get, self.queue, url,
             {'headers': _headers, 'timeout': 10,
              'proxies': {'http': self.get_http_proxy()}}) for url in urls]
    download.downloader(msgs)
    print('xicidaili ---> start')
    for _ in range(len(msgs)):
        res = self.queue.get()
        if res:
            res.encoding = res.apparent_encoding
            for proxy in self.handle_html(res.text):
                yield proxy
def start(self):
    urls = self.get_page_urls()
    # Copy the headers so the shared self.headers dict isn't mutated.
    _headers = dict(self.headers)
    _headers['User-Agent'] = self.ua.random
    msgs = [(requests.get, self.queue, url,
             {'headers': _headers, 'timeout': 10}) for url in urls]
    download.downloader(msgs)
    for _ in range(len(msgs)):
        res = self.queue.get()
        if res:
            res.encoding = res.apparent_encoding
            for proxy in self.handle_html(res.text):
                yield proxy
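The msgs tuples above follow a (request_func, result_queue, url, kwargs) convention, and callers expect one queue entry per message with None signalling failure. A minimal sketch of a downloader that could consume them, assuming thread-per-request dispatch (the real download.downloader is not shown here and may differ):

import threading

def downloader(msgs):
    # Fire each request in its own thread and push the response (or None on
    # failure) onto the per-message result queue, matching the `if res:`
    # checks the callers above perform.
    def worker(func, queue, url, kwargs):
        try:
            queue.put(func(url, **kwargs))
        except Exception:
            queue.put(None)

    threads = [threading.Thread(target=worker, args=msg) for msg in msgs]
    for t in threads:
        t.start()
    for t in threads:
        t.join()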
def download():
    url = request.form.get('download_link')
    # Validate the form field before kicking off the download; the original
    # only checked it afterwards, so a missing link still triggered a fetch.
    if not url:
        return render_template('failure.html')
    file_location = downloader(url, DOWNLOADS_FOLDER)
    return file_location
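For reference, a request like the following would exercise this handler, assuming it is bound to a /download route on a local dev server (both the route and the host are assumptions; the route registration is not shown here):

import requests

resp = requests.post('http://localhost:5000/download',
                     data={'download_link': 'http://example.com/file.zip'})
print(resp.text)  # the saved file location on success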
def proxy_requests(self, url):
    """Fetch url, retrying with a fresh proxy until a response comes back."""
    while True:
        _headers = dict(self.headers)  # copy, so self.headers isn't mutated
        _headers['User-Agent'] = self.ua.random
        proxy = self.get_http_proxy()
        if proxy:
            msgs = [(requests.get, self.queue, url,
                     {'headers': _headers, 'timeout': 10,
                      'proxies': {'http': proxy}})]
        else:
            # No proxy available: fall back to a direct request.
            msgs = [(requests.get, self.queue, url,
                     {'headers': _headers, 'timeout': 10})]
        download.downloader(msgs)
        res = self.queue.get()
        if not res:
            print('get_page_urls error')
            self.send_http_proxy_error(proxy)
        else:
            return res
def run_by_iid(session, Iid):
    # session : requests.Session() : None
    # Iid : integer : illust ID
    '''get download url'''
    url_and_title = url.get_base_url(session, Iid, True)
    # An error lookup returns a 'msg' key instead of the url/title pair; the
    # original expressed this with a bare try/except around the key access.
    if 'msg' in url_and_title:
        print(url_and_title['msg'])
        return
    base_url = url_and_title['base_url']
    title = url_and_title['title']
    illuster_name = url.get_illuster_name(title)
    illust_title = url.get_illust_title(title)
    # Walk the pages _p0 --> _plast until neither format resolves.
    counter = 0
    jpgFlag = True
    pngFlag = True
    while jpgFlag or pngFlag:
        # jpg code block
        _data = url.get_whole_url(session, base_url, str(counter), '.jpg')
        if _data['status_code'] != 200:
            jpgFlag = False
        else:
            jpgFlag = True
            file_name = download.downloader(_data['img_url'], illuster_name,
                                            illust_title + '_p' + str(counter),
                                            str(Iid) + '.jpg')
            print('===>' + file_name + ' download completed!')
            print()
        # png code block
        _data = url.get_whole_url(session, base_url, str(counter), '.png')
        if _data['status_code'] != 200:
            pngFlag = False
        else:
            pngFlag = True
            file_name = download.downloader(_data['img_url'], illuster_name,
                                            illust_title + '_p' + str(counter),
                                            str(Iid) + '.png')
            print('===>' + illuster_name + ':' + file_name + ' download completed!')
            print()
        counter += 1
def get_page_urls(self):
    '''Fetch the URLs of the pages that need crawling.'''
    try:
        _headers = dict(self.headers)  # copy, so self.headers isn't mutated
        _headers['User-Agent'] = self.ua.random
        msgs = [(requests.get, self.queue, self.start_urls,
                 {'headers': _headers})]
        download.downloader(msgs)
        res = self.queue.get()
        if res:
            res.encoding = res.apparent_encoding
            htmlEmt = etree.HTML(res.text)
            tr_list = htmlEmt.xpath("//ul[@class='textlarge22']/li")[1:]
            return (self.start_urls + i.xpath("a/@href")[0] for i in tr_list)
        else:
            return []
    except Exception as e:
        print('get_page_urls:', e)
        return []  # fall back to an empty batch rather than returning None
def downloading(continue_toggle=True):
    ### Part 1: download images ###
    username = util.USERNAME
    password = util.PASSWORD
    OUTPUT_DIR = util.OUTPUT_DIR
    # initialize downloader (reusing the locals bound above)
    downloader = download.downloader(username=username,
                                     password=password,
                                     OUTPUT_DIR=OUTPUT_DIR)
    # download set of scenes:
    landsat_dir, modis_dir = downloader.download_all(
        continue_toggle=continue_toggle)
    return landsat_dir, modis_dir
def gen_dom(url):
    return parse(downloader(url).download())
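A one-line usage sketch; which parse and downloader are in scope isn't shown above, so the behaviour assumed in the comment is a guess:

# Assumes downloader(url).download() yields HTML text and parse() builds a
# queryable DOM from it -- neither implementation is shown here.
dom = gen_dom('http://example.com')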
def __init__(self, prep='http://cms.cern.ch/iCMS/prep/'):
    self.downloader = downloader(prep)
    self.request = None
print('login successfully!')
# get Illust ID
while True:
    Iid = input("Illust ID:")
    base_url = url.get_base_url(session, Iid)
    counter = 0
    jpgFlag = True
    pngFlag = True
    while jpgFlag or pngFlag:
        _data = url.get_whole_url(session, base_url, str(counter), '.jpg')
        if _data['status_code'] != 200:
            jpgFlag = False
        else:
            jpgFlag = True
            file_name = download.downloader(_data['img_url'], str(counter), '.jpg')
            print(file_name + ' download completed!')
        _data = url.get_whole_url(session, base_url, str(counter), '.png')
        if _data['status_code'] != 200:
            pngFlag = False
        else:
            pngFlag = True
            file_name = download.downloader(_data['img_url'], str(counter), '.png')
            print(file_name + ' download completed!')
        counter += 1
    Continue = input('Do you want to download the next illust? (Y/N)')
    # The original `Continue is 'N' or 'n'` was always truthy ('n' is a truthy
    # string) and compared strings with `is`; test membership instead so
    # answering Y actually continues the loop.
    if Continue in ('N', 'n'):
        break
def main():
    """Runs the program and handles command line options."""
    log = logging.getLogger("main")
    p = optparse.OptionParser(version="%prog " + version)
    p.add_option('-v', '--verbose', action='count',
                 help='Change global log level, increasing log output.', metavar='LOGFILE')
    p.add_option('-q', '--quiet', action='count',
                 help='Change global log level, decreasing log output.', metavar='LOGFILE')
    p.add_option('--branch', action='store',
                 help='Set branch to commit rpms to.', metavar='OBSREPOARCH_BRANCH')
    p.add_option('--repo-uri', action='store',
                 help='Base uri to download.', metavar='OBSREPOARCH_URI')
    p.add_option('--git-master-repo', action='store',
                 help='Local shared git pack object store path.', metavar='OBSREPOARCH_MASTERREPO')
    p.add_option('--git-origin', action='store',
                 help='Upstream git repo.', metavar='OBSREPOARCH_ORIGIN')
    p.add_option('--dir-work', action='store',
                 help='Working directory for checkout of repo.', metavar='OBSREPOARCH_WORKINGDIR')
    p.add_option('--log-config', action='store',
                 help='Logfile configuration file (overrides command line).', metavar='LOGFILE')
    options, arguments = p.parse_args()

    # Defaults, overridable first by environment variables, then by options.
    logFile = None
    workingdir = 'workingdir'
    origin = ''
    shared_clone = ''
    branch = "ibs_product_1.0"
    uri = "http://download.suse.de/ibs/Devel:/Storage:/1.0:/Staging/openSUSE_Factory/"
    if 'OBSREPOARCH_LOG_CONF' in os.environ:
        logFile = os.environ['OBSREPOARCH_LOG_CONF']
    if 'OBSREPOARCH_ORIGIN' in os.environ:
        origin = os.environ['OBSREPOARCH_ORIGIN']
    if 'OBSREPOARCH_WORKINGDIR' in os.environ:
        workingdir = os.environ['OBSREPOARCH_WORKINGDIR']
    if 'OBSREPOARCH_BRANCH' in os.environ:
        branch = os.environ['OBSREPOARCH_BRANCH']
    if 'OBSREPOARCH_URI' in os.environ:
        uri = os.environ['OBSREPOARCH_URI']
    if 'OBSREPOARCH_MASTERREPO' in os.environ:
        shared_clone = os.environ['OBSREPOARCH_MASTERREPO']

    # Map the -v/-q counters onto a logging level.
    LoggingLevel = logging.WARNING
    LoggingLevelCounter = 2
    if options.verbose:
        LoggingLevelCounter = LoggingLevelCounter - options.verbose
        if options.verbose == 1:
            LoggingLevel = logging.INFO
        if options.verbose == 2:
            LoggingLevel = logging.DEBUG
    if options.quiet:
        LoggingLevelCounter = LoggingLevelCounter + options.quiet
    if LoggingLevelCounter <= 0:
        LoggingLevel = logging.DEBUG
    if LoggingLevelCounter == 1:
        LoggingLevel = logging.INFO
    if LoggingLevelCounter == 2:
        LoggingLevel = logging.WARNING
    if LoggingLevelCounter == 3:
        LoggingLevel = logging.ERROR
    if LoggingLevelCounter == 4:
        LoggingLevel = logging.FATAL
    if LoggingLevelCounter >= 5:
        LoggingLevel = logging.CRITICAL
    if options.log_config:
        logFile = options.log_config
    if logFile is not None:
        # Check the resolved logFile; the original tested options.log_config,
        # which is None whenever the path came from OBSREPOARCH_LOG_CONF.
        if os.path.isfile(str(logFile)):
            logging.config.fileConfig(logFile)
        else:
            logging.basicConfig(level=LoggingLevel)
            log = logging.getLogger("main")
            log.error("Logfile configuration file '%s' was not found." % (logFile))
            sys.exit(1)
    else:
        logging.basicConfig(level=LoggingLevel)
        log = logging.getLogger("main")
    if options.branch:
        branch = options.branch
    if options.repo_uri:
        uri = options.repo_uri
    if options.dir_work:
        workingdir = options.dir_work
    if options.git_master_repo:
        shared_clone = options.git_master_repo
    if not options.git_origin:
        log.error("No git origin given, use --git-origin!")
        sys.exit(1)
    origin = options.git_origin

    downloader = download.downloader(workingdir=workingdir,
                                     origin=origin,
                                     shared_clone=shared_clone)
    downloader.work_dir_setup(branch=branch)
    downloader.update(uri=uri)
    return 0
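An example invocation using the flags defined above (the script filename and the origin URL are placeholders; --git-origin is the only required option, everything else falls back to the defaults in main):

python obsrepoarch.py -v \
    --git-origin git://example.org/obs-rpms.git \
    --branch ibs_product_1.0 \
    --dir-work workingdir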