def get_art(self, art):
    """Download every image page for a single artist entry.

    Resolves the artist's image-URL template and page count, prepares a
    local directory named after the image folder, then downloads each
    page sequentially with polite per-status sleeps. Aborts the artist
    after more than 5 failed downloads.

    :param art: dict with at least ``url`` (artist page) and ``title``.
    """
    url_tpl, total = self.get_artist_info(art['url'])
    folder = gen_fd_from_image_url(url_tpl)

    # Work inside <base_dir>/<folder>; downloads are saved relative to CWD.
    target_dir = os.path.join(self.base_dir, folder)
    helper.mkdir_p(target_dir, True)
    os.chdir(target_dir)

    tasks = [
        {'title': art['title'], 'img_url': url_tpl.format(page + 1)}
        for page in range(total)
    ]

    failures = 0
    bar = tqdm(tasks, ascii=True, desc='%10s' % folder, leave=False)
    for task in bar:
        status = self.download_and_save(task)
        if status == 1:  # download failed — back off briefly
            failures += 1
            time.sleep(0.5)
        elif status == 3:  # fresh download succeeded — throttle politely
            time.sleep(1.5)
        else:
            # Local file already exists, so no real wait is needed.
            time.sleep(0.0001)
        if failures > 5:
            log.warn('skip({}) => ({})'.format(folder, url_tpl))
            break
def download_image(self, params):
    """Fetch a single image and persist it under ``base_dir``.

    Skips work when the target file already exists and is valid;
    otherwise crawls the URL and writes the bytes to
    ``<base_dir>/<folder>/<title>-<nn>.jpg``.

    :param params: dict with keys ``img_url`` and ``title``.
    :type params: dict
    :return: None
    :rtype: None
    """
    img_url = params.get('img_url')
    title = params.get('title')

    # Zero-padded sequence number taken from the URL's basename.
    seq = img_url.split('/')[-1].split('.')[0].zfill(2)
    folder = gen_fd_from_image_url(img_url)
    rel_path = '{}/{}.jpg'.format(folder, seq)
    full_path = '{}/{}/{}-{}.jpg'.format(self.base_dir, folder, title, seq)

    # Already downloaded and intact — nothing to do.
    if helper.is_file_ok(full_path):
        return

    if os.path.exists(full_path):
        # A broken/partial file from a previous run: retry it.
        log.debug('RETRY: ({})'.format(rel_path))
    else:
        helper.mkdir_p(full_path)

    payload = self.crawl(img_url)
    if not payload:
        return

    with open(full_path, 'wb') as fh:
        fh.write(payload)
def spawn(self, url, force_spawn=False):
    """use the url for creation of domain and fetch cookies

    - init cache dir by the url domain as ``<base>/domain``
    - save the cookies to file ``<base>/domain/cookie.txt``
    - init ``headers.get/post/json`` with response info
    - init ``site_dir/site_raw/site_media``

    :param url: page URL used to derive the site domain and home page
    :type url: str
    :param force_spawn: when True, re-fetch the home page even if a
        cached copy exists
    :type force_spawn: bool
    :return: True on success, False if the URL yields no domain or the
        home page request fails
    :rtype: bool
    """
    # NOTE(review): `domain` is unpacked but never used below —
    # `self.domain` is read instead; presumably set by
    # get_domain_home_from_url as a side effect. TODO confirm.
    _url, domain = self.get_domain_home_from_url(url)
    if not _url:
        return False
    # Lay out the per-site cache tree: <base>/<domain>/{raw,media}.
    self.cache['site_dir'] = os.path.join(self.cache['base'], self.domain)
    for k in ['raw', 'media']:
        self.cache['site_' + k] = os.path.join(self.cache['site_dir'], k)
        helper.mkdir_p(self.cache['site_' + k], True)
    ck_pth = os.path.join(self.cache['site_dir'], 'cookie.txt')
    helper.mkdir_p(ck_pth)
    name = os.path.join(self.cache['site_raw'], 'homepage')
    # not force spawn and file ok
    if not force_spawn and helper.is_file_ok(name):
        # zlog.debug('{} exist!'.format(name))
        # Cached home page present: restore cookies and skip the fetch.
        self.sess.cookies = self.load_cookies(ck_pth)
        return True
    else:
        zlog.debug('{} not exist!'.format(name))
    res = self.sess.get(url, headers=self.__header__)
    if res.status_code != 200:
        return False
    if res:
        # Persist the raw home page for later cache hits.
        helper.write_file(res.content, name)
    # self.load(url)
    # NOTE(review): this assigns the ENTIRE request-headers mapping to
    # every key of self.headers, discarding `v` — looks like it was
    # meant to merge/update per-key header dicts instead. Verify intent
    # before changing; downstream code may rely on current shape.
    for k, v in self.headers.items():
        self.headers[k] = res.request.headers
    self.dump_cookies(cookies=self.sess.cookies, save_to=ck_pth)
    return True
def down(lec, which_type):
    """Download one lecture asset (``mp3`` or ``doc``) to the catalog dir.

    Closure: reads ``self.catalog_info`` from the enclosing scope (note
    ``self`` is not a parameter here). Skips files that already exist
    and are valid; otherwise waits briefly (to throttle) and downloads
    via ``wget``.

    :param lec: lecture record dict with keys ``name`` and one of
        ``mp3``/``doc`` holding the asset URL
    :param which_type: asset kind, must be ``'mp3'`` or ``'doc'``
    """
    if which_type not in ['mp3', 'doc']:
        log.warn('Not Supported of ({})'.format(which_type))
        return
    save_name = os.path.join(self.catalog_info['base_pth'], which_type,
                             '{}.{}'.format(lec['name'], which_type))
    helper.mkdir_p(save_name)
    if helper.is_file_ok(save_name):
        # Blank line keeps the log readable next to wget's progress output.
        print()
        log.debug('SKIP {}'.format(save_name))
        return
    # Default wait of 2s; use a random wait when downloading mp3.
    rd = 2
    if which_type == 'mp3':
        # NOTE(review): stdlib `abc` has no `randint` — presumably an
        # aliased import (e.g. `random as abc`) outside this view; confirm.
        rd = abc.randint(3, 6)
    url = lec[which_type]
    log.debug('[WAIT] {}s for ({}:{})...'.format(rd, save_name, url))
    time.sleep(rd)
    wget.download(url, out=save_name)
def download_by_index(self, index):
    """Download the full image set for one gallery index.

    Fetches the gallery metadata, derives the image URL pattern and the
    local folder layout (with hard-coded special cases for a few known
    indexes), downloads each image with status-dependent sleeps, and on
    completion records the index in the global ``finished`` list
    persisted to ``/tmp/mz.done``.

    :param index: gallery index as a string; falsy values are ignored
    """
    global cache_index
    global finished
    # Indexes whose image URLs follow a different naming scheme.
    _e_list1 = ['33201', '34949', '35850', '45364', '47526']
    if not index:
        log.info('no index found...')
        return
    dat = self.get_page_by_index(index)
    if not dat:
        log.error('fail@none: {}'.format(index))
        return
    try:
        # Extension, folder name, auto-increment name, fixed image URL.
        _ext = dat['img_src'].split('.')[-1]
        # Offset that strips the trailing "<nn>.<ext>" (or "<nn>1.<ext>"
        # for the special-case list) from the sample image URL.
        _name_off = 3 + len(_ext)
        if index in _e_list1:
            _name_off = 4 + len(_ext)
        img = dat['img_src'][:-_name_off]
        if index == '54856':
            # One-off gallery: fixed folder name, no filename prefix.
            _name_pre = ''
            fd_img = 'a'
        elif index in _e_list1:
            fd_img, _ = get_fd_name(dat['img_src'])
            _name_pre = ''
        else:
            _name_pre = img.split('/')[-1]
            fd_img = img[-1]
        _path_local = os.path.join(dat['time'], fd_img)
        fd = os.path.join(self.base_dir, _path_local)
        helper.mkdir_p(fd, True)
        # Downloads are written relative to CWD.
        os.chdir(fd)
        if index in _e_list1:
            _img_fmt = '{}{}1.{}'
        else:
            _img_fmt = '{}{}.{}'
        params = [
            {
                'img_url': _img_fmt.format(img, str(x + 1).zfill(2), _ext),
                'title': '{}-{}{}.{}'.format(
                    dat['name'].decode(),
                    _name_pre,
                    str(x + 1).zfill(2),
                    _ext,
                )
            } for x in range(dat['total'])
        ]
        _fail_count = 0
        for para in tqdm(params, ascii=True, desc='%8s ✈ %10s' % (index, _path_local)):
            rs = self.download_and_save(para)
            if rs == self.save_status['fail']:
                _fail_count += 1
                time.sleep(0.5)
            elif rs == self.save_status['ok']:
                time.sleep(1.5)
            elif rs == self.save_status['skip']:
                # Local file already exists, so no real wait is needed.
                time.sleep(0.0001)
            # Give up on this gallery after more than 5 failures.
            if _fail_count > 5:
                log.warn('fail@5 img of this, skip({}) => ({})'.format(index, _path_local))
                break
        cache_index += 1
        finished.append(index)
        # Persist progress so interrupted runs can resume.
        helper.write_file(json.dumps(finished), '/tmp/mz.done')
        log.warn('Done:({}/{})'.format(cache_index, index))
    except TypeError as _:
        # Metadata came back with an unexpected shape (e.g. missing key
        # types); log and move on rather than abort the whole run.
        log.error('fail@type: {}'.format(index))
def _spawn(self):
    """Prepare the scraper: request headers plus the local save directory."""
    # Host must match the scraped domain; POST headers come from the preset.
    self.ac.headers['Host'] = self.ac.domain
    self.ac.headers['post'] = SonimeiHeaders.post
    # Ensure the destination directory for downloaded music exists.
    helper.mkdir_p(self.music_save_dir, is_dir=True)