def load(self, url, use_cache=True, show_log=False, **kwargs):
    """Fetch the ``raw info`` of the url: try the cache first; on a cache miss,
    fetch it from the Internet and write the result back to the cache.

    :param url: the url to fetch
    :type url: str
    :param use_cache: read from the local cache before hitting the network
    :type use_cache: bool
    :param show_log: emit debug logs about cache hits/misses
    :type show_log: bool
    :return: the ``raw info`` of the url
    :rtype: ``str``
    """
    _name = self.map_url_to_cache_id(url)
    raw = ''
    hit = False
    if use_cache:
        raw = self.load_from_cache(_name)
        hit = bool(raw)
    if not raw:
        if show_log:
            zlog.debug('from cache got nothing {}'.format(_name))
        raw = self.do_sess_get(url, **kwargs)
        if raw:
            helper.write_file(raw, _name)
    if show_log:
        zlog.debug('[{}:{:>8}] get {}'.format('Cache' if hit else 'Net', len(raw), url))
    return raw
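# Usage sketch (not part of the class): a minimal illustration of the
# cache-first contract of load(). ``SiteSession`` is a hypothetical name for
# the class these methods live on; swap in the real class name.
sess = SiteSession()
raw = sess.load('http://example.com/list/1.html', show_log=True)   # first call: network, then cached
raw = sess.load('http://example.com/list/1.html', show_log=True)   # second call: served from cache
raw = sess.load('http://example.com/list/1.html', use_cache=False)  # force a fresh fetch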
def load_post(self, url, data, headers=None, ret='json', use_cache=True, show_log=False):
    # the cache id of a POST is the url id plus the sorted key/value pairs of the payload
    name = self.map_url_to_cache_id(url)
    if data:
        for k in sorted(data.keys()):
            name += '{}{}'.format(k, data[k])
    raw = ''
    hit = False
    if use_cache:
        raw = self.load_from_cache(name)
        hit = bool(raw)
    if not raw:
        if show_log:
            zlog.debug('cache miss: ({})'.format(name))
        raw = self.do_sess_post(url, data, headers, ret)
        if raw:
            if ret == 'json':
                raw = json.dumps(raw)
            zlog.debug('write ({}) to {}'.format(len(raw), name))
            helper.write_file(raw, name)
    if show_log:
        zlog.debug('[cache {}:{:>8}] post {}'.format(
            'hit' if hit else 'miss', len(raw or ''), name))
    return raw
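# Usage sketch: load_post() derives its cache id from the url plus the sorted
# payload keys, so the same url with different data gets its own cache entry.
# ``SiteSession`` is the same hypothetical class name as above.
sess = SiteSession()
page1 = sess.load_post('http://example.com/api/list', data={'page': 1}, ret='json')
page2 = sess.load_post('http://example.com/api/list', data={'page': 2}, ret='json')  # cached separately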
def search_name(self, name):
    pth, raw = self.load_cache(SiteURL.search, data={'keywords': name}, use_str=True)
    if not raw or self.overwrite:
        # no cached result (or overwrite requested): drive a real browser to run the search
        raw = search_by_chrome(SiteURL.home, ('input.formhue', name))
        helper.write_file(raw, pth)
    parser = DygodParser(raw_data=raw)
    parser.do_parse()
    return parser.data['movies']
def do_get(self, url, save_to):
    # skip the download when the target file already exists and looks intact
    if helper.is_file_ok(save_to):
        return 200
    res = self.sess.get(url, headers=self.headers, cookies=self.cookies, timeout=30)
    if res.status_code == 404:
        zlog.error('[{}]: {}'.format(res.status_code, url))
        return 404
    helper.write_file(res.content, save_to)
    zlog.debug('[{}] saved {}'.format(res.status_code, url))
    return res.status_code
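# Usage sketch: do_get() is idempotent on the target path, an existing intact
# file short-circuits the download and a 404 is reported but nothing is written.
# ``SiteSession`` is a hypothetical class name.
sess = SiteSession()
code = sess.do_get('http://example.com/img/001.jpg', save_to='/tmp/001.jpg')
if code == 404:
    print('remote image is gone')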
def spawn(self, url, force_spawn=False):
    """Use the url to set up the per-domain cache and fetch cookies.

    - init the cache dir from the url domain as ``<base>/domain``
    - save the cookies to ``<base>/domain/cookie.txt``
    - init ``headers.get/post/json`` with the headers of the homepage request
    - init ``site_dir/site_raw/site_media``

    :param url: any url of the target site
    :type url: str
    :param force_spawn: re-fetch the homepage even if it is already cached
    :type force_spawn: bool
    :return: True on success, False otherwise
    :rtype: bool
    """
    _url, domain = self.get_domain_home_from_url(url)
    if not _url:
        return False
    self.cache['site_dir'] = os.path.join(self.cache['base'], self.domain)
    for k in ['raw', 'media']:
        self.cache['site_' + k] = os.path.join(self.cache['site_dir'], k)
        helper.mkdir_p(self.cache['site_' + k], True)
    ck_pth = os.path.join(self.cache['site_dir'], 'cookie.txt')
    helper.mkdir_p(ck_pth)
    name = os.path.join(self.cache['site_raw'], 'homepage')
    if not force_spawn and helper.is_file_ok(name):
        # homepage already cached: just restore the cookies
        self.sess.cookies = self.load_cookies(ck_pth)
        return True
    zlog.debug('{} not exist!'.format(name))
    res = self.sess.get(url, headers=self.__header__)
    if res.status_code != 200:
        return False
    helper.write_file(res.content, name)
    # seed each of the get/post/json header sets with a copy of the request headers actually sent
    for k in self.headers:
        self.headers[k] = dict(res.request.headers)
    self.dump_cookies(cookies=self.sess.cookies, save_to=ck_pth)
    return True
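# Usage sketch: typical bootstrap, spawn() prepares <base>/<domain>/{raw,media}
# and the cookie file, after which load()/do_get() reuse the session cookies and
# cache layout. ``SiteSession`` is a hypothetical class name.
sess = SiteSession()
if sess.spawn('http://example.com/'):
    raw = sess.load('http://example.com/list/1.html', show_log=True)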
def get_album_images(self, index=1):
    self.get_album_pages(index)
    gap = GirlAlbumPageParser(
        file_name='{}/album_page/{}.html'.format(self.girl_id, index))
    gap.do_parse()
    albums = gap.data.get('albums') or []
    # keep only the second-to-last path segment of each album url (the album id)
    albums = [x.split('/')[-2] for x in albums]
    zlog.debug('{} => {}/{}\n{}'.format(self.girl_id, index, len(albums), albums))
    avail_images = []
    for album in albums:
        avail_images += self.get_single_album_images(album)
    helper.write_file(
        '\n'.join(avail_images),
        self.base_dir / 'logs/{}-{}.txt'.format(self.girl_id, index))
def dump_failure_songs(self, song, action='add'):
    """Maintain the on-disk list of songs that failed to download."""
    dat = self.load_failure_songs() or []
    dat = [x for x in dat if x]
    if action == 'add':
        dat.append(song)
    elif action == 'del':
        if song in dat:
            dat.remove(song)
    elif action == 'clear':
        dat = []
    else:
        zlog.error('unsupported action: ({})'.format(action))
        return
    # de-duplicate before persisting
    dat = yaml.dump(list(set(dat)))
    file_pth = os.path.expanduser(self.failure_store)
    helper.write_file(dat, file_pth)
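# Usage sketch: failure bookkeeping around a download attempt. ``Downloader`` is
# a hypothetical name for the class that owns dump_failure_songs()/load_failure_songs().
dl = Downloader()
dl.dump_failure_songs('some-song-id', action='add')   # remember the failure
dl.dump_failure_songs('some-song-id', action='del')   # forget it once it succeeds
dl.dump_failure_songs('', action='clear')             # wipe the whole list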
def run(girl_id, page_num, generate_album_images, process, force, view, whom,
        delete_girl, album_id, auto_mode):
    gc = GirlCrawler(girl_id, header_file=str(SAVE_DIR / 'header.txt'))
    if view or whom is not None:
        gc.store.show(whom)
        return
    if delete_girl:
        gc.store.delete()
        return
    if auto_mode:
        girl_id = gc.store.get_minimum_girl_id()
        zlog.debug('start with girl-id: {}'.format(girl_id))
    if generate_album_images:
        gc.get_all_albums()
        return
    if album_id:
        # dump the image urls of a single album, then fall through to download them
        images = gc.get_single_album_images(album_id)
        helper.write_file(
            '\n'.join(images),
            gc.base_dir / 'logs/{}-{}.txt'.format(girl_id, album_id))
    girl_info = gc.store.girls.get(girl_id, {})
    if not girl_info.get('pages', 0):
        page_num = 0
    if album_id:
        page_num = album_id
    img_file = str(SAVE_DIR / 'logs/{}-{}.txt'.format(girl_id, page_num))
    e404_file = str(SAVE_DIR / 'logs/{}-{}-404.txt'.format(girl_id, page_num))
    e404 = []
    if not force and helper.is_file_ok(e404_file):
        e404 = [x for x in helper.read_file(e404_file, use_str=True).split('\n') if x]
    images = [x for x in helper.read_file(img_file, use_str=True).split('\n') if x]
    # build the todo list: drop urls already marked 404 or already downloaded
    todo = []
    for img in images:
        d = gc.gen_pth_by_url(img)
        if img in e404:
            continue
        if d:
            todo.append(str(img))
    zlog.debug('all/todo/404: {}/{}/{}'.format(len(images), len(todo), len(e404)))
    if not todo:
        zlog.info('all images of {} downloaded'.format(girl_id))
        gc.store.mark_done(page_num)
        return
    e404_, ok = gc.parallel_download(urls=todo, procs=process)
    if e404_:
        e404 += e404_
        helper.write_file('\n'.join(sorted(e404)), e404_file)
    elif ok == len(todo):
        zlog.info('all images of {} downloaded'.format(girl_id))
        gc.store.mark_done(page_num)
def sync(self):
    # persist the in-memory girls index to its backing json file
    helper.write_file(json.dumps(self.girls), self._file)