def load_post(self, url, data, headers=None, ret='json', use_cache=True, show_log=False):
    """POST with a file cache: the cache id is the url plus the sorted form fields."""
    name = self.map_url_to_cache_id(url)
    if data:
        for k in sorted(data.keys()):
            name += '{}{}'.format(k, data[k])
    raw = ''
    hit = False
    if use_cache:
        raw = self.load_from_cache(name)
        # only count it as a hit if the cache actually had content
        hit = bool(raw)
    if not raw:
        if show_log:
            zlog.debug('cache miss: ({})'.format(name))
        raw = self.do_sess_post(url, data, headers, ret)
        if raw:
            if ret == 'json':
                raw = json.dumps(raw)
            zlog.debug('write ({}) to {}'.format(len(raw), name))
            helper.write_file(raw, name)
    if show_log:
        zlog.debug('[cache {}:{:>8}] post {}'.format(
            'hit' if hit else 'miss', len(raw or ''), name))
    return raw

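# A minimal standalone sketch of the cache-key scheme above: the key is the url
# plus every sorted form field, so the same POST body always maps to the same
# cache entry. `hashlib` is used here only to keep the name short -- the original
# concatenates the raw values; this helper is illustrative, not from the source.
import hashlib

def post_cache_key(url, data=None):
    name = url
    for k in sorted(data or {}):
        name += '{}{}'.format(k, data[k])
    return hashlib.md5(name.encode()).hexdigest()

# post_cache_key('http://example.com/api', {'b': 2, 'a': 1})
# == post_cache_key('http://example.com/api', {'a': 1, 'b': 2})
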
def fetch_one(self, depth=1):
    r = self.bs4markup(self.sess.get(M['one']).text)
    script_raw = r.find('div', role='main').find('script')
    if not script_raw:
        return
    # the ajax token sits inside an inline <script>, quoted with single quotes
    token = ''
    for line in script_raw.text.split('\n'):
        if line.find('token') != -1:
            token = line.split('\'')[1]
            break
    log.debug('ajaxlist token: {}'.format(token))
    all_data = []
    while depth:
        ajax_url = '{}{}?_token={}'.format(M['ajaxlist'], self.current_id, token)
        log.debug('fetch {}'.format(ajax_url))
        cnt = self.sess.get(ajax_url)
        dat = cnt.json()
        if not dat['data']:
            # nothing returned: stop before indexing an empty page
            break
        all_data += dat['data']
        self.current_id = all_data[-1]['id']
        time.sleep(1)
        depth -= 1
        # a short page means the server has no more items
        if len(dat['data']) < 10:
            break
    helper.write_file(json.dumps(all_data), 'one.all.json')
    return all_data

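# A standalone sketch of the token scraping done in fetch_one: take the first
# <script> under the main <div> and pull the value between the first pair of
# single quotes on the line mentioning `token`. The HTML below is made up.
from bs4 import BeautifulSoup

_demo_html = """
<div role="main"><script>
var _token = 'abc123';
</script></div>
"""

def extract_token(markup):
    script = BeautifulSoup(markup, 'html.parser').find('div', role='main').find('script')
    for line in script.text.split('\n'):
        if 'token' in line:
            return line.split('\'')[1]
    return ''

# extract_token(_demo_html) -> 'abc123'
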
def load(self, url, use_cache=True, show_log=False):
    """Fetch the ``raw info`` of the url: try the cache first and, on a miss,
    fall back to the Internet.

    :param url: the url to fetch
    :type url: str
    :param use_cache: try the local cache before the network
    :type use_cache: bool
    :param show_log: emit a cache hit/miss debug line
    :type show_log: bool
    :return: the ``raw info`` of the url
    :rtype: ``str``
    """
    _name = self.map_url_to_cache_id(url)
    raw = ''
    hit = False
    if use_cache:
        raw = self.load_from_cache(_name)
        # only a real hit when the cache had content
        hit = bool(raw)
    if not raw:
        if show_log:
            zlog.debug('from cache got nothing {}'.format(_name))
        raw = self.do_sess_get(url)
        if raw:
            helper.write_file(raw, _name)
    if show_log:
        zlog.debug('[{}:{:>8}] get {}'.format('Cache' if hit else 'Net', len(raw or ''), url))
    return raw

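# The same read-through flow, reduced to a standalone sketch with a plain file
# cache and a pluggable fetcher. The names (`read_through`, `cache_dir`) are
# illustrative, not from the source.
import os

def read_through(url, fetch, cache_dir='/tmp/raw_cache'):
    os.makedirs(cache_dir, exist_ok=True)
    pth = os.path.join(cache_dir, url.replace('/', '_'))
    if os.path.exists(pth):
        with open(pth) as f:
            return f.read()
    raw = fetch(url)
    if raw:
        with open(pth, 'w') as f:
            f.write(raw)
    return raw

# read_through('http://example.com', lambda u: 'fetched body')
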
def update_tags(self):
    """Update all the tags of 44.style to the local cache."""
    tags_root = []
    tags_raw = self.bs4get(D['tags'])
    if not tags_raw:
        log.error('cannot update tags.')
        return
    tagall = tags_raw.find_all('div', class_='tagall photo')
    for tag in tagall:
        tag_in = []
        ul_li = tag.ul.find_all('li')
        if ul_li:
            for li in ul_li:
                tag_in.append({
                    'src': li.a.get('href'),
                    'name': li.a.text,
                })
        tags_root.append(tag_in)
    if tags_root:
        helper.write_file(json.dumps(tags_root).encode(), 'd4.tags.json')

def md2list(self):
    dat = []
    for line in self.txt.split('\n'):
        line = line.lstrip()
        if not line.startswith('*'):
            continue
        try:
            # maxsplit=1: the url in the tail contains ':' as well
            k, desc = line.split(':', 1)
            # '[官网](' is the literal "official site" markdown link in the source
            desc_, url_ = desc.rstrip().split('[官网](')
            dat.append({
                'name': k.split('*')[1].strip(),
                'desc': desc_,
                'url': url_[:-1],
            })
        except ValueError:
            # lines that do not match the `name: desc [官网](url)` shape are skipped
            pass
    helper.write_file(json.dumps(dat), 'gitflat.awesome.json', append=False)
    # hard stop after the one-shot export
    os._exit(-1)

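# A worked example of the bullet format md2list parses; the sample line is
# made up, but follows the `name: desc [官网](url)` shape from the source.
_line = "* requests: HTTP for Humans. [官网](http://python-requests.org)"
_k, _desc = _line.split(':', 1)
_desc_, _url_ = _desc.rstrip().split('[官网](')
# _k.split('*')[1].strip() -> 'requests'
# _desc_.strip()           -> 'HTTP for Humans.'
# _url_[:-1]               -> 'http://python-requests.org'
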
def dump_my_page_config(txt):
    """Save the personal login info to the local cache.

    :param txt: raw html of the profile page
    :type txt: str
    :return: the parsed config fields
    :rtype: dict
    """

    def get_config(raw_mark):
        """Cut the account info out of the ``$CONFIG`` comment block."""
        _START = '<!-- $CONFIG -->'
        _END = '<!-- / $CONFIG -->'
        return raw_mark.split(_START)[1].split(_END)[0]

    txt = get_config(txt)
    # keep only the `$CONFIG['k']='v';` assignment lines, drop the declaration
    txt = [
        t[1:].rstrip() for t in txt.split('\n')
        if t and t.find('CONFIG') != -1 and t.find('var ') == -1
    ]
    dat = {}
    keys = [
        'oid', 'page_id', 'uid', 'nick', 'sex', 'watermark',
        'domain', 'lang', 'skin', 'avatar_large', 'pid',
    ]
    for t in txt:
        k, v = t[:-1].split('=')
        k = k.split('\'')[1]
        if k not in keys:
            continue
        # avatar_large is protocol-relative in the page; prefix the scheme
        dat[k] = v.replace('\'', '') if k != 'avatar_large' else 'http:' + v.replace('\'', '')
    helper.write_file(json.dumps(dat), base.app_pth['personal'])
    return dat

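# A worked example of the line format dump_my_page_config expects inside the
# CONFIG block (values made up): strip the leading '$' and the trailing ';',
# split on '=', then unquote.
_t = "$CONFIG['uid']='123456';"[1:].rstrip()
_k, _v = _t[:-1].split('=')
# _k.split('\'')[1]      -> 'uid'
# _v.replace('\'', '')   -> '123456'
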
def spawn(self, url, force_spawn=False):
    """Use the url for creation of domain and fetch cookies.

    - init cache dir by the url domain as ``<base>/domain``
    - save the cookies to file ``<base>/domain/cookie.txt``
    - init ``headers.get/post/json`` with response info
    - init ``site_dir/site_raw/site_media``

    :param url: the site entry url
    :type url: str
    :param force_spawn: refetch the homepage even if it is cached
    :type force_spawn: bool
    :return: True on success
    :rtype: bool
    """
    _url, domain = self.get_domain_home_from_url(url)
    if not _url:
        return False
    self.cache['site_dir'] = os.path.join(self.cache['base'], self.domain)
    for k in ['raw', 'media']:
        self.cache['site_' + k] = os.path.join(self.cache['site_dir'], k)
        helper.mkdir_p(self.cache['site_' + k], True)
    ck_pth = os.path.join(self.cache['site_dir'], 'cookie.txt')
    helper.mkdir_p(ck_pth)
    name = os.path.join(self.cache['site_raw'], 'homepage')
    # not force spawn and the cached homepage is intact: reuse the stored cookies
    if not force_spawn and helper.is_file_ok(name):
        self.sess.cookies = self.load_cookies(ck_pth)
        return True
    else:
        zlog.debug('{} not exist!'.format(name))
        res = self.sess.get(url, headers=self.__header__)
        if res.status_code != 200:
            return False
        if res:
            helper.write_file(res.content, name)
        # every request type (get/post/json) shares the first request's headers
        for k in self.headers:
            self.headers[k] = res.request.headers
        self.dump_cookies(cookies=self.sess.cookies, save_to=ck_pth)
        return True

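# A sketch of one common way to persist a requests session's cookies, in case
# dump_cookies/load_cookies are not to hand: requests.utils ships
# dict_from_cookiejar/cookiejar_from_dict, and json carries the dict. This
# mirrors what spawn() relies on, but is not the source implementation.
import json
import requests

def dump_cookies_sketch(sess, pth):
    with open(pth, 'w') as f:
        json.dump(requests.utils.dict_from_cookiejar(sess.cookies), f)

def load_cookies_sketch(sess, pth):
    with open(pth) as f:
        sess.cookies = requests.utils.cookiejar_from_dict(json.load(f))
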
def update_cache(self):
    pyres = self.main_page()
    flat = []
    for nfr in tqdm(pyres, ascii=True):
        # already expanded: just collect the sub resources
        if nfr.get('sub_res'):
            flat += nfr.get('sub_res')
            continue
        items = self.sub_page(nfr.get('url'))
        items = sorted(items, key=lambda s: s['name'])
        nfr['sub_res'] = items
        flat += nfr.get('sub_res')
    helper.write_file(json.dumps(pyres).encode(), 'jobble.json')
    helper.write_file(json.dumps(flat).encode(), 'jobble.flat.json')

def update_tag_pages_cache(self, tag):
    """Update the pages of a tag to the local cache.

    :param tag: a tag record with a ``src`` key
    :type tag: dict
    """
    _k = tag['src']
    if _k in json.loads(helper.read_file('d4.t2i.json')):
        log.debug('{} already got'.format(_k))
        return
    pages = self.fetch_tags_pages_by_index(_k)
    self.t2i[_k] = pages
    helper.write_file(json.dumps(self.t2i).encode(), 'd4.t2i.json')

def dump_failure_songs(self, song, action='add'):
    dat = self.load_failure_songs() or []
    dat = [x for x in dat if x]
    if action == 'add':
        dat.append(song)
    elif action == 'del':
        if song in dat:
            dat.remove(song)
    elif action == 'clear':
        dat = []
    else:
        zlog.error('unsupported action: ({})'.format(action))
        return
    # dedup before dumping
    dat = yaml.dump(list(set(dat)))
    file_pth = os.path.expanduser(self.failure_store)
    helper.write_file(dat, file_pth)

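# The dedup-then-dump pattern above, as a tiny round trip (PyYAML); sorted()
# is added here only to keep the dump deterministic.
import yaml

_songs = ['a.mp3', 'b.mp3', 'a.mp3']
_text = yaml.dump(sorted(set(_songs)))
# yaml.safe_load(_text) -> ['a.mp3', 'b.mp3']
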
def __init__(self, pth, dat=None, enable_default_log=True):
    """Initialize the config file.

    - file missing: seed the file with the dict ``dat``
    - file exists: initialize the config from it as dict data

    :param dat: initial values, as a ``dict``
    :type dat: dict
    :param pth: where the config file is stored
    :type pth: str
    :param enable_default_log: enable the default log config section
    :type enable_default_log: bool
    """
    try:
        if not helper.is_file_ok(pth):
            helper.write_file('', pth)
        self._pth, t = os.path.split(pth)
        self._cfg_name = t.split('.')[0]
    except Exception as _:
        self._pth = '/tmp'
        self._cfg_name = 'izen'
    self.cfg = profig.Config(pth, encoding='utf-8')
    # read the existing config
    self.cfg.read()
    # initialize the default log fields
    if enable_default_log:
        self.__spawn()
    # initialize the user-supplied dict
    if dat:
        self.__do_init(dat)
    # if the config file does not exist yet, it only lives in memory:
    # sync it to disk before the program goes on
    if not os.path.exists(os.path.expanduser(pth)):
        self.cfg.sync()

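# A minimal profig round trip, assuming profig's documented API
# (Config/init/sync); the file path and key names here are made up.
import profig

_cfg = profig.Config('/tmp/demo.cfg', encoding='utf-8')
_cfg.init('log.level', 'debug')   # register a key with a default
_cfg.sync()                       # read the file, then write merged values back
# _cfg['log.level'] -> 'debug'
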
def gen_lesson_url(self, chapters):
    dat = []
    for chap in tqdm(chapters, ascii=True):
        # get the mp3 url
        res = self.sess.post(M['lecture_mp3'], data={'lectureId': chap['lid']})
        cp = {
            'lid': chap['lid'],
            'name': chap['name'],
            'mp3': unquote(res.json()['obj']),
        }
        time.sleep(1)
        # get the doc url
        res = self.sess.post(M['lecture_doc'], data={'lectureId': chap['lid']})
        cp['doc'] = unquote(res.json()['obj'])
        dat.append(cp)
        # random pause between chapters to stay gentle on the server
        rd = abc.randint(1, 3)
        log.debug('sleep {}'.format(rd))
        time.sleep(rd)
    helper.write_file(json.dumps(dat), self.catalog_info['cache_file'])

def download_by_index(self, index):
    global cache_index
    global finished
    _e_list1 = ['33201', '34949', '35850', '45364', '47526']
    if not index:
        log.info('no index found...')
        return
    dat = self.get_page_by_index(index)
    if not dat:
        log.error('fail@none: {}'.format(index))
        return
    try:
        # extension, folder name, auto-increment name, fixed image url
        _ext = dat['img_src'].split('.')[-1]
        _name_off = 3 + len(_ext)
        if index in _e_list1:
            _name_off = 4 + len(_ext)
        img = dat['img_src'][:-_name_off]
        if index == '54856':
            _name_pre = ''
            fd_img = 'a'
        elif index in _e_list1:
            fd_img, _ = get_fd_name(dat['img_src'])
            _name_pre = ''
        else:
            _name_pre = img.split('/')[-1]
            fd_img = img[-1]
        _path_local = os.path.join(dat['time'], fd_img)
        fd = os.path.join(self.base_dir, _path_local)
        helper.mkdir_p(fd, True)
        os.chdir(fd)
        if index in _e_list1:
            _img_fmt = '{}{}1.{}'
        else:
            _img_fmt = '{}{}.{}'
        params = [
            {
                'img_url': _img_fmt.format(img, str(x + 1).zfill(2), _ext),
                'title': '{}-{}{}.{}'.format(
                    dat['name'].decode(),
                    _name_pre,
                    str(x + 1).zfill(2),
                    _ext,
                )
            }
            for x in range(dat['total'])
        ]
        _fail_count = 0
        for para in tqdm(params, ascii=True, desc='%8s ✈ %10s' % (index, _path_local)):
            rs = self.download_and_save(para)
            if rs == self.save_status['fail']:
                _fail_count += 1
                time.sleep(0.5)
            elif rs == self.save_status['ok']:
                time.sleep(1.5)
            elif rs == self.save_status['skip']:
                # no need to wait if the local file already exists
                time.sleep(0.0001)
            if _fail_count > 5:
                log.warn('fail@5 img of this, skip({}) => ({})'.format(index, _path_local))
                break
        cache_index += 1
        finished.append(index)
        helper.write_file(json.dumps(finished), '/tmp/mz.done')
        log.warn('Done:({}/{})'.format(cache_index, index))
    except TypeError as _:
        log.error('fail@type: {}'.format(index))

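# The fail-budget throttling used above, reduced to a standalone sketch: stop
# after too many failures, and sleep longer after real downloads than after
# skips. `fetch_one_img` is a stand-in, not a function from the source.
import time

def drain(params, fetch_one_img, budget=5):
    fails = 0
    for p in params:
        ok = fetch_one_img(p)
        if not ok:
            fails += 1
            time.sleep(0.5)
        else:
            time.sleep(1.5)
        if fails > budget:
            break
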
def md2dict(self):
    """Parse the markdown into ``### section -> category -> libs`` nesting."""
    flat_dat = []
    dat = []
    h3 = {'name': '', 'desc': '', 'groups': []}
    lis = {'cate': '', 'libs': []}
    line_count = 0
    for lno, line in enumerate(self.txt.split('\n')):
        line = line.lstrip()
        if not line:
            continue
        line_count += 1
        if line.startswith('*'):
            try:
                # maxsplit=1: the url in the tail contains ':' as well
                k, desc = line.split(':', 1)
                desc_, url_ = desc.rstrip().split('[官网](')
                d = {
                    'name': k.split('*')[1].strip(),
                    'desc': desc_,
                    'url': url_[:-1],
                }
                flat_dat.append(d)
                lis['libs'].append(d)
            except ValueError:
                # a '*' line without the `name: desc [官网](url)` shape
                # starts a new category
                lis = {
                    'cate': '*'.join(line.split('*')[1:]).strip(),
                    'libs': [],
                }
            continue
        if line.startswith('###'):
            # a new section: flush the previous one first
            if h3.get('name'):
                h3['groups'].append(lis)
                dat.append(h3)
                lis = {'cate': '', 'libs': []}
            h3 = {
                'name': line.split('###')[-1].strip(),
                'desc': '',
                'groups': [],
            }
            continue
        # any other line is the section description
        h3['desc'] = line.strip()
    # flush the last section, which has no '###' after it
    if h3.get('name'):
        h3['groups'].append(lis)
        dat.append(h3)
    helper.write_file(json.dumps(dat), 'github.awesome.json')
    helper.write_file(json.dumps(flat_dat), 'gitflat.awesome.json')
    dl_l = dbstore.rds.List(key='awesome.todo.libs')
    dl_l += dat
    # hard stop after the one-shot export
    os._exit(-1)

def save_img(dat, pth):
    if not dat:
        return
    helper.write_file(dat, pth)

def login(self, username='', password=''):
    form = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'qrcode_flag': False,
        'useticket': '1',
        'pagerefer': 'https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F',
        'vsnf': 1,
        'service': 'miniblog',
        'pwencode': 'rsa2',
        'sr': '1280*800',
        'encoding': 'UTF-8',
        'prelt': '41',
        'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META',
        'sp': '3d841d27085a2fac6f5218f18f4ce5caf3cb020c49bc109608dc106f3e14cf8354e41ad03444672f492b490f81f155fff9364f2dff86677429e5b745fbe4ccffadd5458a0a6f29a5d769c63801753b49b3eaf7b52489da04a79f0724b9842c9aac7f546d2eb037d44484bd0ad3c8ac35ba9136a1eceaaa59587168881dc06f3b',
        'servertime': '1512958714',
        'nonce': 'ZIVLK5',
        'rsakv': '1330428213',
    }
    dat = self.pre_login(username=username, password=password)
    form = dict(form, **dat)
    log.debug('STEP1: get {}'.format(M['login']))
    res = self.sess.post(M['login'], data=form, headers=self.post_headers)
    # parse the redirect url out of the login.php response
    pa = r'location\.replace\([\'"](.*?)[\'"]\)'
    loop_url = re.findall(pa, res.content.decode('GBK'))[0]
    log.debug('STEP2: get {}'.format(loop_url))
    # fetch the response of the first redirect url
    res = self.sess.get(loop_url)
    # the response has two parts; the first (setCrossDomainUrlList) fails with
    # 302 Moved Temporarily, so skip it and parse only the second part,
    # the same way as the login.php response
    final_url = re.findall(pa, res.content.decode('GBK'))[0]
    log.debug('STEP3: get {}'.format(final_url))
    res = self.sess.get(final_url)
    uuid_pa = r'"uniqueid":"(.*?)"'
    uuid_res = re.findall(uuid_pa, res.text, re.S)[0]
    log.debug('STEP4: user_id: {}'.format(uuid_res))
    url = M['profile'].format(uuid_res)
    raw = self.sess.get(url)

    def get_config(raw_mark):
        _START = '<!-- $CONFIG -->'
        _END = '<!-- / $CONFIG -->'
        return raw_mark.split(_START)[1].split(_END)[0]

    user_config = get_config(raw.text)
    user_config = self.dump_person_config(user_config)
    helper.write_file(json.dumps(user_config), 'personal.txt')
    raw = self.bs4markup(raw.text)
    log.debug('STEP5: title: {}'.format(raw.find('title').text))
    # the page title is "<nickname>的微博_微博" ("<nickname>'s Weibo_Weibo"): strip the suffix
    abc.update_cfg('weibo.nickname', raw.find('title').text.replace('的微博_微博', ''))
    log.info('[LOGIN:SUCCESS] {}({})'.format(cfg.get('weibo.nickname'), username))
    self.dump_cookies(self.sess.cookies)

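# The redirect-following trick in STEP2/STEP3, isolated: Sina's SSO pages carry
# the next hop in a javascript location.replace(...) call, so a single regex
# pulls out each url. The sample body below is made up.
import re

_pa = r'location\.replace\([\'"](.*?)[\'"]\)'
_body = 'location.replace("https://passport.example.com/next?t=1");'
# re.findall(_pa, _body)[0] -> 'https://passport.example.com/next?t=1'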