def crawl_all_page(self):
    """Crawl every index page.

    - determine the total page count
    - crawl the pages one by one, resuming from the last finished index
    - persist the finished index to the db so a restart can resume

    :return: None
    """

    def get_total_num():
        """Scrape the total page count from page 1's pagination widget.

        The second-to-last ``<a>`` of the pagination div holds the last
        page number; return it as ``int`` so it can be compared against
        the running page index.
        """
        url = M['index'].format(1)
        raw = self.bs4markup(self.do_get(url))
        pagi = raw.find('div', 'pagination')
        return int(pagi.find_all('a')[-2].get_text())

    def crawl_each(idx):
        """Fetch one index page worth of records."""
        return self.get_page(idx)

    def update_done_num(idx):
        """Persist the last finished page index (to ``redis``) for resuming.

        :param idx: page index that has just been processed
        :return: None
        """
        abc.update_cfg('xici.last_index', idx)

    # total = get_total_num()
    # Hard-coded snapshot of the page count; re-enable get_total_num()
    # above to refresh it dynamically.
    total = 2564
    st = 0
    each_size = 100
    while True:
        start_str = 'START@{}'.format(st * each_size)
        for i in tqdm(range(st * each_size, (st + 1) * each_size),
                      ascii=True, desc=start_str):
            # Skip indexes already recorded as done (resume support).
            if i <= int(cfg.get('xici.last_index', 0)):
                continue
            if i > total:
                return
            _ = crawl_each(i)
            if _:
                self.save_to_db(_)
            update_done_num(i)
            # Random delay to avoid getting banned for crawling too fast.
            a = abc.randint(6, 14)
            time.sleep(a)
        st += 1
def get_free_pages(self):
    """Walk all 1963 free-proxy index pages.

    Pages at or below the ``kdl.last_index`` checkpoint are skipped so an
    interrupted run resumes where it stopped; each crawled page is saved
    and the checkpoint advanced before a randomized anti-ban pause.
    """
    for page_no in tqdm(range(1, 1964), ascii=True):
        # Resume support: the checkpoint is re-read every iteration.
        if page_no <= int(cfg.get('kdl.last_index', 0)):
            continue
        log.debug('START@({})'.format(page_no))
        rows = self.get_page(M['free_index'], page_no)
        if rows:
            self.save_to_db(rows, 'kdl.free.proxies')
        abc.update_cfg('kdl.last_index', page_no)
        time.sleep(abc.randint(5, 10))
def get_ops_pages(self):
    """Crawl all 10 ops index pages, persist the combined rows, and
    return them.

    :return: list of rows scraped from every page
    """
    rows = []
    for page_no in tqdm(range(1, 11), ascii=True):
        rows.extend(self.get_page(M['ops_index'], page_no, id_='freelist'))
        time.sleep(abc.randint(5, 10))
    self.save_to_db(rows, 'kdl.ops.proxies')
    return rows
def update_albums(self, uid='1915268965'):
    """Fetch all photo albums of a user (by ``uid``) and upsert them
    into the database.

    The number of albums is small and the photo counts inside them keep
    changing, so every album is re-written on each run.

    :param uid: user id, or a dict containing a ``'uid'`` key
    :type uid: str/dict
    :return: None
    :rtype: None
    """
    if isinstance(uid, dict):
        uid = uid['uid']
    page_count = cfg.get('weibo.album_page_count', 20)
    params = {
        'uid': uid,
        'count': page_count,
    }

    def fetch_album(page):
        """Fetch the next album page, upsert each album, and report
        whether more pages remain.

        :param page: mutable progress state (``index``/``updated``)
        :return: True while fewer albums were seen than the reported total
        """
        page['index'] += 1
        params['page'] = page['index']
        raw = self.sess.get(M['album'], params=params, headers=self.get_headers)
        albums_dat = raw.json()
        albums = albums_dat['data']
        _total_albums = albums['total']
        page['updated'] += len(albums['album_list'])
        for album_ in albums['album_list']:
            wb_mg_doc.album_update(album_)
        # BUGFIX: the original compared ``album_list`` (a list) with the
        # integer total, which is always False and stopped after one page.
        # Keep paging while fewer albums have been seen than the total,
        # guarding against an empty page to avoid looping forever.
        return bool(albums['album_list']) and page['updated'] < _total_albums

    page = {
        'index': 0,
        'updated': 0,
    }
    log.debug('Try Update ({})'.format(uid))
    # One fetch per loop test; sleep between pages to avoid being banned.
    while fetch_album(page):
        time.sleep(abc.randint(5, 9))
    log.debug('Success update ({}) albums info'.format(page['updated']))
def down(lec, which_type):
    """Download one lecture asset (``mp3`` or ``doc``) to the catalog
    directory, skipping files that already exist.

    Closure: reads ``self.catalog_info`` from the enclosing scope.

    :param lec: lecture record with ``'name'`` plus a URL under the
        ``which_type`` key
    :param which_type: asset kind, one of ``'mp3'`` / ``'doc'``
    :return: None
    """
    if which_type not in ['mp3', 'doc']:
        log.warn('Not Supported of ({})'.format(which_type))
        return
    save_name = os.path.join(self.catalog_info['base_pth'], which_type,
                             '{}.{}'.format(lec['name'], which_type))
    # NOTE(review): mkdir_p is given the full file path, not its parent
    # directory — presumably helper.mkdir_p creates the parent; confirm,
    # otherwise a directory named like the file would block the download.
    helper.mkdir_p(save_name)
    if helper.is_file_ok(save_name):
        # blank print() pushes the log line below the tqdm progress bar
        print()
        log.debug('SKIP {}'.format(save_name))
        return
    # Default wait of 2s; use a longer random wait when downloading mp3.
    rd = 2
    if which_type == 'mp3':
        rd = abc.randint(3, 6)
    url = lec[which_type]
    log.debug('[WAIT] {}s for ({}:{})...'.format(rd, save_name, url))
    time.sleep(rd)
    wget.download(url, out=save_name)
def gen_lesson_url(self, chapters):
    """Resolve the mp3 and doc download URLs for every chapter and
    cache the whole list as JSON.

    :param chapters: iterable of chapter dicts with ``'lid'``/``'name'``
    :return: None (result is written to the catalog cache file)
    """
    lessons = []
    for chapter in tqdm(chapters, ascii=True):
        lid = chapter['lid']
        # mp3 url first
        resp = self.sess.post(M['lecture_mp3'], data={'lectureId': lid})
        entry = {
            'lid': lid,
            'name': chapter['name'],
            'mp3': unquote(resp.json()['obj']),
        }
        time.sleep(1)
        # then the doc url
        resp = self.sess.post(M['lecture_doc'], data={'lectureId': lid})
        entry['doc'] = unquote(resp.json()['obj'])
        lessons.append(entry)
        pause = abc.randint(1, 3)
        log.debug('sleep {}'.format(pause))
        time.sleep(pause)
    helper.write_file(json.dumps(lessons), self.catalog_info['cache_file'])
def update_photos(self, album_info, init_photos=False):
    """Fetch every photo record of one album and write them to the db.

    - In init mode, batch-write everything and ignore errors.
    - Otherwise (update mode), only write records whose timestamp is
      newer than the latest one already stored.

    :param album_info: album document (``uid``, ``album_id``,
        ``count.photos``, optional ``album_type``)
    :type album_info: dict
    :param init_photos: whether this is the initial full import
    :type init_photos: bool
    :return: None
    :rtype: None
    """
    page_count = cfg.get('weibo.photo_page_count', 32)
    params = {
        'uid': album_info['uid'],
        'album_id': album_info['album_id'],
        'type': album_info.get('album_type', 3),
        'count': page_count,
        'page': 1,
    }
    # If a previous update run ended abnormally, the next run resumes
    # from the last valid record (via the timestamp filter below).
    # TODO: persist the crawl progress to the database instead.
    _max_page = album_info['count']['photos'] // page_count
    # Timestamp of the most recent stored photo; 0 fetches everything.
    latest_ts = 0
    if not init_photos:
        # Not initializing: try to read the latest timestamp from the db.
        last_doc = wb_mg_doc.WeiboPhotos.objects(
            __raw__={
                'album_id': album_info['album_id']
            }).first()
        if last_doc:
            latest_ts = last_doc.timestamp
    # start = 0
    while True:
        suc = 'DONE'
        # start += 1
        # params['page'] = start
        raw = self.sess.get(M['photo'], params=params, headers=self.get_headers)
        log.debug('try crawl: {}'.format(raw.url))
        photos = raw.json()
        raw_p_list = photos['data']['photo_list']
        # In update mode, drop records already stored (timestamp filter).
        if init_photos:
            p_list = raw_p_list
        else:
            p_list = [x for x in raw_p_list if x['timestamp'] > latest_ts]
        if not p_list:
            log.debug('All records has Updated already!!!')
            return
        aff_row = dbstore.batch_write(p_list, 'weibo.photo.details')
        # Update-mode termination checks:
        if not init_photos:
            # Some records were filtered out -> we reached already-stored
            # data, so everything newer has been written.
            if len(p_list) != len(raw_p_list):
                log.info('All Photos records updated!!!')
                return
            # Fewer rows written than submitted -> overlap with existing
            # records, also means we are up to date.
            if len(p_list) != aff_row:
                log.info('DONE@ ({}/{})'.format(len(p_list), aff_row))
                return
        if not aff_row:
            suc = 'SKIP'
        # Stop once the current page reaches the computed maximum.
        if params['page'] >= _max_page:
            return
        # Random delay to avoid getting banned for crawling too fast.
        _ri = abc.randint(2, 7)
        log.debug('{}: ({}) {}/{}, and sleep {}s'.format(
            suc, len(p_list), params['page'], _max_page, _ri))
        params['page'] += 1
        for _ in tqdm(range(_ri), ascii=True, desc='sleep {}s'.format(_ri)):
            time.sleep(1)