def crawl_video_info(template_page_url: str):
    max_page = 140
    video_infos = []
    parse_url = urlparse(template_page_url)
    for index in range(1, max_page):
        log.info('begin crawl page.({}/{})'.format(index, max_page))
        html_content = u_file.get_content(template_page_url.format(index))
        soup = BeautifulSoup(html_content, 'lxml')

        video_nodes = soup.select('div.stui-vodlist__detail')
        log.info('video size: {}'.format(len(video_nodes)))
        for video_node in video_nodes:
            a_node = video_node.select_one('h4 > a')
            span_node = video_node.select('p.sub > span')
            view_count = int(span_node[2].text.strip())
            like_count = int(span_node[1].text.strip())
            video_infos.append({
                'title': a_node.string,
                'url': parse_url._replace(path=a_node['href']).geturl(),
                'view': view_count,
                'like': like_count
            })
    video_infos.sort(key=lambda x: x['like'], reverse=True)
    u_file.cache_json(video_infos, r'result\video-infos.json')
    return video_infos

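# Usage sketch (assumption, not part of the original source): the template URL
# must contain a '{}' placeholder that crawl_video_info fills with the page
# index; the URL below is hypothetical.
#   video_infos = crawl_video_info('https://example.com/vodtype/1-{}.html')
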
def fill_download_url(book_infos: list) -> list:
    log.info('total book infos size: {}'.format(len(book_infos)))
    for book_info in book_infos:
        if 'download_url' in book_info:
            log.info('This book already has a download_url. {}'.format(book_info))
            continue
        html_content = u_file.get_content(book_info['download_page'], encoding='gb2312')

        # The download info is embedded in the page as a JS object that is rendered into the document.
        download_info_pattern = re.compile(r'_downInfo = (\{Address:.+\})</script>')
        address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')
        search_download_content = re.search(download_info_pattern, html_content)
        search_address_content = re.search(address_pattern, html_content)
        if search_download_content is None or search_address_content is None:
            log.error('Can not match any download data.')
            continue

        download_address = search_address_content.group(1)
        log.info('download_info: {}'.format(search_download_content.group(1)))
        book_info['download_url'] = DOWNLOAD_BASE_URL + download_address
        book_info['download_info'] = search_download_content.group(1)
    u_file.cache_json(book_infos, r'result/full_book_infos.json')
    return book_infos

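# A quick illustration of the _downInfo extraction above, run against a
# hypothetical page fragment (the Address value is made up):
def demo_down_info_regex():
    sample = '_downInfo = {Address:"book/sample.zip",TypeID:1}</script>'
    match = re.search(r'_downInfo = \{Address:\"(.+)\",TypeID', sample)
    print(match.group(1))  # prints: book/sample.zip
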
def query_top_score_posts(count=1000) -> list:
    cache_file = r"cache\top_score_posts.json"
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = session.query(Post.id, Post.score)\
        .order_by(Post.score.desc()).limit(count).all()
    result = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(result, cache_file)
    return result

def query_posts_by_tag(tag, count=1000):
    cache_file = r'cache\tag_' + tag + '_posts.json'
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = session.query(Post.id, Post.score) \
        .filter(Post.tags.like('%{}%'.format(tag))) \
        .order_by(Post.score.desc()).limit(count).all()
    results = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(results, cache_file)
    return results

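# The Post model and session used above are defined elsewhere in the repo.
# A minimal sketch of the assumed mapping (column names inferred from the
# queries; the database URL is hypothetical). Note that tags.like('%tag%')
# is a substring match, so querying 'cat' would also hit 'catgirl'.
from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Post(Base):
    __tablename__ = 'posts'
    id = Column(Integer, primary_key=True)
    score = Column(Integer)
    tags = Column(Text)

session = sessionmaker(bind=create_engine('sqlite:///posts.db'))()
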
def get_all_page_book_list(template_url: str) -> list:
    max_page_size = 100
    book_infos = []
    for index in range(1, max_page_size):
        url = template_url.format(index)
        page_book_infos = get_book_list(url)
        if len(page_book_infos) == 0:
            log.warning('The page book list is empty, stop crawling.')
            break
        book_infos.extend(page_book_infos)
        log.info('end crawl url: {}, book size: {}'.format(url, len(page_book_infos)))
    u_file.cache_json(book_infos, r'result/total_book_info.json')
    return book_infos

def crawler_exam_questions():
    """
    Download the question lists of all exam papers.
    :return:
    """
    log.info('--->begin crawler exam questions.')
    exam_list_url = 'https://share.jiemo.net/NSeries/getrealQuestionList'
    exam_question_url = 'https://share.jiemo.net/NSeries/getrealQuestionPaper'
    response = u_file.get_json(exam_list_url)
    exams = m_get(response, 'data')
    if m_get(response, 'result') != 0 or exams is None:
        log.error('request exam list error. response: {}'.format(response))
        return
    exam_infos = []
    log.info('request exam list success. exams size: {}'.format(len(exams)))
    for exam in exams:
        for sub_exam in m_get(exam, 'paperList'):
            exam_infos.append({
                'level': m_get(exam, 'level'),
                'title': m_get(sub_exam, 'title').replace('年-', '年真题-')
            })
    log.info('exam paper size: {}'.format(len(exam_infos)))
    for exam_info in exam_infos:
        log.info('--->begin download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))

        # Skip exam papers that are already cached locally.
        exam_question_cache_file = r'result\jiemo-exam\{}-{}.json'.format(exam_info['level'], exam_info['title'])
        u_file.ready_dir(exam_question_cache_file)
        if os.path.isfile(exam_question_cache_file):
            log.info('The exam question cache file exists: {}'.format(exam_question_cache_file))
            continue

        response = requests.post(exam_question_url,
                                 data={'level': exam_info['level'], 'title': exam_info['title']}, verify=False)
        if response.status_code != 200:
            log.error('request status code is not 200. code: {}'.format(response.status_code))
            continue
        response = json.loads(response.text)
        exam_questions = m_get(response, 'data')
        if m_get(response, 'result') != 0 or exam_questions is None:
            log.error('request exam questions error. response: {}'.format(response))
            continue
        log.info('get exam questions success. size: {}'.format(len(exam_questions)))
        u_file.cache_json(exam_questions, exam_question_cache_file)
        log.info('--->end download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))
    log.info('--->end crawler exam questions.')

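# m_get is used throughout as a null-safe accessor but is not defined in this
# excerpt. A minimal sketch of the assumed behavior:
def m_get(data, key, default=None):
    # Return data[key] when data is a dict containing key, otherwise default.
    if isinstance(data, dict) and key in data:
        return data[key]
    return default
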
def get_album_track_info_from_cache(album_id) -> list:
    track_cache_file = r'cache\album-tracks-' + str(album_id) + '.json'
    if os.path.isfile(track_cache_file):
        u_log.info('use track info from cache file: {}'.format(track_cache_file))
        return u_file.load_json_from_file(track_cache_file)

    track_index = 1
    tracks: list = get_album_tracks(album_id)
    u_log.info('get_album_tracks return track size: {}'.format(len(tracks)))
    track_infos = []
    for track in tracks:
        track_infos.append(get_track_info(track.get('trackId')))
        u_log.info('end get track info: {}({}/{})'.format(track.get('trackId'), track_index, len(tracks)))
        track_index += 1
    u_log.info('all track infos size: {}'.format(len(track_infos)))
    # Write to the same cache file this function reads from, so the cache is hit next time.
    u_file.cache_json(track_infos, track_cache_file)
    return track_infos

def crawler_special_knowledge():
    # Vocabulary conjugation formulas
    formula_url = 'https://ns-api.jiemo.net/v2/book/formulaDetail'
    cache_file = r'result\jiemo-grammar\formula.json'
    if os.path.isfile(cache_file):
        log.info('The formula json exists: {}'.format(cache_file))
    else:
        data = post_special(formula_url)
        if data is None:
            log.info('request formula failed')
        else:
            log.info('request formula data success')
            u_file.cache_json(data, cache_file)

    # Fetch the exam handbooks for N1 and N2
    exam_book_url = 'https://ns-api.jiemo.net/v2/book/valuableBookDetail'
    for level in ['N1', 'N2']:
        cache_file = r'result\jiemo-grammar\exam-book-{}.json'.format(level)
        if os.path.isfile(cache_file):
            continue
        data = post_special(exam_book_url, {'level': level})
        if data is None:
            log.info('request exam book {} failed.'.format(level))
        else:
            log.info('request exam book {} success.'.format(level))
            u_file.cache_json(data, cache_file)

def crawler_grammar():
    """
    Download the grammar-explanation JSON for every level from the Jiemo Japanese JLPT app.
    :return:
    """
    grammar_url = 'https://ns-api.jiemo.net/v2/NSeries/getGrammarCategroy'
    levels = ['N1', 'N2', 'N3', 'N4', 'N5']
    for level in levels:
        log.info('--->begin download grammar: {}'.format(level))
        grammar_cache_file = r'result\jiemo-grammar\grammar-{}.json'.format(level)
        u_file.ready_dir(grammar_cache_file)
        if os.path.isfile(grammar_cache_file):
            log.info('The grammar exists. file: {}'.format(grammar_cache_file))
            continue

        data = post_special(grammar_url, {'level': level})
        if data is None:
            log.info('request grammar failed. level: {}'.format(level))
            continue
        u_file.cache_json(data, grammar_cache_file)
        log.info('--->end download grammar: {}'.format(level))

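# post_special is not defined in this excerpt either. A hedged sketch inferred
# from its call sites: POST the shared COMMON_PARAMS merged with any extras,
# and return the 'data' payload only when the API reports success (result == 0).
def post_special(url, extra_params=None):
    params = COMMON_PARAMS.copy()
    if extra_params:
        params.update(extra_params)
    response = requests.post(url, data=params, verify=False)
    if response.status_code != 200:
        return None
    body = response.json()
    return body.get('data') if body.get('result') == 0 else None
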
def download_exam_questions():
    """
    Download the past-exam question list JSON from the Yangtuo Japanese vocabulary app.
    Only the N1-N3 question banks are available, and some years' papers are missing.
    :return:
    """
    n_levels = [1, 2, 3]
    for n_level in n_levels:
        log.info('--->begin download exam question. category: N{}真题'.format(n_level))
        exam_list_url = 'http://vocabulary.ytaxx.com/api/exam/getExamList?category={}'.format(n_level - 1)
        response = u_file.get_json(exam_list_url)
        if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
            log.error('request exam list error. category: N{}真题'.format(n_level))
            continue
        exams = m_get(response, 'data', [])
        log.info('request category exams success. exam size: {}'.format(len(exams)))
        for exam in exams:
            # Skip exams whose questions have already been downloaded.
            exam_cache_file = r'result\yt-exam\N{}-{}-{}.json'.format(n_level, exam['examName'], exam['id'])
            u_file.ready_dir(exam_cache_file)
            if os.path.isfile(exam_cache_file):
                log.info('The exam questions are downloaded. id: {}, name: {}'.format(exam['id'], exam['examName']))
                continue

            # Download the exam-question JSON and save it to a local file.
            log.info('begin download exam question. exam name: {}'.format(exam['examName']))
            exam_question_url = 'http://vocabulary.ytaxx.com/api/exam/questions?examId={}'.format(exam['id'])
            response = u_file.get_json(exam_question_url)
            if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
                log.error('request exam questions error. category: N{}真题'.format(n_level))
                continue
            questions = response['data'][0]['questionList']
            exam['question'] = questions
            log.info('request exam question success. question size: {}'.format(len(questions)))
            u_file.cache_json(exam, exam_cache_file)
            time.sleep(0.2)
        log.info('--->end download exam question. category: N{}真题'.format(n_level))

def get_book_list(url: str) -> list:
    html_content = u_file.get_content(url, encoding='gb2312')
    soup = BeautifulSoup(html_content, 'lxml')

    book_elements = soup.select('li.item > a')
    log.info('get book elements size: {}'.format(len(book_elements)))
    book_infos = []
    for book_element in book_elements:
        book_infos.append({
            'download_page': BASE_HOST + book_element['href'],
            'cover_image_url': book_element.find('img', {'class': 'tu'})['src'],
            'title': book_element.select('div.info > p.name')[0].string,
            'update_time': book_element.select('div.info > p.type > span')[0].string,
            'size': book_element.select('div.info > p.type > span')[1].string
        })
    u_file.cache_json(book_infos, r'result/book_info.json')
    return book_infos

if __name__ == '__main__':
    book_infos = u_file.load_json_from_file(r'result/full_book_infos.json')
    book_infos.sort(key=lambda x: x['title'])
    u_file.cache_json(book_infos, r'result/sort_book_infos.json')

def test_cache_json():
    json_data = {'a': 1, 'b': 2}
    cache_file = u_file.cache_json(json_data)
    u_unittest.assert_true(os.path.isfile(cache_file))
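
# The u_file.cache_json contract inferred from the call sites above (a sketch,
# not the real implementation): write data as JSON to cache_file, or to a
# default path when the argument is omitted, and return the path written.
import json
import os

def cache_json_sketch(data, cache_file=None):
    cache_file = cache_file or os.path.join('cache', 'cache.json')
    if os.path.dirname(cache_file):
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return cache_file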