def parse_and_save_grammar_json(file_path: str):
    """
    Parse the grammar explanations and save them into the database.
    :param file_path: path of the grammar explanation JSON file
    :return:
    """
    grammar_categories = u_file.load_json_from_file(file_path)
    if not grammar_categories or 'data' not in grammar_categories:
        log.warn('The grammar json is invalid: {}'.format(file_path))
        return

    grammar_categories = grammar_categories.get('data')
    log.info('load grammar json success. category size: {}'.format(len(grammar_categories)))
    for grammar_category in grammar_categories:
        log.info('parse grammar category: {}'.format(grammar_category.get('title')))
        if grammar_category.get('title') != grammar_category.get('label'):
            log.warn('The grammar title and label are not the same.')

        grammars = grammar_category.get('grammerList')
        log.info('parse grammar category sub grammar. category: {}, grammar size: {}'
                 .format(grammar_category.get('title'), len(grammars)))
        for grammar in grammars:
            # explain/comment and type/category are expected to mirror each other;
            # log the grammars where they diverge.
            if grammar.get('explain') != grammar.get('comment') or grammar.get('type') != grammar.get('category') \
                    or grammar.get('category') != grammar_category.get('title'):
                log.warn('The grammar category is special. grammar: {}'.format(grammar.get('grammar')))

            log.info('get grammar: {}'.format(grammar.get('grammar')))
            db_grammar = Grammar(id=grammar.get('id'), content=grammar.get('content'))
            db_grammar.level = grammar.get('level')
            db_grammar.category = grammar.get('category')
            db_grammar.type = grammar.get('category')
            db_grammar.link = grammar.get('link')
            db_grammar.explain = grammar.get('explain')
            # Strip '#123'/'@123' reference markers from the example text.
            db_grammar.example = re.sub('[#@][0-9]*', '', grammar.get('exmple'))
            db_grammar.postscript = grammar.get('ps')
            save_grammar(db_grammar)
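# For reference, a minimal sketch of the Grammar model implied by the assignments
# above. This is an assumption inferred from usage, not the repo's actual model
# definition; column types and lengths are guesses (SQLAlchemy 1.4+ style).
from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class GrammarSketch(Base):  # hypothetical stand-in for the real Grammar model
    __tablename__ = 'grammar'

    id = Column(Integer, primary_key=True)
    content = Column(String(255))
    level = Column(String(32))
    category = Column(String(64))
    type = Column(String(64))
    link = Column(String(255))
    explain = Column(Text)
    example = Column(Text)
    postscript = Column(Text)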
def query_top_score_posts(count=1000) -> list:
    cache_file = r'cache\top_score_posts.json'
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)

    results = session.query(Post.id, Post.score) \
        .order_by(Post.score.desc()).limit(count).all()
    results = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(results, cache_file)
    return results
def query_posts_by_tag(tag, count=1000):
    cache_file = r'cache\tag_' + tag + '_posts.json'
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)

    results = session.query(Post.id, Post.score) \
        .filter(Post.tags.like('%{}%'.format(tag))) \
        .order_by(Post.score.desc()).limit(count).all()
    results = [dict(zip(v.keys(), v)) for v in results]
    u_file.cache_json(results, cache_file)
    return results
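# query_top_score_posts and query_posts_by_tag repeat the same read-cache /
# query / write-cache steps. A minimal sketch of one shared helper (a
# hypothetical refactor, not existing code; it assumes u_file.load_json_from_file
# and u_file.cache_json behave as used above):
import os


def query_rows_with_cache(cache_file: str, run_query) -> list:
    """Return cached rows if cache_file exists, else run run_query() and cache the rows."""
    if os.path.isfile(cache_file):
        return u_file.load_json_from_file(cache_file)
    results = [dict(zip(v.keys(), v)) for v in run_query()]
    u_file.cache_json(results, cache_file)
    return results


# Usage sketch: query_top_score_posts(count) would reduce to
#   query_rows_with_cache(r'cache\top_score_posts.json',
#                         lambda: session.query(Post.id, Post.score)
#                                        .order_by(Post.score.desc()).limit(count).all())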
def get_album_track_info_from_cache(album_id) -> list:
    track_cache_file = r'cache\album-tracks-' + str(album_id) + '.json'
    if os.path.isfile(track_cache_file):
        u_log.info('use track info from cache file: {}'.format(track_cache_file))
        return u_file.load_json_from_file(track_cache_file)

    tracks: list = get_album_tracks(album_id)
    u_log.info('get_album_tracks return track size: {}'.format(len(tracks)))

    track_infos = []
    track_index = 1
    for track in tracks:
        track_infos.append(get_track_info(track.get('trackId')))
        u_log.info('end get track info: {}({}/{})'.format(track.get('trackId'), track_index, len(tracks)))
        track_index += 1

    u_log.info('all track infos size: {}'.format(len(track_infos)))
    # Pass the cache file path so the result is actually cached for next time.
    u_file.cache_json(track_infos, track_cache_file)
    return track_infos
def output_course_list(course_data_path: str):
    course_info = u_file.load_json_from_file(course_data_path)
    template = u_file.read_content(r'cache/template.html')

    html_content = '<ul>\n'
    for stage_course in course_info['stageCourses']:
        html_content += '<li>' + stage_course['courseName']

        # If the course has questions, list them as a nested list.
        questions = stage_course['questions']
        if len(questions) > 0:
            html_content += '\n<ul>'
            for question in questions:
                # question_detail_content = question['detail']['content']
                html_content += '<li>' + question['name'] + '---' + question['summary'] + '</li>\n'
            html_content += '</ul>'
        html_content += '</li>\n'
    html_content += '</ul>'

    template = template.replace('{{title}}', course_info['name'])
    template = template.replace('{{content}}', html_content)
    u_file.write_content(r'cache\output.html', template)
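# The output functions here assume cache/template.html defines {{title}} and
# {{content}} placeholders. A minimal stand-in for local testing (hypothetical;
# the real template is not shown in this excerpt):
MINIMAL_TEMPLATE = """<!DOCTYPE html>
<html>
<head><meta charset="utf-8"><title>{{title}}</title></head>
<body>{{content}}</body>
</html>
"""
# e.g. u_file.write_content(r'cache/template.html', MINIMAL_TEMPLATE) before
# running output_course_list if the template file is missing.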
def output_course_chapter_notes(name):
    course_data_path = r'cache\course-info-{}.json'.format(name)
    course_info = u_file.load_json_from_file(course_data_path)

    content = '# {}\n\n'.format(name)
    log.info('stage_course size: {}'.format(len(course_info['stageCourses'])))
    for stage_course in course_info['stageCourses']:
        chapters = stage_course['chapters']
        content += '## {}\n\n'.format(stage_course['courseName'])
        log.info('course {} chapters size: {}'.format(stage_course['courseName'], len(chapters)))
        if len(chapters) <= 0:
            continue

        # Iterate over each chapter.
        for chapter in chapters:
            content += '\n### {}\n\n'.format(chapter['name'])
            periods = chapter['periods']
            log.info('chapter: {}, periods size: {}'.format(chapter['name'], len(periods)))
            if len(periods) <= 0:
                continue

            # Iterate over each video lesson.
            for period in periods:
                # Fetch the lesson's notes and append them.
                content += '\n#### {}\n\n'.format(period['name'])
                notes = get_video_notes(period['id'])
                log.info('period: {}, notes size: {}'.format(period['name'], len(notes)))
                if len(notes) <= 0:
                    log.info('The period: {}, notes is empty.'.format(period['name']))
                    continue
                for note in notes:
                    # Skip trivially short notes.
                    if len(note['content']) <= 5:
                        log.info('The note is short: {}'.format(note['content']))
                        continue
                    content += note['content'] + '\n---------{}\n'.format(note['likeNum'])
    u_file.write_content(r'cache\output-note-{}.md'.format(name), content)
def output_course_question(name):
    course_data_path = r'cache\course-info-{}.json'.format(name)
    course_info = u_file.load_json_from_file(course_data_path)
    template = u_file.read_content(r'cache/template.html')

    html_content = ''
    for stage_course in course_info['stageCourses']:
        html_content += '<h1><a href="{}" target="_blank">{}</a></h1>\n' \
            .format(stage_course['url'], stage_course['courseName'])

        # If the course has questions, list them.
        questions = stage_course['questions']
        if len(questions) > 0:
            for question in questions:
                # question_detail_content = question['detail']['content']
                html_content += '<h4><a href="{}" target="_blank">{}</a></h4>\n' \
                    .format(question['url'], question['title'])
                # html_content += question['detail']['content']

    template = template.replace('{{title}}', course_info['name'])
    template = template.replace('{{content}}', html_content)
    u_file.write_content(r'cache\output-title-{}.html'.format(name), template)
            'This book has filled download_url. {}'.format(book_info))
            continue

        html_content = u_file.get_content(book_info['download_page'], encoding='gb2312')

        # The download info is injected into the document by JS, so extract it
        # from the raw page source with regular expressions.
        download_info_pattern = re.compile(r'_downInfo = (\{Address:.+\})</script>')
        address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')
        search_download_content = re.search(download_info_pattern, html_content)
        search_address_content = re.search(address_pattern, html_content)
        if search_download_content is None or search_address_content is None:
            log.error('Can not match any download data.')
            continue

        download_address = search_address_content.group(1)
        log.info('download_info: {}'.format(search_download_content.group(1)))
        book_info['download_url'] = DOWNLOAD_BASE_URL + download_address
        book_info['download_info'] = search_download_content.group(1)
    u_file.cache_json(book_infos, r'result/full_book_infos.json')
    return book_infos


if __name__ == '__main__':
    book_infos = u_file.load_json_from_file(r'result/full_book_infos.json')
    book_infos.sort(key=lambda x: x['title'])
    u_file.cache_json(book_infos, r'result/sort_book_infos.json')
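# A quick self-contained check of the _downInfo patterns above, run against a
# hypothetical page fragment (real pages come from book_info['download_page'];
# the sample string below is made up for illustration):
import re

_sample = '<script>var _downInfo = {Address:"files/demo.zip",TypeID:1}</script>'
_info_match = re.search(r'_downInfo = (\{Address:.+\})</script>', _sample)
_addr_match = re.search(r'_downInfo = \{Address:\"(.+)\",TypeID', _sample)
assert _info_match and _addr_match
print(_info_match.group(1))  # {Address:"files/demo.zip",TypeID:1}
print(_addr_match.group(1))  # files/demo.zip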
    for img_element in img_elements:
        image_url = img_element.find('img')['data-src']
        # Replace the '@...' thumbnail suffix so the full-size image is fetched.
        image_url = 'http:' + re.sub(r"@[^\n]+", '-', image_url)
        u_file.download_file(image_url, title + '-' + u_file.get_file_name_from_url(image_url), r'result')
    return []


def get_all_urls(url: str) -> list:
    html_content = u_file.get_content(url, encoding='UTF-8')
    soup = BeautifulSoup(html_content, 'lxml')

    infos = []
    comment_node = soup.select('div.is-top p.text')
    texts = comment_node[0].string.split('\n')
    # Collect the anchor tags from the pinned comment; find_all('a') (rather
    # than find('img')) is needed so each node carries an href to read.
    a_nodes = comment_node[0].find_all('a')
    index = 1
    for a_node in a_nodes:
        infos.append({
            'url': a_node['href'],
            'title': texts[index]
        })
        index += 1
    return infos


if __name__ == '__main__':
    infos = u_file.load_json_from_file(r'result\source.json')
    for info in infos:
        download_pictures(info['url'], u_file.convert_windows_path(info['title']))