def get_cookie(self):
    """
    Return the current cookie jar serialized as a string.

    The jar is written through ``self.cookieJar.save`` to a scratch file in the
    current working directory, the file is read back, and then removed.

    :return: the text the cookie jar wrote to the scratch file
    :rtype: str
    """
    filename = ExtraTools.md5(ExtraTools.get_time())
    try:
        # Create the (empty) file first, as before, so it exists when save() runs.
        with open(filename, 'w'):
            pass
        self.cookieJar.save(filename)
        with open(filename) as f:
            content = f.read()
    finally:
        # The scratch file contains session cookies: remove it even when saving
        # or reading fails, instead of leaving it behind on disk.
        if os.path.exists(filename):
            os.remove(filename)
    return content
def parse_date(date='1357-08-12'):
    """
    Turn a date label into a date value.

    Labels containing a colon resolve to yesterday when they also contain
    u'昨天', otherwise to today. Labels containing u'今天' resolve to today.
    Anything else is handed to ``ParserTools.match_content`` together with a
    ``YYYY-MM-DD`` pattern and '1357-08-12' as third argument (presumably the
    fallback when nothing matches -- confirm against ParserTools).

    :param date: raw date text
    :return: whatever ExtraTools.get_yesterday / ExtraTools.get_today /
        ParserTools.match_content returns for the matching case
    """
    has_clock = u":" in date
    if has_clock and u'昨天' in date:
        return ExtraTools.get_yesterday()
    if has_clock or u'今天' in date:
        return ExtraTools.get_today()
    # Original author's note on the placeholder date: it follows the mnemonic for
    # the months that always have 31 days (1, 3, 5, 7, 8, 10, 12).
    return ParserTools.match_content(r'\d{4}-\d{2}-\d{2}', date, '1357-08-12')
def generate_article_page(self, article):
    """
    Render one article into a page file inside the HTML pool.

    The article is formatted with ``Template.answer`` and the result is wrapped
    in ``Template.question`` with an empty description.

    :param article: article to render
    :type article: src.container.data.article.Article
    :return: path of the written file (``Path.html_pool_path`` + '/' + random name)
    """
    # NOTE(review): 'Y-m-d H:i:s' is a PHP-style pattern -- confirm that
    # ExtraTools.format_date expects this rather than a strftime pattern.
    answer_content = Template.answer.format(
        **{
            'author_avatar_url': article.author_avatar_url,
            'author_name': article.author_name,
            'author_id': article.author_id,
            'author_headline': article.author_headline,
            'content': article.content,
            'comment_count': article.comment_count,
            'voteup_count': article.voteup_count,
            'updated_time': ExtraTools.format_date('Y-m-d H:i:s', article.updated_time),
        }
    )
    filename = self.get_random_html_file_name()
    content = Template.question.format(
        **{
            'title': article.title,
            'description': '',
            'answer': answer_content,
        }
    )
    uri = Path.html_pool_path + '/' + filename
    # Context manager so the handle is closed even if write() raises.
    with open(uri, 'w') as buf_file:
        buf_file.write(content)
    return uri
def create(self):
    """
    Start the image download and assemble every book in ``self.book_list``
    into one epub built by ``Book``.

    Each page's content is written into the HTML pool before it is registered
    with the epub. Images listed in ``self.book["image_list"]`` are added
    afterwards, followed by metadata and stylesheets.

    :return: None
    """
    self.image_container.set_save_path(Path.image_pool_path)
    self.image_container.start_download()

    book_title = "_".join([item.property.epub.title for item in self.book_list]).strip()

    Path.chdir(Path.base_path + u"/知乎电子书临时资源库/")
    epub = Book(book_title, 27149527)
    html_dir = Path.html_pool_path + "/"
    image_dir = Path.image_pool_path + "/"

    def dump_page(page):
        # Write the page content into the HTML pool and hand back the file path.
        target = html_dir + page.filename
        with open(target, "w") as handle:
            handle.write(page.content)
        return target

    for item in self.book_list:
        # The first page of a book goes through createChapter, the rest through addHtml.
        opening = item.page_list[0]
        epub.createChapter(dump_page(opening), ExtraTools.get_time(), opening.title)
        for follow_up in item.page_list[1:]:
            epub.addHtml(dump_page(follow_up), follow_up.title)

    for picture in self.book["image_list"]:
        epub.addImg(image_dir + picture["filename"])

    epub.addLanguage("zh-cn")
    epub.addCreator("ZhihuHelp1.7.0")
    epub.addDesc(u"该电子书由知乎助手生成,知乎助手是姚泽源为知友制作的仅供个人使用的简易电子书制作工具,源代码遵循WTFPL,希望大家能认真领会该协议的真谛,为飞面事业做出自己的贡献 XD")
    epub.addRight("CC")
    epub.addPublisher("ZhihuHelp")

    Debug.logger.debug(u"当前目录为")
    Path.pwd()

    for stylesheet in (u"/epubResource/markdown.css", u"/epubResource/front.css"):
        epub.addCss(Path.base_path + stylesheet)
    epub.buildingEpub()
def login(self, account, password, captcha=''): content = Http.get_content('https://www.zhihu.com/') xsrf = Match.xsrf(content) if not xsrf: Debug.logger.info(u'登陆失败') Debug.logger.info(u'敲击回车重新发送登陆请求') return False xsrf = xsrf.split('=')[1] # add xsrf as cookie into cookieJar, cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com') self.cookieJar.set_cookie(cookie) if captcha: post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True, 'captcha': captcha} else: post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True} header = { 'Accept': '*/*', 'Accept-Encoding': 'gzip,deflate', # 主要属性,只要有此项知乎即认为来源非脚本 'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4', 'Host': 'www.zhihu.com', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36', 'Connection': 'keep-alive', 'X-Requested-With': 'XMLHttpRequest', 'Origin': 'https://www.zhihu.com', 'Referer': 'https://www.zhihu.com/', } result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header) if not result: Debug.logger.info(u'登陆失败,请敲击回车重新登陆') return False response = json.loads(result) if response['r'] == 0: print u'登陆成功!' 
print u'登陆账号:', account print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认' if raw_input() == 'yes': Config.account, Config.password, Config.remember_account = account, password, True print u'帐号密码已保存,可通过修改config.json修改设置' else: Config.account, Config.password, Config.remember_account = '', '', False print u'跳过保存环节,进入下一流程' Config._save() cookie = self.get_cookie() DB.execute('delete from LoginRecord') # 登陆成功后清除数据库中原有的登录记录,避免下次登陆时取到旧记录 data = {} data['account'] = account data['password'] = password data['recordDate'] = ExtraTools.get_today() data['cookieStr'] = cookie DB.save(data, 'LoginRecord') DB.commit() return True else: print u'登陆失败' Debug.print_dict(response) return False
def get_random_html_file_name(self):
    """
    Generate a random file name for an HTML page.

    :return: ``ExtraTools.md5`` of a fresh uuid4 string, followed by '.xhtml'
    """
    random_token = str(uuid.uuid4())
    return ExtraTools.md5(random_token) + '.xhtml'
def create_filename(self, href):
    """
    Build the file name for ``href`` from its md5 digest.

    :param href: value passed to ``ExtraTools.md5``
    :return: ``ExtraTools.md5(href)`` followed by '.jpg'
    """
    return ExtraTools.md5(href) + '.jpg'
def set_info(self, info):
    """
    Merge ``info`` into ``self.info`` and derive the epub title and id from
    ``self.kind``.

    Question, answer and article kinds get a title built from
    ``ExtraTools.get_time()`` (also used as the id), and that title is copied
    into ``self.info['title']``. Topic, collection, author and column kinds
    take their title and id from the matching keys of ``info``.

    :param info: dict of properties for this kind
    :return: None
    """
    self.info.update(info)
    kind = self.kind
    epub = self.property.epub

    # Kinds without an id of their own: title template only.
    timestamped = (
        (Type.question, '知乎问题集锦({})'),
        (Type.answer, '知乎回答集锦({})'),
        (Type.article, '知乎专栏文章集锦({})'),
    )
    for candidate, template in timestamped:
        if kind == candidate:
            epub.title = template.format(ExtraTools.get_time())
            epub.id = ExtraTools.get_time()
            self.info['title'] = epub.title

    # Kinds described by info: (kind, title template, name key, id key).
    described = (
        (Type.topic, '话题_{}({})', 'title', 'topic_id'),
        (Type.collection, '收藏夹_{}({})', 'title', 'collection_id'),
        (Type.author, '作者_{}({})', 'name', 'author_id'),
        (Type.column, '专栏_{}({})', 'name', 'column_id'),
    )
    for candidate, template, name_key, id_key in described:
        if kind == candidate:
            epub.title = template.format(info[name_key], info[id_key])
            epub.id = info[id_key]
def generate_question_page(self, question):
    """
    Render a question and all of its answers into a page file inside the
    HTML pool.

    Every answer in ``question.answer_list`` is formatted with
    ``Template.answer``; the concatenated result is wrapped in
    ``Template.question`` together with the question title and detail.

    :param question: question to render
    :type question: src.container.task_result.Question
    :return: path of the written file (``Path.html_pool_path`` + '/' + random name)
    """
    # Render the answers first; join once instead of growing a string in a loop.
    answer_content = u''.join(
        Template.answer.format(
            **{
                'author_avatar_url': answer.author_avatar_url,
                'author_name': answer.author_name,
                'author_id': answer.author_id,
                'author_headline': answer.author_headline,
                'content': answer.content,
                'comment_count': answer.comment_count,
                'voteup_count': answer.voteup_count,
                'updated_time': ExtraTools.format_date(u'%Y-%m-%d %H:%M:%S', answer.updated_time),
            }
        )
        for answer in question.answer_list
    )
    filename = self.get_random_html_file_name()
    content = Template.question.format(
        **{
            'title': question.question_info.title,
            'description': question.question_info.detail,
            'answer': answer_content,
        }
    )
    uri = Path.html_pool_path + '/' + filename
    # Context manager so the handle is closed even if write() raises.
    with open(uri, 'w') as buf_file:
        buf_file.write(content)
    return uri
def create_single_html_book(self):
    """
    Export every book in ``self.book_list`` as one standalone HTML page.

    A directory named after the combined title is recreated under
    ``Path.result_path``; the page bodies are joined with ``<hr />``, inserted
    into the single_html template and written to ``<title>.html`` next to
    copies of the images and stylesheets.

    :return: None
    """
    title = '_'.join([book.epub.title for book in self.book_list])
    title = title.strip()[:128]  # stay clear of the Windows file-name length limit
    title = ExtraTools.fix_filename(title)  # strip special characters
    if not title:
        # With an empty title the rmdir below would become Path.rmdir(u'./'),
        # i.e. it would target the result directory itself. Skip instead.
        return

    Path.reset_path()
    Path.chdir(Path.result_path)
    book_dir = u'./' + title
    Path.rmdir(book_dir)
    Path.mkdir(book_dir)
    Path.chdir(book_dir)

    pages = []
    for book in self.book_list:
        pages.extend(book.page_list)
    # The pooled pages reference '../images/'; here the images sit beside the page.
    content = ' \r\n<hr /> \r\n '.join(
        [Match.html_body(x.content) for x in pages]
    ).replace('../images/', './images/')

    with open(Path.base_path + '/src/template/content/single_html.html') as html:
        template = html.read().format(title=title, content=content)
    with open(title + u'.html', 'w') as html:
        html.write(template)

    shutil.copytree(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), './images')
    shutil.copy(Path.www_css + '/front.css', './front.css')
    shutil.copy(Path.www_css + '/markdown.css', './markdown.css')
    Path.reset_path()
    return
def generate_article_page(self, article):
    """
    Render one article into a page file inside the HTML pool.

    The article is formatted with ``Template.answer`` and the result is wrapped
    in ``Template.question`` with an empty description.

    :param article: article to render
    :type article: src.container.data.article.Article
    :return: path of the written file (``Path.html_pool_path`` + '/' + random name)
    """
    answer_content = Template.answer.format(
        **{
            'author_avatar_url': article.author_avatar_url,
            'author_name': article.author_name,
            'author_id': article.author_id,
            'author_headline': article.author_headline,
            'content': article.content,
            'comment_count': article.comment_count,
            'voteup_count': article.voteup_count,
            'updated_time': ExtraTools.format_date(u'%Y-%m-%d %H:%M:%S', article.updated_time),
        }
    )
    filename = self.get_random_html_file_name()
    content = Template.question.format(
        **{
            'title': article.title,
            'description': '',
            'answer': answer_content,
        }
    )
    uri = Path.html_pool_path + '/' + filename
    # Context manager so the handle is closed even if write() raises.
    with open(uri, 'w') as buf_file:
        buf_file.write(content)
    return uri
def create(self):
    """
    Start the image download and assemble every book in ``self.book_list``
    into one epub built by ``Book``.

    Nothing is built when the sanitized title is empty. Each page's content is
    written into the HTML pool before it is registered with the epub; images
    from ``self.book.image_list`` are added afterwards, followed by metadata
    and stylesheets.

    :return: None
    """
    self.image_container.set_save_path(Path.image_pool_path)
    self.image_container.start_download()

    raw_title = '_'.join([item.epub.title for item in self.book_list])
    # Truncate to stay clear of the Windows file-name length limit, then strip
    # special characters.
    book_title = ExtraTools.fix_filename(raw_title.strip()[:128])
    if not book_title:
        # Skip when the title came out empty; the original author warns that
        # carrying on would end in an 'rm -rf /'-style disaster.
        return

    Path.chdir(Path.base_path + u'/知乎电子书临时资源库/')
    epub = Book(book_title, 27149527)
    html_dir = Path.html_pool_path + '/'
    image_dir = Path.image_pool_path + '/'

    def dump_page(page):
        # Write the page content into the HTML pool and hand back the file path.
        target = html_dir + page.filename
        with open(target, 'w') as handle:
            handle.write(page.content)
        return target

    for item in self.book_list:
        # The first page of a book goes through addInfoPage, the rest through addHtml.
        opening = item.page_list[0]
        epub.addInfoPage(dump_page(opening), opening.title)
        for follow_up in item.page_list[1:]:
            epub.addHtml(dump_page(follow_up), follow_up.title)

    for picture in self.book.image_list:
        epub.addImg(image_dir + picture['filename'])

    epub.addLanguage('zh-cn')
    epub.addCreator('ZhihuHelp1.7.0')
    epub.addDesc(u'该电子书由知乎助手生成,知乎助手是姚泽源为知友制作的仅供个人使用的简易电子书制作工具,源代码遵循WTFPL,希望大家能认真领会该协议的真谛,为飞面事业做出自己的贡献 XD')
    epub.addRight('CC')
    epub.addPublisher('ZhihuHelp')

    for stylesheet in (u'/www/css/markdown.css', u'/www/css/front.css'):
        epub.addCss(Path.base_path + stylesheet)
    epub.buildingEpub()
    Path.reset_path()
def generate_question_page(self, question):
    """
    Render a question and all of its answers into a page file inside the
    HTML pool.

    Every answer in ``question.answer_list`` is formatted with
    ``Template.answer``; the concatenated result is wrapped in
    ``Template.question`` together with the question title and detail.

    :param question: question to render
    :type question: src.container.task_result.Question
    :return: path of the written file (``Path.html_pool_path`` + '/' + random name)
    """
    # NOTE(review): 'Y-m-d H:i:s' is a PHP-style pattern -- confirm that
    # ExtraTools.format_date expects this rather than a strftime pattern.
    # Render the answers first; join once instead of growing a string in a loop.
    answer_content = u''.join(
        Template.answer.format(
            **{
                'author_avatar_url': answer.author_avatar_url,
                'author_name': answer.author_name,
                'author_id': answer.author_id,
                'author_headline': answer.author_headline,
                'content': answer.content,
                'comment_count': answer.comment_count,
                'voteup_count': answer.voteup_count,
                'updated_time': ExtraTools.format_date('Y-m-d H:i:s', answer.updated_time),
            }
        )
        for answer in question.answer_list
    )
    filename = self.get_random_html_file_name()
    content = Template.question.format(
        **{
            'title': question.question_info.title,
            'description': question.question_info.detail,
            'answer': answer_content,
        }
    )
    uri = Path.html_pool_path + '/' + filename
    # Context manager so the handle is closed even if write() raises.
    with open(uri, 'w') as buf_file:
        buf_file.write(content)
    return uri
def login(self, account, password, captcha=''): content = Http.get_content('https://www.zhihu.com/') xsrf = Match.xsrf(content) if not xsrf: Debug.logger.info(u'登陆失败') Debug.logger.info(u'敲击回车重新发送登陆请求') return False xsrf = xsrf.split('=')[1] # add xsrf as cookie into cookieJar, cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com') self.cookieJar.set_cookie(cookie) if captcha: post_data = { '_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True, 'captcha': captcha } else: post_data = { '_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True } header = { 'Accept': '*/*', 'Accept-Encoding': 'gzip,deflate', # 主要属性,只要有此项知乎即认为来源非脚本 'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4', 'Host': 'www.zhihu.com', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36', 'Connection': 'keep-alive', 'X-Requested-With': 'XMLHttpRequest', 'Origin': 'https://www.zhihu.com', 'Referer': 'https://www.zhihu.com/', } result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header) if not result: Debug.logger.info(u'登陆失败,请敲击回车重新登陆') return False response = json.loads(result) if response['r'] == 0: print u'登陆成功!' 
print u'登陆账号:', account print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认' if raw_input() == 'yes': Config.account, Config.password, Config.remember_account = account, password, True print u'帐号密码已保存,可通过修改config.json修改设置' else: Config.account, Config.password, Config.remember_account = '', '', False print u'跳过保存环节,进入下一流程' Config._save() cookie = self.get_cookie() DB.execute( 'delete from LoginRecord') # 登陆成功后清除数据库中原有的登录记录,避免下次登陆时取到旧记录 data = {} data['account'] = account data['password'] = password data['recordDate'] = ExtraTools.get_today() data['cookieStr'] = cookie DB.save(data, 'LoginRecord') DB.commit() return True else: print u'登陆失败' Debug.print_dict(response) return False
def create_filename(self, href):
    """
    Build the file name for ``href`` from its md5 digest.

    :param href: value passed to ``ExtraTools.md5``
    :return: ``ExtraTools.md5(href)`` followed by '.jpg'
    """
    digest = ExtraTools.md5(href)
    return digest + '.jpg'
def parse_article_id(self):
    """
    Derive the article id from the title and store it in ``self.info``.

    Sets ``self.info['article_id']`` to ``ExtraTools.md5(self.info['title'])``.

    :return: None
    """
    # Imported at call time, as in the original (presumably to avoid an import
    # cycle -- confirm before moving it to module level).
    from src.tools.extra_tools import ExtraTools

    self.info['article_id'] = ExtraTools.md5(self.info['title'])
def parse_date(date="1357-08-12"):
    """
    Turn a date label into a date value.

    Labels containing u"昨天" resolve to yesterday and labels containing
    u"今天" to today. Anything else is handed to ``ParserTools.match_content``
    with a ``YYYY-MM-DD`` pattern and ``date`` itself as third argument
    (presumably the fallback when nothing matches -- confirm against
    ParserTools).

    :param date: raw date text
    :return: whatever ExtraTools.get_yesterday / ExtraTools.get_today /
        ParserTools.match_content returns for the matching case
    """
    relative_days = (
        (u"昨天", ExtraTools.get_yesterday),
        (u"今天", ExtraTools.get_today),
    )
    for marker, resolve in relative_days:
        if marker in date:
            return resolve()
    # Original author's note on the placeholder default: it follows the mnemonic
    # for the months that always have 31 days (1, 3, 5, 7, 8, 10, 12).
    return ParserTools.match_content(r"\d{4}-\d{2}-\d{2}", date, date)