def download_img(self):
    u"""Queue every image referenced by this article for download and
    rewrite the article content, the cover url and the author avatar url
    so they point at the locally stored file names.

    After the downloads finish, the size of every fetched file is
    accumulated into ``self.total_img_size_kb``.
    """
    from src.container.image_container import ImageContainer
    container = ImageContainer()
    self.img_filename_list = []
    # Swap each inline <img> element for one referencing the local copy.
    for element, remote_src in Match.match_img_with_src_dict(self.content).items():
        local_name = container.add(remote_src)
        self.img_filename_list.append(local_name)
        self.content = self.content.replace(
            element, Match.create_img_element_with_file_name(local_name))
    # Article cover image.
    cover_name = container.add(self.image_url)
    self.img_filename_list.append(cover_name)
    self.image_url = Match.create_local_img_src(cover_name)
    # Author avatar.
    avatar_name = container.add(self.author_avatar_url)
    self.img_filename_list.append(avatar_name)
    self.author_avatar_url = Match.create_local_img_src(avatar_name)
    container.start_download()
    # Every file exists on disk now, so the sizes can be summed.
    for local_name in self.img_filename_list:
        self.total_img_size_kb += Path.get_img_size_by_filename_kb(local_name)
    return
def download_img(self):
    u"""Download the images of an answer and localise their references.

    Rewrites every <img> in ``self.content`` to point at the downloaded
    copy, also fetches the answer author's avatar, then accumulates the
    size of every fetched file into ``self.total_img_size_kb``.
    """
    from src.container.image_container import ImageContainer
    img_container = ImageContainer()
    img_src_dict = Match.match_img_with_src_dict(self.content)
    self.img_filename_list = []
    for img in img_src_dict:
        src = img_src_dict[img]
        filename = img_container.add(src)
        self.img_filename_list.append(filename)
        self.content = self.content.replace(
            img, Match.create_img_element_with_file_name(filename))
    # The answer author's avatar must be downloaded as well
    filename = img_container.add(self.author_avatar_url)
    self.img_filename_list.append(filename)
    self.author_avatar_url = Match.create_local_img_src(filename)
    img_container.start_download()
    # After downloading completes, update the total image size
    for filename in self.img_filename_list:
        self.total_img_size_kb += Path.get_img_size_by_filename_kb(
            filename)
    # NOTE(review): leftover debug print — consider removing.
    print self.total_img_size_kb
    return
def main():
    u"""Command line entry point: parse argv with getopt and dispatch.

    Supported options: -V/--version, -d/--debug, -h/--help, -g/--gui
    (stubbed), -l/--login and -u/--url.
    NOTE(review): ``for option, args in opts`` rebinds ``args`` (the
    positional list returned by getopt) to each option's value string.
    """
    debug = False

    def version():
        # Print the program version through the project logger.
        log.info_log('version %s' % __version__)

    try:
        opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError as err:
        log.error_log(u"Try ee-book --help for more options")
        sys.exit(2)
    for option, args in opts:
        if option in ('-V', '--version'):
            version()
            sys.exit()
        elif option in ('-d', '--debug'):
            print u"Debug mode..."
            debug = True
        elif option in ('-h', '--help'):
            version()
            print(help_info)
            sys.exit()
        elif option in ('-g', '--gui'):
            print(u"Under developing...")
            sys.exit()
            # graphviz = GraphvizOutput(output_file='filter_gui.png')
            # with PyCallGraph(output=graphviz, config=config):
            # from PyQt4.QtGui import QApplication
            # from PyQt4.QtGui import QIcon
            # from src.gui.ui import MainWindow
            # from src.resources import qrc_resources
            # app = QApplication(sys.argv)
            # app.setWindowIcon(QIcon(":/icon.png"))
            # app.setApplicationName('EE-Book')
            # window = MainWindow()
            # window.show()
            # sys.exit(app.exec_())
        elif option in ('-l', '--login'):
            url = args
            try:
                recipe_kind = Match.get_url_kind(url)
            except UnsupportTypeException, e:
                print e
                print u"Please try again."
                sys.exit()
            zhihu = EEBook(
                recipe_kind=recipe_kind
            )  # Init path, e.g. config, only zhihu are supported now
            login = Login(recipe_kind=recipe_kind)
            login.start()
            sys.exit()
        elif option in ('-u', '--url'):
            url = args
            try:
                recipe_kind = Match.get_website_kind(url)
            except UnsupportTypeException, e:
                print e
                print u"Please check url."
                sys.exit()
def get_article_info(self):
    u"""Scrape one article page into a flat info dict.

    Reads title, body (with centered info paragraphs stripped), update
    time and author name from ``self.dom``; most author fields are
    hard-coded placeholders.

    :return: dict of article fields, or [] when any scrape step fails
    """
    data = {}
    try:
        try:
            title_tationl = self.dom.find_all('h1', class_="article-title")
            # print u"标题 {}".format(span_dom.text.strip()),
            resultstr = title_tationl[0].text
            if resultstr.__contains__('/'):
                # '/' is illegal in file names; sanitise the title.
                resultstr = Match.replace_specile_chars(resultstr)
                data['title'] = resultstr
            data['title'] = resultstr.strip()
        except IndexError:
            # No <h1> title found — fall back to the page <title>.
            data['title'] = Match.replace_specile_chars(self.dom.title)
            data['title'] = str(data['title']).strip()
        article_body = ""
        content = self.dom.find_all('article', class_="article-content")[0]
        article_body += str(content)
        # Remove the centered meta paragraphs from the body.
        strOfinfos = self.dom.find_all('p', style="text-align: center;")
        for x in strOfinfos:
            article_body = article_body.replace(str(x), '', 1)
        data['content'] = str(article_body)
        time_tationl = self.dom.find_all('div', class_="article-meta")[0]
        data['updated_time'] = time_tationl.find_all('span', class_="item")[0].text
        # print data['updated_time']
        data['voteup_count'] = ""
        data['comment_count'] = ""
        data['image_url'] = ''
        # Placeholder author identity — not scraped from the page.
        data['author_id'] = 'meng-qing-xue-81'
        tempName = time_tationl.find_all('span', class_="item")[1].text
        # The author span reads "label:name"; keep the part after ':'.
        data['author_name'] = (str(tempName).split(':'))[-1]
        data['author_headline'] = ''
        data[
            'author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
        data['author_gender'] = '0'
    except Exception as e:
        print e.message
        return []
    return data
def download_img(self):
    u"""Download the images of an article, handling duokan image blocks.

    A pre-pass rewrites each ``duokan-image-single`` block to a plain
    <img> element; the main pass then localises every image reference,
    the cover and the author avatar, and sums the downloaded sizes.
    """
    from src.container.image_container import ImageContainer
    if str(self.content).__contains__('<div class="duokan-image-single">'):
        # print img_src_dict
        xtep = str(self.content)
        xxsoup = BeautifulSoup(xtep, 'lxml')
        list_tiezhi_tit = xxsoup.find_all('div', class_="duokan-image-single")
        for x in list_tiezhi_tit:
            list_pcyc_li = x.find_all('img')
            for li in list_pcyc_li:
                # print li
                src = li.get('src')
                st = str(src).split('/images/')[-1]
                # NOTE(review): hard-coded absolute user path — this only
                # works on the original author's machine; should be a
                # configurable images directory.
                newT = u'<img class="ke_img" src="file:///Users/ink/Desktop/images/{}" />'.format(st)
                xtep = xtep.replace(str(x), newT, 1)
        self.content = xtep
        # print xtep
    img_container = ImageContainer()
    img_src_dict = Match.match_img_with_src_dict(self.content)
    self.img_filename_list = []
    for img in img_src_dict:
        src = img_src_dict[img]
        filename = img_container.add(src)
        # print 'src:' + src + ' and filename ' + filename
        self.img_filename_list.append(filename)
        # Avatars get a dedicated element template.
        if str(img).__contains__(u"class=\"avatar\""):
            self.content = self.content.replace(img, Match.avatar_create_img_element_with_file_name(filename))
        else:
            self.content = self.content.replace(img, Match.create_img_element_with_file_name(filename))
    # Download the article cover image
    filename = img_container.add(self.image_url)
    self.img_filename_list.append(filename)
    self.image_url = Match.create_local_img_src(filename)
    # Download the author avatar
    filename = img_container.add(self.author_avatar_url)
    self.img_filename_list.append(filename)
    self.author_avatar_url = Match.create_local_img_src(filename)
    img_container.start_download()
    # After downloading completes, update the total image size
    for filename in self.img_filename_list:
        self.total_img_size_kb += Path.get_img_size_by_filename_kb(filename)
    return
def main():
    u"""Command line entry point (duplicate of the other ``main``): parse
    argv with getopt and dispatch on the options.

    Supported options: -V/--version, -d/--debug, -h/--help, -g/--gui
    (stubbed), -l/--login and -u/--url.
    NOTE(review): ``for option, args in opts`` rebinds ``args`` to each
    option's value string.
    """
    debug = False

    def version():
        # Print the program version through the project logger.
        log.info_log('version %s' % __version__)

    try:
        opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError as err:
        log.error_log(u"Try ee-book --help for more options")
        sys.exit(2)
    for option, args in opts:
        if option in ('-V', '--version'):
            version()
            sys.exit()
        elif option in ('-d', '--debug'):
            print u"Debug mode..."
            debug = True
        elif option in ('-h', '--help'):
            version()
            print(help_info)
            sys.exit()
        elif option in ('-g', '--gui'):
            print(u"Under developing...")
            sys.exit()
            # graphviz = GraphvizOutput(output_file='filter_gui.png')
            # with PyCallGraph(output=graphviz, config=config):
            # from PyQt4.QtGui import QApplication
            # from PyQt4.QtGui import QIcon
            # from src.gui.ui import MainWindow
            # from src.resources import qrc_resources
            # app = QApplication(sys.argv)
            # app.setWindowIcon(QIcon(":/icon.png"))
            # app.setApplicationName('EE-Book')
            # window = MainWindow()
            # window.show()
            # sys.exit(app.exec_())
        elif option in ('-l', '--login'):
            url = args
            try:
                recipe_kind = Match.get_url_kind(url)
            except UnsupportTypeException, e:
                print e
                print u"Please try again."
                sys.exit()
            zhihu = EEBook(recipe_kind=recipe_kind)  # Init path, e.g. config, only zhihu are supported now
            login = Login(recipe_kind=recipe_kind)
            login.start()
            sys.exit()
        elif option in ('-u', '--url'):
            url = args
            try:
                recipe_kind = Match.get_website_kind(url)
            except UnsupportTypeException, e:
                print e
                print u"Please check url."
                sys.exit()
def get_article_info(self):
    u"""Scrape a WordPress-style article page into a flat info dict.

    Reads title (``h1.entry-title``), body (``div.entry-content``) and
    the "author • date" meta line; most author fields are hard-coded
    placeholders.

    :return: dict of article fields, or [] when any scrape step fails
    """
    data = {}
    try:
        try:
            title_tationl = self.dom.find_all('h1', class_="entry-title")
            # print u"标题 {}".format(span_dom.text.strip()),
            resultstr = title_tationl[0].text
            if resultstr.__contains__('/'):
                # '/' is illegal in file names; sanitise the title.
                resultstr = Match.replace_specile_chars(resultstr)
                data['title'] = resultstr
            data['title'] = resultstr.strip()
        except IndexError:
            # No entry-title found — fall back to the page <title>.
            data['title'] = Match.replace_specile_chars(self.dom.title)
            data['title'] = str(data['title']).strip()
        article_body = ""
        content = self.dom.find_all('div', class_="entry-content")[0]
        article_body += str(content)
        data['content'] = str(article_body)
        time_tationl = self.dom.find_all('div', class_="post-meta")[0]
        ttd = str(time_tationl.text)
        # The meta line reads "author•YYYY年MM月DD日…"; parse the date part.
        date_time = datetime.datetime.strptime(str(ttd.split('•')[1]).strip(), '%Y年%m月%d日')
        print '转化后时间'
        print date_time.strftime('%Y-%m-%d')
        data['updated_time'] = date_time.strftime('%Y-%m-%d')
        data['voteup_count'] = ""
        data['comment_count'] = ""
        data['image_url'] = ''
        # Placeholder author identity — not scraped from the page.
        data['author_id'] = 'meng-qing-xue-81'
        data['author_name'] = ttd.split('•')[0]
        data['author_headline'] = ''
        data['author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
        data['author_gender'] = '0'
    except Exception as e:
        print e.message
        return []
    return data
def get_article_info(self):
    u"""Scrape a CNBC clip page into a flat info dict.

    Reads the title from the hero header, the body via
    ``self.parse_answer_content()`` and the timestamp (with its last
    four characters dropped); author fields are hard-coded.

    :return: dict of article fields, or [] when any scrape step fails
    """
    data = {}
    try:
        try:
            title_tationl = self.dom.find_all(
                'h2', class_="Buffett-clipVideoHeroHeaderTitle")
            # print u"标题 {}".format(span_dom.text.strip()),
            resultstr = title_tationl[0].text
            if resultstr.__contains__('/'):
                # '/' is illegal in file names; sanitise the title.
                resultstr = Match.replace_specile_chars(resultstr)
            data['title'] = resultstr.strip()
        except IndexError:
            # No hero title found — fall back to the page <title>.
            data['title'] = Match.replace_specile_chars(self.dom.title)
            data['title'] = str(data['title']).strip()
        data['content'] = str(self.parse_answer_content())
        time_tationl = self.dom.find_all(
            'div', class_="Buffett-clipVideoHeroHeaderTimestamp")
        tt = time_tationl[0].text
        # Drop the trailing four characters of the timestamp text
        # (presumably a timezone/suffix — TODO confirm against a page).
        print tt[:-4]
        # print sp[0].text + ' '+sp[1].text+' '+sp[2].text
        data['updated_time'] = tt[:-4]
        # print data['updated_time']
        data['voteup_count'] = ""
        data['comment_count'] = ""
        data['image_url'] = ''
        # Placeholder author identity — not scraped from the page.
        data['author_id'] = 'meng-qing-xue-81'
        data['author_name'] = 'CNBC'
        data['author_headline'] = ''
        data[
            'author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
        data['author_gender'] = '0'
    except Exception as e:
        print e.message
        return []
    return data
def create_work_set(self, target_url):
    u"""Collect every article url of a jianshu author into work_set.

    Derives the author id from ``target_url`` (e.g.
    http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles), parses
    the first "latest_articles" page, then walks the remaining pages
    (nine article links per page) and records each article url.

    :param target_url: the author's latest_articles page url
    :return: None
    """
    if target_url in self.task_complete_set:
        return
    jianshu_id = Match.jianshu_author(target_url).group('jianshu_id')
    article_num, article_list = self.get_jianshu_question_list(target_url)
    self.task_complete_set.add(target_url)
    # Nine links per page — round the page count up.
    page_num = article_num / 9
    if article_num % 9 != 0:
        page_num += 1
    for item in article_list:
        self.work_set.add(item)
    # Page one was parsed above, so fetching starts from page two.
    for page_no in range(2, page_num + 1):
        url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page_no)
        page_content = Http.get_content(url)
        for item in self.parse_get_article_list(page_content):
            self.work_set.add(item)
    return
def create_work_set(self, target_url):
    u"""Record zhihu column metadata and queue its paged post-list urls.

    Fetches the column's JSON description, stores a normalised info
    dict in ``self.info_list`` and adds one API url per page of ten
    posts to ``self.work_set``.
    """
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    self.column_id = result.group('column_id')
    content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)
    info = {}
    info['creator_id'] = raw_info['creator']['slug']
    info['creator_hash'] = raw_info['creator']['hash']
    info['creator_sign'] = raw_info['creator']['bio']
    info['creator_name'] = raw_info['creator']['name']
    # Avatar urls are templates with {id}/_{size} placeholders.
    info['creator_logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['creator']['avatar'][
        'id']).replace('_{size}', '')
    info['column_id'] = raw_info['slug']
    info['name'] = raw_info['name']
    # Column logo: creator's template filled with the column avatar id.
    info['logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['avatar']['id']).replace(
        '_{size}', '')
    info['article'] = raw_info['postsCount']
    info['follower'] = raw_info['followersCount']
    info['description'] = raw_info['description']
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    # Posts are served ten at a time; queue one offset per page
    # (Python 2 integer division).
    detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(self.column_id)
    for i in range(info['article'] / 10 + 1):
        self.work_set.add(detect_url + str(i * 10))
    return
def get_column_info(self): data = {} headcontent = self.dom.find_all('div', class_="header")[0] img_src_dict = Match.match_img_with_src_dict(str(headcontent)) for img in img_src_dict: src = img_src_dict[img] print src data['image_url'] = src reInfo = BeautifulSoup(str(headcontent), 'html.parser') links = reInfo.findAll('a') article_u = (links[1]).text description = self.dom.find_all('span', class_="f12 gray")[0] data[u'title'] = article_u # data['image_url'] = '' data['article_count'] = 0 data['follower_count'] = 0 data['description'] = '' return data
def login(self, account, password, captcha=''):
    u"""Log into zhihu with an email account and password.

    Fetches the ``_xsrf`` token from the home page, installs it as a
    cookie, posts the credentials (plus optional captcha) to
    /login/email, optionally saves the account to the config, and on
    success records the session cookie in the LoginRecord table.

    :param account: zhihu email account
    :param password: account password
    :param captcha: captcha text, if the server demanded one
    :return: True on success, False otherwise
    """
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    # The token arrives as "name=value"; keep only the value.
    xsrf = xsrf.split('=')[1]
    # add xsrf as cookie into cookieJar,
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    if captcha:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True, 'captcha': captcha}
    else:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',  # Key attribute: with this present zhihu treats the request as non-script traffic
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    response = json.loads(result)
    # response['r'] == 0 signals a successful login.
    if response['r'] == 0:
        print u'登陆成功!'
        print u'登陆账号:', account
        print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
        if raw_input() == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'帐号密码已保存,可通过修改config.json修改设置'
        else:
            Config.account, Config.password, Config.remember_account = '', '', False
            print u'跳过保存环节,进入下一流程'
        Config._save()
        cookie = self.get_cookie()
        DB.execute('delete from LoginRecord')  # After a successful login, clear the old login records so a stale row is not picked up next time
        data = {}
        data['account'] = account
        data['password'] = password
        data['recordDate'] = ExtraTools.get_today()
        data['cookieStr'] = cookie
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'登陆失败'
        Debug.print_dict(response)
        return False
def fix_image(self, content):
    u"""Normalise every <img> element in *content* and localise its src.

    Each image is closed with an explicit </img>, its remote src is
    queued on ``self.image_container`` and rewritten to
    ``../images/<filename>``, and the element is wrapped in a
    duokan-image-single div.

    :param content: html string to fix
    :return: the rewritten html string
    """
    content = Match.fix_html(content)
    for img in re.findall(r'<img[^>]*', content):  # fix img
        # Drop a self-closing slash so the tag can take a </img> close.
        if img[-1] == '/':
            img = img[:-1]
        img += '>'
        src = re.search(r'(?<=src=").*?(?=")', img)
        if not src:
            # No src attribute — just close the tag and move on.
            new_image = img + '</img>'
            content = content.replace(img, new_image)
            continue
        else:
            src = src.group(0)
            if src.replace(' ', '') == '':
                # Blank src — same treatment as a missing one.
                new_image = img + '</img>'
                content = content.replace(img, new_image)
                continue
        src_download = HtmlCreator.fix_image_src(src)
        if src_download:
            filename = self.image_container.add(src_download)
        else:
            filename = ''
        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        # zhihu's lazy-load placeholder also has to point at the file.
        new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg', '../images/{}'.format(filename))
        new_image += '</img>'
        content = content.replace(img, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def create_work_set(self, target_url):
    u"""Record zhihu column metadata and queue its paged post-list urls.

    Fetches the column's JSON description, appends a normalised info
    dict to ``self.info_list`` and adds one posts-API url per page of
    ten posts to ``self.work_set``.
    """
    if target_url in self.task_complete_set:
        return
    column_id = Match.column(target_url).group('column_id')
    content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + column_id)
    if not content:
        return
    raw_info = json.loads(content)
    creator = raw_info['creator']
    # Avatar urls are templates with {id}/_{size} placeholders.
    avatar_template = creator['avatar']['template']
    info = {
        'creator_id': creator['slug'],
        'creator_hash': creator['hash'],
        'creator_sign': creator['bio'],
        'creator_name': creator['name'],
        'creator_logo': avatar_template.replace('{id}', creator['avatar']['id']).replace('_{size}', ''),
        'column_id': raw_info['slug'],
        'name': raw_info['name'],
        # Column logo: creator template filled with the column avatar id.
        'logo': avatar_template.replace('{id}', raw_info['avatar']['id']).replace('_{size}', ''),
        'article': raw_info['postsCount'],
        'follower': raw_info['followersCount'],
        'description': raw_info['description'],
    }
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    # Posts are served ten at a time; queue one offset per page
    # (Python 2 integer division).
    detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(column_id)
    for offset in range(0, (info['article'] / 10 + 1) * 10, 10):
        self.work_set.add(detect_url + str(offset))
    return
def download(self, index): image = self.container[index] filename = image['filename'] href = image['href'] # filename=href.split('/')[-1] if os.path.isfile(self.save_path + '/' + filename): return print 'Downloading picture:' + href + ' filename ' + filename # urllib.urlretrieve(href, self.save_path + '/' + filename, cbk) if len(str(href)) < 300 and Match.isUrlOk(href): # Debug.print_in_single_line(u'Downloading picture: {}'.format(href)) rely_url = str(href).split('@')[0] content = Http.get_content(url=rely_url, timeout=Config.timeout_download_picture) else: Debug.print_in_single_line(u"Href of the Picture seems wrong...") content = None if not content: return with open(self.save_path + '/' + filename, 'wb') as image: image.write(content) return
def create_work_set(self, target_url):
    u"""Record zhihu column metadata and queue its paged post-list urls.

    Fetches the column's JSON description, stores a normalised info
    dict in ``self.info_list`` and adds one API url per page of ten
    posts to ``self.work_set``.
    """
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    self.column_id = result.group("column_id")
    content = Http.get_content("https://zhuanlan.zhihu.com/api/columns/" + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)
    info = {}
    info["creator_id"] = raw_info["creator"]["slug"]
    info["creator_hash"] = raw_info["creator"]["hash"]
    info["creator_sign"] = raw_info["creator"]["bio"]
    info["creator_name"] = raw_info["creator"]["name"]
    # Avatar urls are templates with {id}/_{size} placeholders.
    info["creator_logo"] = (
        raw_info["creator"]["avatar"]["template"]
        .replace("{id}", raw_info["creator"]["avatar"]["id"])
        .replace("_{size}", "")
    )
    info["column_id"] = raw_info["slug"]
    info["name"] = raw_info["name"]
    # Column logo: creator's template filled with the column avatar id.
    info["logo"] = (
        raw_info["creator"]["avatar"]["template"].replace("{id}", raw_info["avatar"]["id"]).replace("_{size}", "")
    )
    info["article"] = raw_info["postsCount"]
    info["follower"] = raw_info["followersCount"]
    info["description"] = raw_info["description"]
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    # Posts are served ten at a time; queue one offset per page
    # (Python 2 integer division).
    detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
    for i in range(info["article"] / 10 + 1):
        self.work_set.add(detect_url + str(i * 10))
    return
def create_single_html_book(self, book_package):
    u"""Render *book_package* as a single standalone html file.

    Creates a fresh result directory named after the book title, joins
    every page's html body, fills the content template, and copies the
    images directory and css files next to the output file.
    """
    title = book_package.get_title()
    if not title:
        # Skip automatically when the book title is empty,
        # otherwise the 『rm -rf /』 tragedy would happen below
        # (rmdir of u'./' + '' would target the whole directory).
        return
    Path.reset_path()
    Path.chdir(Path.result_path)
    # Recreate a clean output directory for this title.
    Path.rmdir(u'./' + title)
    Path.mkdir(u'./' + title)
    Path.chdir(u'./' + title)
    page = []
    for book in book_package.book_list:
        page += book.page_list
    # Join all page bodies and retarget image references to ./images.
    content = u' \r\n '.join([Match.html_body(x.content) for x in page]).replace(u'../images/', u'./images/')
    with open(TemplateConfig.content_base_uri) as html:
        content = html.read().format(title=title, body=content).replace(u'../style/', u'./')
    with open(title + u'.html', 'w') as html:
        html.write(content)
    # Copy the images gathered during the epub build plus the css files.
    Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
    Path.copy(Path.www_css + u'/customer.css', u'./customer.css')
    Path.copy(Path.www_css + u'/markdown.css', u'./markdown.css')
    Path.copy(Path.www_css + u'/normalize.css', u'./normalize.css')
    Path.reset_path()
    return
def fix_image(self, content):
    u"""Normalise every <img> element in *content* and localise its src.

    Each image is closed with an explicit </img>, its remote src is
    queued on ``self.image_container`` and rewritten to
    ``../images/<filename>``, and the element is wrapped in a
    duokan-image-single div.

    :param content: html string to fix
    :return: the rewritten html string
    """
    content = Match.fix_html(content)
    for img in re.findall(r'<img[^>]*', content):  # fix img
        # Drop a self-closing slash so the tag can take a </img> close.
        if img[-1] == '/':
            img = img[:-1]
        img += '>'
        src = re.search(r'(?<=src=").*?(?=")', img)
        if not src:
            # No src attribute — just close the tag and move on.
            new_image = img + '</img>'
            content = content.replace(img, new_image)
            continue
        else:
            src = src.group(0)
            if src.replace(' ', '') == '':
                # Blank src — same treatment as a missing one.
                new_image = img + '</img>'
                content = content.replace(img, new_image)
                continue
        src_download = HtmlCreator.fix_image_src(src)
        if src_download:
            filename = self.image_container.add(src_download)
        else:
            filename = ''
        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        # zhihu's lazy-load placeholder also has to point at the file.
        new_image = new_image.replace(
            '//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
            '../images/{}'.format(filename))
        new_image += '</img>'
        content = content.replace(
            img, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def create_work_set(self, target_url):
    u"""
    From the blog home page url, first obtain the blog id via a regex,
    then fill sinablog_info from the content of the "about me" page
    (that part arguably does not belong in this function and could be
    improved), and finally walk the blog's article-list pages, putting
    every post url into work_set.
    :param target_url: blog home page url
    :return:
    """
    if target_url in self.task_complete_set:
        return
    result = Match.sinablog_author(target_url)
    sinablog_author_id = int(result.group('sinablog_people_id'))
    article_num = self.get_sinablog_question_list(sinablog_author_id)
    # Round the page count up (Python 2 integer division).
    if article_num % 50 != 0:
        page_num = article_num/50 + 1  # 50 href on 1 page
    else:
        page_num = article_num / 50
    self.question_list[0]['article_num'] = article_num
    # The line above has to stay this way for now, because the
    # "about me" page does not carry the article count
    self.task_complete_set.add(target_url)
    for page in range(page_num):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(sinablog_author_id, page+1)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
def fix_image(self, content):
    u"""Normalise every <img> element in *content* and localise its src
    (sinablog variant: also rewrites sina's transparent placeholder).

    :param content: html string to fix
    :return: the rewritten html string
    """
    content = Match.fix_html(content)
    for img in re.findall(r'<img[^>]*', content):  # fix img
        # if img[-1] == '/':
        # print u"修改前,img为:" + str(img)
        # img = img[:-1]
        # print u"修改后,img为:" + str(img[:-1])
        img += '>'
        src = re.search(r'(?<=src=").*?(?=")', img)
        if not src:
            # No src attribute — just close the tag and move on.
            new_image = img + '</img>'
            content = content.replace(img, new_image)
            continue
        else:
            src = src.group(0)
            if src.replace(' ', '') == '':
                # Blank src — same treatment as a missing one.
                new_image = img + '</img>'
                content = content.replace(img, new_image)
                continue
        src_download = HtmlCreator.fix_image_src(src)
        if src_download:
            filename = self.image_container.add(src_download)
        else:
            filename = ''
        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        new_image = new_image.replace('http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',\
                                      '../images/{}'.format(filename))
        # hard-coded; could be optimised — move into the fix_html function?
        # new_image += '</img>'
        content = content.replace(
            img, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def set_info(self, info):
    u"""Store *info* and derive the epub title and id for this task kind.

    The original mixed ``elif`` with bare ``if`` statements, which only
    worked because the kinds are mutually exclusive; normalised here to
    a single if/elif ladder for consistency with the sibling set_info.
    """
    self.info.update(info)
    if self.kind == Type.question:
        self.epub.title = u'知乎问题集锦({})'.format(info['title'])
        self.epub.id = info['id']
    elif self.kind == Type.answer:
        self.epub.title = u'知乎回答集锦({})'.format(info['title'])
        self.epub.id = info['id']
    elif self.kind == Type.article:
        self.epub.title = u'知乎专栏文章集锦({})'.format(info['title'])
        self.epub.id = info['id']
    elif self.kind == Type.topic:
        self.epub.title = u'话题_{}({})'.format(info['title'], info['topic_id'])
        self.epub.id = info['topic_id']
    elif self.kind == Type.collection:
        self.epub.title = u'收藏夹_{}({})'.format(info['title'], info['collection_id'])
        self.epub.id = info['collection_id']
    elif self.kind == Type.author:
        self.epub.title = u'作者_{}({})'.format(info['name'], info['author_id'])
        self.epub.id = info['author_id']
    elif self.kind == Type.column:
        self.epub.title = u'专栏_{}({})'.format(info['name'], info['column_id'])
        self.epub.id = info['column_id']
    # Strip characters that are illegal in file names.
    self.epub.title = Match.fix_filename(self.epub.title)
    return
def main(): file_name = 'ReadList.txt' log.print_log(u'read from %s' % file_name) counter = 1 try: with open(file_name, 'r') as read_list: read_list = read_list.readlines() line = read_list[0] split_url = line.split('#')[0] recipe_kind = Match.get_website_kind(split_url) print recipe_kind counter += 1 if recipe_kind == 'Unsupport type': print('Unsupported website or url type. \nPlease check url.') sys.exit() except IOError as e: print(u"\nOops! No " + file_name + ". creating " + file_name + "...") with open(file_name, 'w') as read_list: read_list.close() sys.exit() except IndexError: if 1 == counter: print(u"\nOops! No content in " + file_name + u". Please check it out.") sys.exit() print(u"website type:" + str(recipe_kind)) game = TEEBook(recipe_kind=recipe_kind, url=None, read_list=file_name) game.begin() sys.exit()
def worker(self, target_url):
    u"""Fetch one page and append its fixed-up html to content_list.

    :param target_url: page url to scrape; urls already in
        ``self.work_complete_set`` are skipped.
    """
    if target_url in self.work_complete_set:
        # Automatically skip urls that were already fetched successfully
        return
    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    content = Http.get_content(target_url)
    if not content:
        return
    # Imported here to avoid a circular import at module load time —
    # presumably; confirm against src.worker.sinablog_worker.
    from src.worker.sinablog_worker import sinablogAuthorWorker
    if isinstance(self, sinablogAuthorWorker):
        content = Match.fix_html(content=content, recipe_kind='sinablog_author')
    else:
        content = Match.fix_html(content=content)  # The <br> tags have to be fixed to avoid blowing the stack
    self.content_list.append(content)
    Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
    self.work_complete_set.add(target_url)
    return
def parse_article_id(self):
    u"""Extract the jianshu article id from the share-group div and
    store it in ``self.info['article_id']``.

    Bug fix: the original wrapped ``dom.find`` in ``str()`` before the
    emptiness check, so a missing node became the truthy string 'None',
    the guard never fired, and ``result.group`` then raised
    AttributeError. Check the node itself first.
    """
    node = self.dom.find("div", class_="share-group")
    if not node:
        Debug.logger.info(u"没有找到文章id")
        return
    result = Match.jianshu_article_id(str(node))
    article_id = result.group('jianshu_article_id')
    self.info['article_id'] = article_id
def download_img_in_question_content(self):
    u"""Download the images embedded in the question detail and point
    the detail html at the local copies.

    After the downloads finish, the size of every fetched file is
    accumulated into ``self.question_content_img_size`` (KB).
    """
    from src.container.image_container import ImageContainer
    container = ImageContainer()
    element_to_src = Match.match_img_with_src_dict(self.question_info.detail)
    self.question_content_img_filename_list = []
    # Swap each inline <img> element for one referencing the local copy.
    for element, remote_src in element_to_src.items():
        local_name = container.add(remote_src)
        self.question_content_img_filename_list.append(local_name)
        self.question_info.detail = self.question_info.detail.replace(
            element, Match.create_img_element_with_file_name(local_name))
    container.start_download()
    # Every file exists on disk now, so the sizes can be summed.
    for local_name in self.question_content_img_filename_list:
        self.question_content_img_size += Path.get_img_size_by_filename_kb(local_name)
    return
def set_info(self, info):
    u"""Store *info* and derive the epub title and id for this task kind.

    One branch per supported recipe kind; afterwards html entities in
    the title are translated back to plain characters.
    """
    self.info.update(info)
    if self.kind == Type.csdnblog_author:
        self.epub.title = u'csdn博客作者_{}({})文章集锦'.format(info['creator_name'], info['creator_id'])
        self.epub.id = info['creator_id']
    elif self.kind == Type.cnblogs_author:
        self.epub.title = u'cnblogs作者_{}({})文章集锦'.format(info['creator_name'], info['creator_id'])
        self.epub.id = info['creator_id']
    elif self.kind == Type.jianshu_author:
        # All posts of the jianshu author's blog
        self.epub.title = u'简书作者_{}({})文章集锦'.format(info['creator_name'], info['creator_id'])
        self.epub.id = info['creator_id']
    elif self.kind == Type.jianshu_collection:
        self.epub.title = u'简书专题_{}({})'.format(info['title'], info['collection_fake_id'])
        self.epub.id = info['collection_fake_id']
    elif self.kind == Type.jianshu_notebooks:
        self.epub.title = u'简书文集_{}({})'.format(info['title'], info['notebooks_id'])
        self.epub.id = info['notebooks_id']
    elif self.kind == Type.jianshu_article:
        # A single jianshu post  TODO
        self.epub.title = u'简书博文集锦({})'.format(info['title'])
        self.epub.id = info['id']  # TODO
    elif self.kind == Type.sinablog_author:
        # All posts of the sina blog
        self.epub.title = u'新浪博客_{}({})'.format(info['creator_name'], info['creator_id'])
        self.epub.id = info['creator_id']
    elif self.kind == Type.sinablog_article:
        # A single sina blog post  TODO
        self.epub.title = u'新浪博客博文集锦({})'.format(info['title'])
        self.epub.id = info['id']  # TODO
    elif self.kind == Type.question:
        self.epub.title = u'知乎问题集锦({})'.format(info['title'])
        self.epub.id = info['id']
    elif self.kind == Type.answer:
        self.epub.title = u'知乎回答集锦({})'.format(info['title'])
        self.epub.id = info['id']
    elif self.kind == Type.article:
        self.epub.title = u'知乎专栏文章集锦({})'.format(info['title'])
        self.epub.id = info['id']
    elif self.kind == Type.topic:
        self.epub.title = u'知乎话题_{}({})'.format(info['title'], info['topic_id'])
        self.epub.id = info['topic_id']
    elif self.kind == Type.collection:
        self.epub.title = u'知乎收藏夹_{}({})'.format(info['title'], info['collection_id'])
        self.epub.id = info['collection_id']
    elif self.kind == Type.author:
        self.epub.title = u'知乎作者_{}({})'.format(info['name'], info['author_id'])
        self.epub.id = info['author_id']
    elif self.kind == Type.column:
        self.epub.title = u'知乎专栏_{}({})'.format(info['name'], info['column_id'])
        self.epub.id = info['column_id']
    elif self.kind == Type.yiibai:
        self.epub.title = u'易百教程_{}'.format(info['title'])
        self.epub.id = info['creator_id']
    elif self.kind == Type.talkpython:
        self.epub.title = u'TalkPythonToMe'
        self.epub.id = info['creator_id']
    # Translate html entities in the title back to plain characters.
    from src.html5lib.constants import entities_reverse
    self.epub.title = Match.replace_words(self.epub.title, entities_reverse)
    return
def parse_update_date(self):
    u"""Read the latest-update date from the node and store it."""
    time_node = self.node.select(u"div.zg-gray-normal span.time")[0]
    raw_date = Tag.get_content(time_node)
    self.set_attr("update_date", Match.parse_date(raw_date))
    return
def get_article_info(self):
    u"""Scrape an article laid out as centered <p> blocks into a dict.

    The first centered paragraph is the title; the second carries a
    Chinese date like "(XX年MM月DD日)". The body is rebuilt from every
    <p> from index 2 onwards. Author fields are placeholders.

    :return: dict of article fields, or [] when any scrape step fails
    """
    data = {}
    try:
        try:
            title_tationl = self.dom.find_all('p', align="center")
            # print u"标题 {}".format(span_dom.text.strip()),
            resultstr = title_tationl[0].text
            data['title'] = resultstr
            ttd = title_tationl[1].text
            # Keep the text between the parentheses: the date string.
            td = (str(ttd).split('(')[-1]).split(')')[0]
            # date_time = datetime.datetime.strptime(td, '%Y年%m月%d日')
            print td
            ye = td.split('年')[0]
            mo = (td.split('年')[-1]).split('月')[0]
            da = (td.split('月')[-1]).split('日')[0]
            # chinese2datsty is defined elsewhere — presumably converts a
            # Chinese-numeral year to digits; confirm at its definition.
            yey = chinese2datsty(ye)
            print u'{}年{}月{}'.format(yey, mo, da)
            data['updated_time'] = u'{}年{}月{}'.format(yey, mo, da)
        except IndexError:
            # Centered paragraphs missing — fall back to the page title.
            data['title'] = Match.replace_specile_chars(self.dom.title)
            data['updated_time'] = ''
        data['title'] = str(data['title']).strip()
        article_body = ""
        content = self.dom.find_all('p')
        # Skip the first two paragraphs (title and date).
        for ii in range(2, len(content)):
            x = content[ii]
            # print x
            xxt = u'<p>{}</p>'.format(x.text)
            article_body += str(xxt)
        data['content'] = str(article_body)
        data['voteup_count'] = ""
        data['comment_count'] = ""
        data['image_url'] = ''
        # Placeholder author identity — not scraped from the page.
        data['author_id'] = 'meng-qing-xue-81'
        data['author_name'] = ' '
        data['author_headline'] = ''
        data[
            'author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
        data['author_gender'] = '0'
    except Exception as e:
        print e.message
        return []
    return data
def parse_collection_id(self):
    u"""Derive the collection id from the followers-count anchor's href."""
    selector = u'div.zg-gray-normal a[data-za-l="collection_followers_count"]'
    anchor = self.node.select(selector)[0]
    href = Tag.get_attr(anchor, u"href")
    self.set_attr("collection_id", Match.match_collection_id(href))
    return
def create_book(self):
    u"""Assemble the epub from the collected task results and write it out."""
    # Decide the output file title; split books get a volume suffix.
    title = Match.fix_filename(self.book_title)
    if self.is_split:
        title = self.book_title + u'_卷{}'.format(self.chapter_no)
    # Work inside the temporary book resource directory.
    Path.chdir(Path.book_pool_path)
    epub = Epub(title)
    # Dispatch table: task type -> info-page generator
    # (answer tasks reuse the question info page).
    info_page_maker = {
        Type.question: self.generate_question_info_page,
        Type.answer: self.generate_question_info_page,
        Type.collection: self.generate_collection_info_page,
        Type.topic: self.generate_topic_info_page,
        Type.author: self.generate_author_info_page,
        Type.column: self.generate_column_info_page,
        Type.article: self.generate_article_info_page,
    }
    for task_result in self.task_result_list:
        maker = info_page_maker.get(task_result.task.task_type)
        chapter_src = maker(task_result.info_page) if maker else ''
        epub.create_chapter(chapter_src, task_result.get_title())
        for question in task_result.question_list:
            # Register every image the question pulled down.
            for filename in question.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            question_src = self.generate_question_page(question)
            epub.add_html(question_src, question.question_info.title)
        for column in task_result.column_list:
            # Register every image the column pulled down.
            for filename in column.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            for article in column.article_list:
                article_src = self.generate_article_page(article)
                epub.add_html(article_src, article.title)
        epub.finish_chapter()
    epub.set_creator(u'ZhihuHelp1.8.0')
    epub.set_language(u'zh-cn')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    for css_file in (u'/www/css/markdown.css', u'/www/css/customer.css',
                     u'/www/css/normalize.css', u'/www/css/bootstrap.css'):
        epub.add_css(Path.base_path + css_file)
    epub.create()
    Path.reset_path()
    return
def parse_column(command):
    u"""Build the SingleTask describing a zhihu column crawl/book job."""
    column_id = Match.column(command).group('column_id')
    task = SingleTask()
    task.kind = 'column'
    task.book.kind = 'column'
    task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
    task.book.sql.info = 'select * from ColumnInfo where column_id = "{}" '.format(column_id)
    task.book.sql.question = ''
    task.book.sql.answer = 'select * from Article where column_id = "{}" '.format(column_id)
    return task
def set_dom(self, dom):
    u"""Cache header/body/footer nodes from an answer dom.

    Answers carrying a div.answer-status (deleted/folded) are skipped
    entirely; the author parser always receives the dom.
    """
    self.info = {}
    usable = dom and not dom.select('div.answer-status')
    if usable:
        self.header = dom.find('div', class_='zm-item-vote-info')
        self.body = dom.find('textarea', class_='content')
        self.footer = dom.find('div', class_='zm-meta-panel')
        if self.body:
            raw_html = self.get_tag_content(self.body)
            self.content = BeautifulSoup(Match.fix_html(raw_html), 'html.parser')
    self.author_parser.set_dom(dom)
    return
def create_work_set(self, target_url):
    u"""Build the per-article work set for one SinaBlog author.

    From the blog home url: extract the author id via regex, fetch the
    "about me" profile page to fill SinaBlog_Info (arguably not this
    function's job — could be moved out), then walk the paginated article
    index (50 links per page) and add every article url to self.work_set.

    :param target_url: the blog's home page url
    :return: None
    """
    Debug.logger.debug(u"target_url是:" + str(target_url))
    if target_url in self.task_complete_set:
        return
    result = Match.SinaBlog(target_url)
    SinaBlog_author_id = int(result.group('SinaBlog_people_id'))
    href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(
        SinaBlog_author_id)
    href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(
        SinaBlog_author_id)
    # ############ The part below really belongs to SinaBlogAuthorWorker
    # (writes SinaBlog_Info); kept here for now, to be refactored later.
    content_profile = Http.get_content(href_profile)
    parser = SinaBlogParser(content_profile)
    self.question_list += parser.get_SinaBlog_info_list()
    # Debug.logger.debug(u"create_work_set中的question_list是什么??" + str(self.question_list))
    # ############ End of the SinaBlogAuthorWorker-ish section.
    # content_index = Http.get_content(href_index)
    content_article_list = Http.get_content(href_article_list)
    article_num = int(self.parse_article_num(content_article_list))
    Debug.logger.debug(u"article_num:" + str(article_num))
    # Python-2 integer division: ceil(article_num / 50) pages of 50 links each.
    if article_num % 50 != 0:
        page_num = article_num / 50 + 1
    else:
        page_num = article_num / 50
    # The "about me" page has no article count, so stash it on the first
    # info row — this only works with one SinaBlog address per line!
    self.question_list[0][
        'article_num'] = article_num
    self.task_complete_set.add(target_url)
    for page in range(page_num):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(
            SinaBlog_author_id, page + 1)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    # self.work_set.add(article_list[0])
    return
def parse_author(command):
    u"""Build the SingleTask for crawling everything a zhihu user answered."""
    author_id = Match.author(command).group('author_id')
    task = SingleTask()
    task.kind = 'author'
    task.book.kind = 'author'
    task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
    task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id)
    task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where author_id = "{}")'.format(author_id)
    task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id)
    return task
def parse_article(command):
    u"""Build the SingleTask for a single zhuanlan article."""
    matched = Match.article(command)
    column_id = matched.group('column_id')
    article_id = matched.group('article_id')
    task = SingleTask()
    task.kind = 'article'
    task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(column_id, article_id)
    task.book.kind = 'article'
    # info and answer share the same where-clause fragment.
    where = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
    task.book.sql.info = where
    task.book.sql.question = ''
    task.book.sql.answer = where
    return task
def parse_question(command):
    u"""Build the SingleTask for one zhihu question and all its answers."""
    question_id = Match.question(command).group('question_id')
    task = SingleTask()
    task.kind = 'question'
    task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)
    task.book.kind = 'question'
    task.book.sql.info = ' question_id = "{}" '.format(question_id)
    where = 'question_id = "{}"'.format(question_id)
    task.book.sql.question = where
    task.book.sql.answer = where
    return task
def parse_jianshu_collection(command):
    u"""Build the SingleTask for a jianshu collection (专题)."""
    collection_id = Match.jianshu_collection(command).group('collection_id')
    task = SingleTask()
    task.kind = 'jianshu_collection'
    task.book.kind = 'jianshu_collection'
    task.spider.href = 'http://www.jianshu.com/collection/{}'.format(collection_id)
    task.book.sql.info = 'select * from jianshu_collection_info where collection_fake_id = "{}"'.format(collection_id)
    task.book.sql.answer = ('select * from jianshu_article where href in (select href from '
                            'jianshu_collection_index where collection_fake_id = "{}")'.format(collection_id))
    return task
def parse_answer(command):
    u"""Build the SingleTask for crawling one specific zhihu answer."""
    matched = Match.answer(command)
    question_id = matched.group('question_id')
    answer_id = matched.group('answer_id')
    task = SingleTask()
    task.kind = 'answer'
    task.spider.href = 'http://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)
    task.book.kind = 'answer'
    # NOTE(review): sibling parse_* helpers assign task.book.sql.*; this one
    # goes through task.book.property.sql.* — confirm which path Book expects.
    task.book.property.sql.info = ''
    task.book.property.sql.question = 'question_id = "{}"'.format(question_id)
    task.book.property.sql.answer = 'question_id = "{}" and answer_id = "{}"'.format(question_id, answer_id)
    return task
def parse_jianshu_notebooks(command):
    u"""Build the SingleTask for a jianshu notebook (文集)."""
    notebooks_id = Match.jianshu_notebooks(command).group('notebooks_id')
    task = SingleTask()
    task.kind = 'jianshu_notebooks'
    task.book.kind = 'jianshu_notebooks'
    task.spider.href = 'http://www.jianshu.com/notebooks/{}/latest'.format(notebooks_id)  # config file???
    task.book.sql.info = 'select * from jianshu_notebooks_info where notebooks_id = "{}"'.format(notebooks_id)
    task.book.sql.answer = ('select * from jianshu_article where href in (select href from '
                            'jianshu_notebooks_index where notebooks_id = "{}")'.format(notebooks_id))
    return task
def parse_question(command):
    u"""Create the crawl/book task for one zhihu question id in command."""
    m = Match.question(command)
    question_id = m.group('question_id')
    task = SingleTask()
    task.kind = task.book.kind = 'question'
    task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)
    task.book.sql.info = ' question_id = "{}" '.format(question_id)
    task.book.sql.question = 'question_id = "{}"'.format(question_id)
    task.book.sql.answer = 'question_id = "{}"'.format(question_id)
    return task
def parse_topic(command):
    u"""Build the SingleTask for a zhihu topic's indexed answers."""
    topic_id = Match.topic(command).group('topic_id')
    task = SingleTask()
    task.kind = 'topic'
    task.book.kind = 'topic'
    task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
    task.book.sql.info = 'select * from TopicInfo where topic_id = "{}"'.format(topic_id)
    task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from TopicIndex where topic_id = "{}"))'.format(topic_id)
    task.book.sql.answer = 'select * from Answer where href in (select href from TopicIndex where topic_id = "{}")'.format(topic_id)
    return task
def fix_image(self, content, recipe):
    u"""Rewrite every <img> in content to reference locally downloaded files.

    For each <img ...> fragment found by regex: normalize the tag shape,
    resolve its src, queue the image for download via self.image_container,
    and point the tag at ../images/<filename>, wrapped in a duokan single
    image div. Recipe-specific quirks (jianshu lazy-load attrs, sinablog /
    zhihu placeholder gifs) are patched inline.

    :param content: html string to rewrite
    :param recipe: recipe kind constant (see Type)
    :return: the rewritten html string
    """
    content = Match.fix_html(content=content, recipe_kind=recipe)
    # Regex grabs the tag up to (not including) its closing '>'.
    for img in re.findall(r'<img[^>]*', content):
        if recipe not in [Type.sinablog_author, Type.cnblogs_author]:
            # fix img: drop a self-closing slash, then re-add the '>'.
            if img[-1] == '/':
                img = img[:-1]
            img += '>'
        src = re.search(r'(?<=src=").*?(?=")', img)
        if not src:
            # No src attribute at all: just close the tag and move on.
            new_image = img + '</img>'
            content = content.replace(img, new_image)
            continue
        else:
            src = src.group(0)
            if src.replace(' ', '') == '':
                # Blank src: same treatment as missing src.
                new_image = img + '</img>'
                content = content.replace(img, new_image)
                continue
        src_download = HtmlCreator.fix_image_src(src)
        if src_download:
            if recipe in Type.zhihu and not src_download.startswith('http'):
                # fix zhuanlan image href: bare hash -> pic2 CDN url.
                src_download = src_download.split('.')[0]
                filename = self.image_container.add('https://pic2.zhimg.com/'+src_download+'_b.jpg')
            elif recipe in Type.generic:
                filename = ''  # TODO
            else:
                filename = self.image_container.add(src_download)
        else:
            filename = ''
        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        if recipe in Type.jianshu:
            # Swap data-original-src (lazy-load real url) into src via a
            # temp token so the plain src rename doesn't clobber it.
            new_image = new_image.replace('data-original-src', 'temppicsr')
            new_image = new_image.replace('src', 'falsesrc')
            new_image = new_image.replace('temppicsr', 'src')  # 应该有更好的方式, 暂时先这样写 -> there should be a better way; good enough for now
            new_image += '</img>'
        elif recipe in Type.sinablog:
            # Hard-coded placeholder fix; could move into fix_html.
            # NOTE(review): unlike the other branches this one does not
            # append '</img>' — confirm that is intentional.
            new_image = new_image.replace('http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',\
                '../images/{}'.format(filename))
        elif recipe in Type.zhihu:
            new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg', '../images/{}'.format(filename))
            new_image += '</img>'
        elif recipe in Type.cnblogs:
            pass
        content = content.replace(img, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def parse_collection(command):
    u"""Build the SingleTask for a zhihu collection (收藏夹)."""
    collection_id = Match.collection(command).group('collection_id')
    task = SingleTask()
    task.kind = 'collection'
    task.book.kind = 'collection'
    task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
    task.book.sql.info = 'select * from CollectionInfo where collection_id = "{}"'.format(collection_id)
    task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from CollectionIndex where collection_id = "{}"))'.format(collection_id)
    task.book.sql.answer = 'select * from Answer where href in (select href from CollectionIndex where collection_id = "{}")'.format(collection_id)
    return task
def worker(self, target_url):
    u"""Fetch one url, normalize its html, and collect it into content_list."""
    # Skip urls that were already fetched successfully.
    if target_url in self.work_complete_set:
        return
    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    page = Http.get_content(target_url)
    if not page:
        return
    # Normalize <br> tags to keep the downstream parser from blowing the stack.
    self.content_list.append(Match.fix_html(page))
    Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
    self.work_complete_set.add(target_url)
    return
def parse_column(command):
    u"""Create the crawl/book task for one zhuanlan column id in command."""
    m = Match.column(command)
    column_id = m.group('column_id')
    task = SingleTask()
    task.kind = task.book.kind = 'column'
    task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
    task.book.sql.info = 'select * from ColumnInfo where column_id = "{}" '.format(column_id)
    task.book.sql.question = ''
    task.book.sql.answer = 'select * from Article where column_id = "{}" '.format(column_id)
    return task
def parse_creator_profile_id(self, dom):
    u"""Extract the creator's profile_id from the answer-author info node.

    :param dom: user info node (div#zh-single-answer-author-info)
    :type dom: bs4.Tag
    :rtype: None
    """
    title_link = dom.select(u"h2.zm-list-content-title a")[0]
    href = Tag.get_attr(title_link, u"href")
    self.set_attr("profile_id", Match.match_author_id(href))
    return
def parse_collection(command):
    u"""Create the crawl/book task for one zhihu collection id in command."""
    m = Match.collection(command)
    collection_id = m.group('collection_id')
    task = SingleTask()
    task.kind = task.book.kind = 'collection'
    task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
    task.book.sql.info = 'select * from CollectionInfo where collection_id = "{}"'.format(collection_id)
    task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from CollectionIndex where collection_id = "{}"))'.format(collection_id)
    task.book.sql.answer = 'select * from Answer where href in (select href from CollectionIndex where collection_id = "{}")'.format(collection_id)
    return task
def parse_article(command):
    u"""Create the crawl/book task for one zhuanlan article in command."""
    m = Match.article(command)
    column_id, article_id = m.group('column_id'), m.group('article_id')
    task = SingleTask()
    task.kind = task.book.kind = 'article'
    task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(column_id, article_id)
    clause = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
    task.book.sql.info = clause
    task.book.sql.question = ''
    task.book.sql.answer = clause
    return task
def parse_raw_creator_avatar(self):
    u"""Extract the creator's avatar url (raw and normalized) from self.node."""
    avatar_link = self.node.select(u"div.zm-side-section-inner a.zm-list-avatar-link")[0]
    avatar_img = avatar_link.select(u".zm-list-avatar-medium")[0]
    raw_src = Tag.get_attr(avatar_img, u"src")
    self.set_attr("raw_avatar", raw_src)
    self.set_attr("avatar", Match.format_avatar(raw_src))
    return
def parse_topic(command):
    u"""Create the crawl/book task for one zhihu topic id in command."""
    m = Match.topic(command)
    topic_id = m.group('topic_id')
    task = SingleTask()
    task.kind = task.book.kind = 'topic'
    task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
    task.book.sql.info = 'select * from TopicInfo where topic_id = "{}"'.format(topic_id)
    task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from TopicIndex where topic_id = "{}"))'.format(topic_id)
    task.book.sql.answer = 'select * from Answer where href in (select href from TopicIndex where topic_id = "{}")'.format(topic_id)
    return task
def parse_author(command):
    u"""Create the crawl/book task for one zhihu author id in command."""
    m = Match.author(command)
    author_id = m.group('author_id')
    task = SingleTask()
    task.kind = task.book.kind = 'author'
    task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
    task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id)
    task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where author_id = "{}")'.format(author_id)
    task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id)
    return task
def generate_book_title(self):
    u"""Join all task-result titles into one filesystem-safe book title.

    Titles longer than 50 chars are truncated with a "...等N本电子书"
    suffix. The result is stored on self.book_title and returned.

    :rtype: str
    """
    titles = [task_result.get_title() for task_result in self.task_result_list]
    joined = u'_'.join(titles)
    if len(joined) > 50:
        joined = joined[:50] + u'。。。等' + str(len(titles)) + u'本电子书'
    safe_title = Match.replace_danger_char_for_filesystem(joined)
    self.book_title = safe_title
    return safe_title
def parse_raw_creator_avatar(self):
    u"""Read the creator's avatar <img> and store raw and formatted urls."""
    container = self.node.select(
        u"div.zm-side-section-inner a.zm-list-avatar-link")[0]
    img_node = container.select(u".zm-list-avatar-medium")[0]
    original_src = Tag.get_attr(img_node, u"src")
    self.set_attr("raw_avatar", original_src)
    formatted = Match.format_avatar(original_src)
    self.set_attr("avatar", formatted)
    return