Example #1
0
    def download_img(self):
        u"""Queue every image referenced by this article for download.

        Registers each inline <img> found in self.content, plus the
        article cover image and the author avatar, with an
        ImageContainer; rewrites the html/urls to point at the local
        copies; downloads everything; then accumulates the downloaded
        file sizes into self.total_img_size_kb.
        """
        from src.container.image_container import ImageContainer
        container = ImageContainer()
        self.img_filename_list = []

        # Rewrite every inline <img> element to reference its local copy.
        for tag, remote_src in Match.match_img_with_src_dict(self.content).items():
            local_name = container.add(remote_src)
            self.img_filename_list.append(local_name)
            self.content = self.content.replace(
                tag, Match.create_img_element_with_file_name(local_name))

        # Article cover image.
        local_name = container.add(self.image_url)
        self.img_filename_list.append(local_name)
        self.image_url = Match.create_local_img_src(local_name)

        # Author avatar.
        local_name = container.add(self.author_avatar_url)
        self.img_filename_list.append(local_name)
        self.author_avatar_url = Match.create_local_img_src(local_name)

        container.start_download()

        # After downloading, accumulate the on-disk image sizes.
        for local_name in self.img_filename_list:
            self.total_img_size_kb += Path.get_img_size_by_filename_kb(local_name)
        return
Example #2
0
    def download_img(self):
        u"""Queue the answer's images and its author avatar for download.

        Registers each inline <img> in self.content and the author
        avatar with an ImageContainer, rewrites them to local
        references, downloads everything, then accumulates the file
        sizes into self.total_img_size_kb.
        """
        from src.container.image_container import ImageContainer
        img_container = ImageContainer()
        img_src_dict = Match.match_img_with_src_dict(self.content)
        self.img_filename_list = []
        for img in img_src_dict:
            src = img_src_dict[img]
            filename = img_container.add(src)
            self.img_filename_list.append(filename)
            self.content = self.content.replace(
                img, Match.create_img_element_with_file_name(filename))

        # The answer author's avatar must be downloaded as well.
        filename = img_container.add(self.author_avatar_url)
        self.img_filename_list.append(filename)
        self.author_avatar_url = Match.create_local_img_src(filename)

        img_container.start_download()

        # After downloading, accumulate the on-disk image sizes.
        for filename in self.img_filename_list:
            self.total_img_size_kb += Path.get_img_size_by_filename_kb(
                filename)
            # Debug leftover: prints the running total once per file.
            # Parenthesized so the statement also parses under Python 3.
            print(self.total_img_size_kb)
        return
Example #3
0
def main():
    """Parse command-line options and dispatch the requested action.

    Recognized options: -V/--version, -d/--debug, -h/--help,
    -g/--gui (disabled), -l/--login URL, -u/--url URL.
    Uses `except ... as e` and print() so the code parses under both
    Python 2.6+ and Python 3.
    """
    debug = False

    def version():
        # Report the program version through the project logger.
        log.info_log('version %s' % __version__)

    try:
        opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError:
        log.error_log(u"Try ee-book --help for more options")
        sys.exit(2)
    # NOTE(review): the loop variable keeps the original name `args`,
    # shadowing getopt's positional list — left unchanged because code
    # outside this view may rely on it after the loop.
    for option, args in opts:
        if option in ('-V', '--version'):
            version()
            sys.exit()
        elif option in ('-d', '--debug'):
            print(u"Debug mode...")
            debug = True
        elif option in ('-h', '--help'):
            version()
            print(help_info)
            sys.exit()
        elif option in ('-g', '--gui'):
            # GUI path is not implemented; exits immediately.
            print(u"Under developing...")
            sys.exit()
        elif option in ('-l', '--login'):
            url = args
            try:
                recipe_kind = Match.get_url_kind(url)
            except UnsupportTypeException as e:
                print(e)
                print(u"Please try again.")
                sys.exit()
            zhihu = EEBook(
                recipe_kind=recipe_kind
            )  # Init path, e.g. config, only zhihu are supported now
            login = Login(recipe_kind=recipe_kind)
            login.start()
            sys.exit()
        elif option in ('-u', '--url'):
            url = args
            try:
                recipe_kind = Match.get_website_kind(url)
            except UnsupportTypeException as e:
                print(e)
                print(u"Please check url.")
                sys.exit()
Example #4
0
    def get_article_info(self):
        """Parse the article dom and return a dict of article fields.

        Returns the populated dict on success; returns an empty list on
        any parse failure (kept as-is for callers that test the falsy
        result).
        """
        data = {}
        try:
            try:
                title_tationl = self.dom.find_all('h1', class_="article-title")
                resultstr = title_tationl[0].text
                # '/' would break the file path later built from the title.
                if resultstr.__contains__('/'):
                    resultstr = Match.replace_specile_chars(resultstr)
                # Only the stripped value is kept (a redundant unstripped
                # assignment was removed).
                data['title'] = resultstr.strip()
            except IndexError:
                # No <h1 class="article-title">: fall back to the page title.
                data['title'] = Match.replace_specile_chars(self.dom.title)
            data['title'] = str(data['title']).strip()

            article_body = ""
            content = self.dom.find_all('article', class_="article-content")[0]
            article_body += str(content)

            # Drop centered paragraphs (notices/ads) from the body,
            # one occurrence each.
            strOfinfos = self.dom.find_all('p', style="text-align: center;")
            for x in strOfinfos:
                article_body = article_body.replace(str(x), '', 1)

            data['content'] = str(article_body)

            time_tationl = self.dom.find_all('div', class_="article-meta")[0]
            data['updated_time'] = time_tationl.find_all('span',
                                                         class_="item")[0].text

            data['voteup_count'] = ""
            data['comment_count'] = ""
            data['image_url'] = ''
            # NOTE(review): hard-coded author metadata — presumably this
            # recipe targets a single-author site; confirm before reuse.
            data['author_id'] = 'meng-qing-xue-81'
            tempName = time_tationl.find_all('span', class_="item")[1].text
            data['author_name'] = (str(tempName).split(':'))[-1]
            data['author_headline'] = ''
            data[
                'author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
            data['author_gender'] = '0'
        except Exception as e:
            # e.message is deprecated/removed in Python 3; print the
            # exception object itself.
            print(e)
            return []

        return data
Example #5
0
    def download_img(self):
        u"""Queue article images, cover and avatar for download.

        First pass: if the content already contains duokan single-image
        wrappers, each wrapped <img> is replaced with a hard-coded
        local-file img element.  Second pass: every remaining <img> is
        registered with an ImageContainer and rewritten to a local
        reference (avatars get a dedicated element); the cover image and
        author avatar are registered too.  Finally everything is
        downloaded and the file sizes are summed into
        self.total_img_size_kb.
        """
        from src.container.image_container import ImageContainer

        if str(self.content).__contains__('<div class="duokan-image-single">'):
            # print img_src_dict

            xtep = str(self.content)
            xxsoup = BeautifulSoup(xtep, 'lxml')
            list_tiezhi_tit = xxsoup.find_all('div', class_="duokan-image-single")
            for x in list_tiezhi_tit:

                list_pcyc_li = x.find_all('img')
                for li in list_pcyc_li:
                    # print li
                    src = li.get('src')

                    # Keep only the path component after '/images/'.
                    st = str(src).split('/images/')[-1]

                    # NOTE(review): hard-coded absolute path to one user's
                    # desktop — this only works on that machine; should be
                    # made configurable.
                    newT = u'<img class="ke_img" src="file:///Users/ink/Desktop/images/{}"  />'.format(st)

                    # Replace the whole wrapper div with the new element,
                    # first occurrence only.
                    xtep = xtep.replace(str(x), newT, 1)

            self.content = xtep

            # print xtep

        img_container = ImageContainer()
        img_src_dict = Match.match_img_with_src_dict(self.content)

        self.img_filename_list = []
        for img in img_src_dict:
            src = img_src_dict[img]
            filename = img_container.add(src)

            # print 'src:' + src + '  and filename  ' + filename

            self.img_filename_list.append(filename)
            # Avatars get a dedicated element so they keep avatar styling.
            if str(img).__contains__(u"class=\"avatar\""):
                self.content = self.content.replace(img, Match.avatar_create_img_element_with_file_name(filename))
            else:
                self.content = self.content.replace(img, Match.create_img_element_with_file_name(filename))
        # Download the article cover image.
        filename = img_container.add(self.image_url)
        self.img_filename_list.append(filename)
        self.image_url = Match.create_local_img_src(filename)

        #   Download the author avatar.
        filename = img_container.add(self.author_avatar_url)
        self.img_filename_list.append(filename)
        self.author_avatar_url = Match.create_local_img_src(filename)

        img_container.start_download()

        #   After the download completes, accumulate the image sizes.
        for filename in self.img_filename_list:
            self.total_img_size_kb += Path.get_img_size_by_filename_kb(filename)
        return
Example #6
0
def main():
    """Parse command-line options and dispatch the requested action.

    Recognized options: -V/--version, -d/--debug, -h/--help,
    -g/--gui (disabled), -l/--login URL, -u/--url URL.
    Uses `except ... as e` and print() so the code parses under both
    Python 2.6+ and Python 3.
    """
    debug = False

    def version():
        # Report the program version through the project logger.
        log.info_log('version %s' % __version__)
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError:
        log.error_log(u"Try ee-book --help for more options")
        sys.exit(2)
    # NOTE(review): the loop variable keeps the original name `args`,
    # shadowing getopt's positional list — left unchanged because code
    # outside this view may rely on it after the loop.
    for option, args in opts:
        if option in ('-V', '--version'):
            version()
            sys.exit()
        elif option in ('-d', '--debug'):
            print(u"Debug mode...")
            debug = True
        elif option in ('-h', '--help'):
            version()
            print(help_info)
            sys.exit()
        elif option in ('-g', '--gui'):
            # GUI path is not implemented; exits immediately.
            print(u"Under developing...")
            sys.exit()
        elif option in ('-l', '--login'):
            url = args
            try:
                recipe_kind = Match.get_url_kind(url)
            except UnsupportTypeException as e:
                print(e)
                print(u"Please try again.")
                sys.exit()
            zhihu = EEBook(recipe_kind=recipe_kind)    # Init path, e.g. config, only zhihu are supported now
            login = Login(recipe_kind=recipe_kind)
            login.start()
            sys.exit()
        elif option in ('-u', '--url'):
            url = args
            try:
                recipe_kind = Match.get_website_kind(url)
            except UnsupportTypeException as e:
                print(e)
                print(u"Please check url.")
                sys.exit()
    def get_article_info(self):
        """Parse the article dom (entry-title/entry-content layout) and
        return a dict of article fields.

        Returns the populated dict on success; returns an empty list on
        any parse failure (kept as-is for callers that test the falsy
        result).
        """
        data = {}
        try:
            try:
                title_tationl = self.dom.find_all('h1', class_="entry-title")
                resultstr = title_tationl[0].text
                # '/' would break the file path later built from the title.
                if resultstr.__contains__('/'):
                    resultstr = Match.replace_specile_chars(resultstr)
                # Only the stripped value is kept (a redundant unstripped
                # assignment was removed).
                data['title'] = resultstr.strip()
            except IndexError:
                # No <h1 class="entry-title">: fall back to the page title.
                data['title'] = Match.replace_specile_chars(self.dom.title)
            data['title'] = str(data['title']).strip()

            article_body = ""
            content = self.dom.find_all('div', class_="entry-content")[0]
            article_body += str(content)

            data['content'] = str(article_body)

            # post-meta looks like "<author> • <date>"; split on the bullet.
            time_tationl = self.dom.find_all('div', class_="post-meta")[0]
            ttd = str(time_tationl.text)

            date_time = datetime.datetime.strptime(str(ttd.split('•')[1]).strip(), '%Y年%m月%d日')
            print('转化后时间')
            print(date_time.strftime('%Y-%m-%d'))

            data['updated_time'] = date_time.strftime('%Y-%m-%d')

            data['voteup_count'] = ""
            data['comment_count'] = ""
            data['image_url'] = ''
            # NOTE(review): hard-coded author metadata — presumably this
            # recipe targets a single-author site; confirm before reuse.
            data['author_id'] = 'meng-qing-xue-81'
            data['author_name'] = ttd.split('•')[0]
            data['author_headline'] = ''
            data['author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
            data['author_gender'] = '0'
        except Exception as e:
            # e.message is deprecated/removed in Python 3; print the
            # exception object itself.
            print(e)
            return []

        return data
    def get_article_info(self):
        """Parse a CNBC clip-video page dom and return article fields.

        Returns the populated dict on success; returns an empty list on
        any parse failure (kept as-is for callers that test the falsy
        result).
        """
        data = {}
        try:
            try:
                title_tationl = self.dom.find_all(
                    'h2', class_="Buffett-clipVideoHeroHeaderTitle")
                resultstr = title_tationl[0].text
                # '/' would break the file path later built from the title.
                if resultstr.__contains__('/'):
                    resultstr = Match.replace_specile_chars(resultstr)

                data['title'] = resultstr.strip()

            except IndexError:
                # No hero-header title: fall back to the page title.
                data['title'] = Match.replace_specile_chars(self.dom.title)
            data['title'] = str(data['title']).strip()

            data['content'] = str(self.parse_answer_content())

            time_tationl = self.dom.find_all(
                'div', class_="Buffett-clipVideoHeroHeaderTimestamp")

            tt = time_tationl[0].text

            # Debug leftover; parenthesized so it parses on Python 3 too.
            # NOTE(review): [:-4] presumably drops a timezone/suffix from
            # the timestamp text — confirm against the page markup.
            print(tt[:-4])

            data['updated_time'] = tt[:-4]

            data['voteup_count'] = ""
            data['comment_count'] = ""
            data['image_url'] = ''
            # NOTE(review): hard-coded author metadata.
            data['author_id'] = 'meng-qing-xue-81'
            data['author_name'] = 'CNBC'
            data['author_headline'] = ''
            data[
                'author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
            data['author_gender'] = '0'
        except Exception as e:
            # e.message is deprecated/removed in Python 3; print the
            # exception object itself.
            print(e)
            return []

        return data
Example #9
0
    def create_work_set(self, target_url):
        u"""Collect a jianshu author's article urls into self.work_set.

        From target_url (e.g.
        http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles) the
        author id is extracted; the article count determines the page
        count; every listing page is then fetched and parsed for
        article links.
        :param target_url: the author's latest_articles page url
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        id_result = Match.jianshu_author(target_url)
        jianshu_id = id_result.group('jianshu_id')
        article_num, article_list = self.get_jianshu_question_list(target_url)
        self.task_complete_set.add(target_url)
        # Use floor division so the page arithmetic stays integral on
        # Python 3 as well (plain / would yield a float there).
        if article_num % 9 != 0:
            page_num = article_num // 9 + 1      # 9 href on one page
        else:
            page_num = article_num // 9

        for item in article_list:
            self.work_set.add(item)
        for page in range(page_num - 1):          # page+2, don't need to get the first page
            url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page + 2)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
        return
Example #10
0
    def create_work_set(self, target_url):
        u"""Fetch zhuanlan column metadata and queue its post-list urls.

        Reads the column info from the zhuanlan API, stores a summary
        dict in self.info_list, and adds one paged posts-API url per 10
        articles to self.work_set.
        :param target_url: the column's url
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        self.column_id = result.group('column_id')
        content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + self.column_id)
        if not content:
            return
        raw_info = json.loads(content)
        info = {}
        info['creator_id'] = raw_info['creator']['slug']
        info['creator_hash'] = raw_info['creator']['hash']
        info['creator_sign'] = raw_info['creator']['bio']
        info['creator_name'] = raw_info['creator']['name']
        info['creator_logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['creator']['avatar'][
            'id']).replace('_{size}', '')

        info['column_id'] = raw_info['slug']
        info['name'] = raw_info['name']
        # NOTE(review): uses the creator's avatar template with the
        # column-level avatar id — consistent with the sibling copies of
        # this method, but confirm against the API schema.
        info['logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['avatar']['id']).replace(
            '_{size}', '')
        info['article'] = raw_info['postsCount']
        info['follower'] = raw_info['followersCount']
        info['description'] = raw_info['description']
        self.info_list.append(info)
        self.task_complete_set.add(target_url)
        detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(self.column_id)
        # Floor division: plain / yields a float on Python 3, which
        # range() rejects.
        for i in range(info['article'] // 10 + 1):
            self.work_set.add(detect_url + str(i * 10))
        return
    def get_column_info(self):
        u"""Scrape column header info from self.dom into a dict.

        NOTE(review): article_count / follower_count / description are
        hard-coded placeholders; the scraped `description` node is
        fetched but never used — confirm whether this is intentional.
        """
        data = {}
        headcontent  = self.dom.find_all('div', class_="header")[0]
        img_src_dict = Match.match_img_with_src_dict(str(headcontent))

        # Keeps only the last src encountered as the cover image.
        for img in img_src_dict:
            src = img_src_dict[img]
            print src
            data['image_url'] = src

        reInfo = BeautifulSoup(str(headcontent), 'html.parser')
        links = reInfo.findAll('a')
        # The second anchor in the header holds the column title.
        article_u = (links[1]).text

        # Fetched but unused — data['description'] below stays ''.
        description  = self.dom.find_all('span', class_="f12 gray")[0]


        data[u'title'] = article_u

        # data['image_url'] = ''

        data['article_count'] = 0
        data['follower_count'] = 0
        data['description'] = ''


        return data
Example #12
0
    def login(self, account, password, captcha=''):
        u"""Log in to zhihu with email + password (optionally a captcha).

        Fetches the homepage to obtain the _xsrf token, installs it as a
        cookie, posts the credentials to the email-login endpoint, and
        on success optionally persists the credentials to Config and
        always records the session (account/password/cookie) in the
        LoginRecord table.

        :param account: zhihu account email
        :param password: account password
        :param captcha: captcha answer, empty when none was required
        :return: True on successful login, False otherwise
        """
        content = Http.get_content('https://www.zhihu.com/')
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'登陆失败')
            Debug.logger.info(u'敲击回车重新发送登陆请求')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True,
                         'captcha': captcha}
        else:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: zhihu treats requests carrying it as non-script traffic
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
        if not result:
            Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
            return False
        response = json.loads(result)

        # r == 0 is zhihu's success code.
        if response['r'] == 0:
            print u'登陆成功!'
            print u'登陆账号:', account
            print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
            if raw_input() == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'帐号密码已保存,可通过修改config.json修改设置'
            else:
                Config.account, Config.password, Config.remember_account = '', '', False
                print u'跳过保存环节,进入下一流程'
            Config._save()
            cookie = self.get_cookie()
            DB.execute('delete from LoginRecord')  # clear old login records after success so stale ones are never reused
            data = {}
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'登陆失败'
            Debug.print_dict(response)
            return False
Example #13
0
    def fix_image(self, content):
        u"""Normalize <img> tags in *content* and register their sources.

        Each <img> is closed with an explicit </img>; tags with a usable
        src get the source queued in self.image_container and rewritten
        to ../images/<filename>, then wrapped in a duokan single-image
        div. Tags without a src are only closed.
        """
        content = Match.fix_html(content)
        for raw_tag in re.findall(r'<img[^>]*', content):
            # Drop a trailing self-closing slash, then close the tag.
            tag = raw_tag[:-1] if raw_tag[-1] == '/' else raw_tag
            tag += '>'

            match = re.search(r'(?<=src=").*?(?=")', tag)
            src = match.group(0) if match else None
            if src is None or src.replace(' ', '') == '':
                # No usable source: just close the element and move on.
                content = content.replace(tag, tag + '</img>')
                continue

            src_download = HtmlCreator.fix_image_src(src)
            filename = self.image_container.add(src_download) if src_download else ''

            fixed = tag.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
            fixed = fixed.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                                  '../images/{}'.format(filename))
            fixed += '</img>'
            content = content.replace(tag, '<div class="duokan-image-single">{}</div>'.format(fixed))

        return content
Example #14
0
    def create_work_set(self, target_url):
        u"""Fetch zhuanlan column metadata and queue its post-list urls.

        Reads the column info from the zhuanlan API, stores a summary
        dict in self.info_list, and adds one paged posts-API url per 10
        articles to self.work_set.
        :param target_url: the column's url
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        column_id = result.group('column_id')
        content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' +
                                   column_id)
        if not content:
            return
        raw_info = json.loads(content)
        info = {}
        info['creator_id'] = raw_info['creator']['slug']
        info['creator_hash'] = raw_info['creator']['hash']
        info['creator_sign'] = raw_info['creator']['bio']
        info['creator_name'] = raw_info['creator']['name']
        info['creator_logo'] = raw_info['creator']['avatar'][
            'template'].replace('{id}',
                                raw_info['creator']['avatar']['id']).replace(
                                    '_{size}', '')

        info['column_id'] = raw_info['slug']
        info['name'] = raw_info['name']
        # NOTE(review): uses the creator's avatar template with the
        # column-level avatar id — consistent with the sibling copies of
        # this method, but confirm against the API schema.
        info['logo'] = raw_info['creator']['avatar']['template'].replace(
            '{id}', raw_info['avatar']['id']).replace('_{size}', '')
        info['article'] = raw_info['postsCount']
        info['follower'] = raw_info['followersCount']
        info['description'] = raw_info['description']
        self.info_list.append(info)
        self.task_complete_set.add(target_url)
        detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(
            column_id)
        # Floor division: plain / yields a float on Python 3, which
        # range() rejects.
        for i in range(info['article'] // 10 + 1):
            self.work_set.add(detect_url + str(i * 10))
        return
    def download(self, index):
        u"""Download the picture at *index* of self.container to disk.

        Skips files that already exist in self.save_path; rejects hrefs
        that are overlong or fail Match.isUrlOk; strips zhihu's '@size'
        suffix before fetching; silently returns when nothing could be
        downloaded.
        :param index: key into self.container
        :return: None
        """
        image = self.container[index]
        filename = image['filename']
        href = image['href']

        # Already downloaded — nothing to do.
        if os.path.isfile(self.save_path + '/' + filename):
            return
        # Parenthesized so the statement also parses under Python 3.
        print('Downloading picture:' + href + '   filename   ' + filename)

        if len(str(href)) < 300 and Match.isUrlOk(href):
            # Strip zhihu's size suffix (everything after '@').
            rely_url = str(href).split('@')[0]
            content = Http.get_content(url=rely_url,
                                       timeout=Config.timeout_download_picture)
        else:
            Debug.print_in_single_line(u"Href of the Picture seems wrong...")
            content = None
        if not content:
            return
        # Renamed from `image` so the file handle no longer shadows the
        # container record above.
        with open(self.save_path + '/' + filename, 'wb') as image_file:
            image_file.write(content)
        return
Example #16
0
    def create_work_set(self, target_url):
        u"""Fetch zhuanlan column metadata and queue its post-list urls.

        Reads the column info from the zhuanlan API, stores a summary
        dict in self.info_list, and adds one paged posts-API url per 10
        articles to self.work_set.
        :param target_url: the column's url
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        self.column_id = result.group("column_id")
        content = Http.get_content("https://zhuanlan.zhihu.com/api/columns/" + self.column_id)
        if not content:
            return
        raw_info = json.loads(content)
        info = {}
        info["creator_id"] = raw_info["creator"]["slug"]
        info["creator_hash"] = raw_info["creator"]["hash"]
        info["creator_sign"] = raw_info["creator"]["bio"]
        info["creator_name"] = raw_info["creator"]["name"]
        info["creator_logo"] = (
            raw_info["creator"]["avatar"]["template"]
            .replace("{id}", raw_info["creator"]["avatar"]["id"])
            .replace("_{size}", "")
        )

        info["column_id"] = raw_info["slug"]
        info["name"] = raw_info["name"]
        # NOTE(review): uses the creator's avatar template with the
        # column-level avatar id — consistent with the sibling copies of
        # this method, but confirm against the API schema.
        info["logo"] = (
            raw_info["creator"]["avatar"]["template"].replace("{id}", raw_info["avatar"]["id"]).replace("_{size}", "")
        )
        info["article"] = raw_info["postsCount"]
        info["follower"] = raw_info["followersCount"]
        info["description"] = raw_info["description"]
        self.info_list.append(info)
        self.task_complete_set.add(target_url)
        detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
        # Floor division: plain / yields a float on Python 3, which
        # range() rejects.
        for i in range(info["article"] // 10 + 1):
            self.work_set.add(detect_url + str(i * 10))
        return
Example #17
0
 def create_single_html_book(self, book_package):
     u"""Render *book_package* as one standalone html file plus assets.

     Recreates ./<title> under the result directory, concatenates the
     html bodies of every page in every book of the package into the
     base template, and copies the images directory and css files next
     to the generated <title>.html.
     """
     title = book_package.get_title()
     if not title:
         # Skip automatically when the ebook title is empty —
         # otherwise the rmdir/mkdir of u'./' + title below would hit
         # the bare result directory ('rm -rf /'-style disaster).
         return
     Path.reset_path()
     Path.chdir(Path.result_path)
     Path.rmdir(u'./' + title)
     Path.mkdir(u'./' + title)
     Path.chdir(u'./' + title)
     page = []
     for book in book_package.book_list:
         page += book.page_list
     content = u' \r\n '.join([Match.html_body(x.content) for x in page]).replace(u'../images/', u'./images/')
     with open(TemplateConfig.content_base_uri) as html:
         content = html.read().format(title=title, body=content).replace(u'../style/', u'./')
     with open(title + u'.html', 'w') as html:
         html.write(content)
     Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
     Path.copy(Path.www_css + u'/customer.css', u'./customer.css')
     Path.copy(Path.www_css + u'/markdown.css', u'./markdown.css')
     Path.copy(Path.www_css + u'/normalize.css', u'./normalize.css')
     Path.reset_path()
     return
Example #18
0
    def fix_image(self, content):
        u"""Normalize <img> tags in *content* and register their sources.

        Each <img> is closed with an explicit </img>; tags with a usable
        src get the source queued in self.image_container and rewritten
        to ../images/<filename>, then wrapped in a duokan single-image
        div. Tags without a src are only closed.
        """
        content = Match.fix_html(content)
        for img in re.findall(r'<img[^>]*', content):
            # Strip a trailing self-closing slash, then close the tag.
            if img[-1] == '/':
                img = img[:-1]
            img += '>'

            src = re.search(r'(?<=src=").*?(?=")', img)
            if not src:
                # No src attribute at all: just close the element.
                new_image = img + '</img>'
                content = content.replace(img, new_image)
                continue
            else:
                src = src.group(0)
                if src.replace(' ', '') == '':
                    # Blank src: same treatment as missing src.
                    new_image = img + '</img>'
                    content = content.replace(img, new_image)
                    continue
            src_download = HtmlCreator.fix_image_src(src)
            if src_download:
                filename = self.image_container.add(src_download)
            else:
                filename = ''
            new_image = img.replace('"{}"'.format(src),
                                    '"../images/{}"'.format(filename))
            # zhihu's lazy-load placeholder is redirected to the real file.
            new_image = new_image.replace(
                '//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                '../images/{}'.format(filename))
            new_image += '</img>'
            content = content.replace(
                img,
                '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content
Example #19
0
    def create_work_set(self, target_url):
        u"""Collect a sinablog author's article urls into self.work_set.

        The blog id is extracted from the blog's home url; the data from
        the "about me" page is written into sinablog_info (arguably this
        belongs elsewhere — see original note); finally each listing
        page is parsed and every article url is added to work_set.
        :param target_url: the blog's home page url
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        result = Match.sinablog_author(target_url)
        sinablog_author_id = int(result.group('sinablog_people_id'))

        article_num = self.get_sinablog_question_list(sinablog_author_id)
        # Floor division keeps the page arithmetic integral on Python 3
        # as well (plain / would yield a float there).
        if article_num % 50 != 0:
            page_num = article_num // 50 + 1      # 50 href on 1 page
        else:
            page_num = article_num // 50

        self.question_list[0]['article_num'] = article_num
        # Stored here for now because the "about me" page does not
        # expose the article count.

        self.task_complete_set.add(target_url)

        for page in range(page_num):
            url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(sinablog_author_id, page + 1)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
        return
Example #20
0
    def fix_image(self, content):
        u"""Normalize <img> tags (sinablog variant) and register sources.

        Unlike the zhihu variant, the trailing-slash strip and the
        closing </img> append are commented out here; the sina
        transparent-gif placeholder is redirected to the local file.
        """
        content = Match.fix_html(content)
        for img in re.findall(r'<img[^>]*', content):
            # fix img
            # if img[-1] == '/':
            #     print u"修改前,img为:" + str(img)
            #     img = img[:-1]
            #     print u"修改后,img为:" + str(img[:-1])
            img += '>'
            src = re.search(r'(?<=src=").*?(?=")', img)
            if not src:
                # No src attribute at all: just close the element.
                new_image = img + '</img>'
                content = content.replace(img, new_image)
                continue
            else:
                src = src.group(0)
                if src.replace(' ', '') == '':
                    # Blank src: same treatment as missing src.
                    new_image = img + '</img>'
                    content = content.replace(img, new_image)
                    continue
            src_download = HtmlCreator.fix_image_src(src)
            if src_download:
                filename = self.image_container.add(src_download)
            else:
                filename = ''
            new_image = img.replace('"{}"'.format(src),
                                    '"../images/{}"'.format(filename))
            new_image = new_image.replace('http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',\
                                          '../images/{}'.format(filename)) # hard-coded; could be folded into fix_html
            # new_image += '</img>'
            content = content.replace(
                img,
                '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content
Example #21
0
    def set_info(self, info):
        u"""Merge *info* into self.info and derive the epub title and id.

        The title template and the id key both depend on self.kind; the
        final title is always passed through Match.fix_filename so it is
        safe to use as a file name.
        """
        self.info.update(info)
        kind = self.kind
        # One branch per book kind; the kinds are mutually exclusive.
        if kind == Type.question:
            self.epub.title = u'知乎问题集锦({})'.format(info['title'])
            self.epub.id = info['id']
        elif kind == Type.answer:
            self.epub.title = u'知乎回答集锦({})'.format(info['title'])
            self.epub.id = info['id']
        elif kind == Type.article:
            self.epub.title = u'知乎专栏文章集锦({})'.format(info['title'])
            self.epub.id = info['id']
        elif kind == Type.topic:
            self.epub.title = u'话题_{}({})'.format(info['title'], info['topic_id'])
            self.epub.id = info['topic_id']
        elif kind == Type.collection:
            self.epub.title = u'收藏夹_{}({})'.format(info['title'], info['collection_id'])
            self.epub.id = info['collection_id']
        elif kind == Type.author:
            self.epub.title = u'作者_{}({})'.format(info['name'], info['author_id'])
            self.epub.id = info['author_id']
        elif kind == Type.column:
            self.epub.title = u'专栏_{}({})'.format(info['name'], info['column_id'])
            self.epub.id = info['column_id']
        self.epub.title = Match.fix_filename(self.epub.title)
        return
Example #22
0
def main():
    u"""
    Entry point: read the first url from ReadList.txt, detect the website
    kind and start the book build.  Creates an empty ReadList.txt and exits
    if the file is missing or empty.
    """
    file_name = 'ReadList.txt'
    log.print_log(u'read from %s' % file_name)

    counter = 1
    try:
        with open(file_name, 'r') as read_list:
            lines = read_list.readlines()
            #   Only the first line is examined; '#' starts an inline comment.
            line = lines[0]
            split_url = line.split('#')[0]

            recipe_kind = Match.get_website_kind(split_url)
            print(recipe_kind)
            counter += 1
            if recipe_kind == 'Unsupport type':
                print('Unsupported website or url type. \nPlease check url.')
                sys.exit()
    except IOError:
        print(u"\nOops! No " + file_name + ". creating " + file_name + "...")
        #   Create an empty ReadList.txt so the user can fill it in.
        #   (The original also called close() inside the with-block, which
        #   the context manager already does.)
        with open(file_name, 'w'):
            pass
        sys.exit()
    except IndexError:
        #   readlines() returned an empty list -> the file has no content.
        if 1 == counter:
            print(u"\nOops! No content in " + file_name + u". Please check it out.")
            sys.exit()

    print(u"website type:" + str(recipe_kind))
    game = TEEBook(recipe_kind=recipe_kind, url=None, read_list=file_name)
    game.begin()
    sys.exit()
Example #23
0
 def worker(self, target_url):
     u"""Fetch one url, normalise its html and record it as completed."""
     #   Urls that were already fetched successfully are skipped.
     if target_url in self.work_complete_set:
         return
     Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
     raw_html = Http.get_content(target_url)
     if not raw_html:
         return
     from src.worker.sinablog_worker import sinablogAuthorWorker
     #   Sina-blog pages need their dedicated fix-up recipe; everything
     #   else goes through the default <br>-tag repair (avoids blowing
     #   the parser stack).
     if isinstance(self, sinablogAuthorWorker):
         fixed_html = Match.fix_html(content=raw_html, recipe_kind='sinablog_author')
     else:
         fixed_html = Match.fix_html(content=raw_html)
     self.content_list.append(fixed_html)
     Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
     self.work_complete_set.add(target_url)
     return
Example #24
0
 def parse_article_id(self):
     u"""Extract the jianshu article id from the share-group node."""
     node = self.dom.find("div", class_="share-group")
     #   str(None) is the truthy string 'None', so the missing-node check
     #   must happen BEFORE stringifying (the original checked after,
     #   making the guard dead code).
     if not node:
         Debug.logger.info(u"没有找到文章id")
         return
     result = Match.jianshu_article_id(str(node))
     article_id = result.group('jianshu_article_id')
     self.info['article_id'] = article_id
Example #25
0
 def parse_article_id(self):
     u"""Extract the jianshu article id from the share-group node."""
     node = self.dom.find("div", class_="share-group")
     #   str(None) is the truthy string 'None', so the missing-node check
     #   must happen BEFORE stringifying (the original checked after,
     #   making the guard dead code).
     if not node:
         Debug.logger.info(u"没有找到文章id")
         return
     result = Match.jianshu_article_id(str(node))
     article_id = result.group('jianshu_article_id')
     self.info['article_id'] = article_id
    def download_img_in_question_content(self):
        u"""Download every image referenced in the question detail and
        rewrite the html to point at the local copies."""
        from src.container.image_container import ImageContainer
        container = ImageContainer()
        src_by_tag = Match.match_img_with_src_dict(self.question_info.detail)
        self.question_content_img_filename_list = []
        for img_tag in src_by_tag:
            local_name = container.add(src_by_tag[img_tag])
            self.question_content_img_filename_list.append(local_name)
            self.question_info.detail = self.question_info.detail.replace(
                img_tag, Match.create_img_element_with_file_name(local_name))

        container.start_download()

        #   Once the downloads finish, accumulate the total image size (KB).
        for local_name in self.question_content_img_filename_list:
            self.question_content_img_size += Path.get_img_size_by_filename_kb(local_name)
        return
Example #27
0
    def download_img_in_question_content(self):
        u"""Localise all images embedded in the question detail html."""
        from src.container.image_container import ImageContainer
        img_container = ImageContainer()
        detail = self.question_info.detail
        mapping = Match.match_img_with_src_dict(detail)
        filenames = []
        for tag in mapping:
            name = img_container.add(mapping[tag])
            filenames.append(name)
            #   Swap the remote <img> element for a local-file element.
            detail = detail.replace(tag, Match.create_img_element_with_file_name(name))
        self.question_info.detail = detail
        self.question_content_img_filename_list = filenames

        img_container.start_download()

        #   After downloading, tally the combined size in KB.
        for name in filenames:
            self.question_content_img_size += Path.get_img_size_by_filename_kb(name)
        return
Example #28
0
    def set_info(self, info):
        u"""
        Store parsed metadata and derive the epub title/id from the task kind.

        :param info: dict of metadata parsed for the current task
        """
        self.info.update(info)
        #   kind -> (title template, info keys fed to the template,
        #            info key used as the epub id)
        meta_table = {
            Type.csdnblog_author: (u'csdn博客作者_{}({})文章集锦', ('creator_name', 'creator_id'), 'creator_id'),
            Type.cnblogs_author: (u'cnblogs作者_{}({})文章集锦', ('creator_name', 'creator_id'), 'creator_id'),
            Type.jianshu_author: (u'简书作者_{}({})文章集锦', ('creator_name', 'creator_id'), 'creator_id'),
            Type.jianshu_collection: (u'简书专题_{}({})', ('title', 'collection_fake_id'), 'collection_fake_id'),
            Type.jianshu_notebooks: (u'简书文集_{}({})', ('title', 'notebooks_id'), 'notebooks_id'),
            Type.jianshu_article: (u'简书博文集锦({})', ('title',), 'id'),
            Type.sinablog_author: (u'新浪博客_{}({})', ('creator_name', 'creator_id'), 'creator_id'),
            Type.sinablog_article: (u'新浪博客博文集锦({})', ('title',), 'id'),
            Type.question: (u'知乎问题集锦({})', ('title',), 'id'),
            Type.answer: (u'知乎回答集锦({})', ('title',), 'id'),
            Type.article: (u'知乎专栏文章集锦({})', ('title',), 'id'),
            Type.topic: (u'知乎话题_{}({})', ('title', 'topic_id'), 'topic_id'),
            Type.collection: (u'知乎收藏夹_{}({})', ('title', 'collection_id'), 'collection_id'),
            Type.author: (u'知乎作者_{}({})', ('name', 'author_id'), 'author_id'),
            Type.column: (u'知乎专栏_{}({})', ('name', 'column_id'), 'column_id'),
            Type.yiibai: (u'易百教程_{}', ('title',), 'creator_id'),
            Type.talkpython: (u'TalkPythonToMe', (), 'creator_id'),
        }
        if self.kind in meta_table:
            template, title_keys, id_key = meta_table[self.kind]
            self.epub.title = template.format(*[info[key] for key in title_keys])
            self.epub.id = info[id_key]

        from src.html5lib.constants import entities_reverse
        #   Convert html entities in the title back to plain characters.
        self.epub.title = Match.replace_words(self.epub.title, entities_reverse)
        return
Example #29
0
 def parse_update_date(self):
     u"""
     Parse the latest update date from the grey time span.
     """
     time_node = self.node.select(u"div.zg-gray-normal span.time")[0]
     raw_text = Tag.get_content(time_node)
     self.set_attr("update_date", Match.parse_date(raw_text))
     return
Example #30
0
    def get_article_info(self):
        u"""
        Scrape title, date and body text from the current dom.

        Returns a dict of article fields on success.  On any unexpected
        error an empty list is returned -- NOTE(review): callers likely
        expect a dict; confirm this mixed return type is intended.
        """
        data = {}
        try:

            try:
                #   The page's centred paragraphs carry the title (index 0)
                #   and a parenthesised Chinese date (index 1).
                title_tationl = self.dom.find_all('p', align="center")
                # print  u"标题 {}".format(span_dom.text.strip()),
                resultstr = title_tationl[0].text
                data['title'] = resultstr

                #   Pull the text between the full-width parentheses, e.g.
                #   u'(某年某月某日)' -> u'某年某月某日'.
                ttd = title_tationl[1].text
                td = (str(ttd).split('(')[-1]).split(')')[0]
                # date_time = datetime.datetime.strptime(td, '%Y年%m月%d日')
                print td
                #   Split the date on the year/month/day markers 年/月/日.
                ye = td.split('年')[0]
                mo = (td.split('年')[-1]).split('月')[0]
                da = (td.split('月')[-1]).split('日')[0]

                #   chinese2datsty presumably converts a Chinese-numeral
                #   year to digits -- TODO confirm against its definition.
                yey = chinese2datsty(ye)

                print u'{}年{}月{}'.format(yey, mo, da)
                data['updated_time'] = u'{}年{}月{}'.format(yey, mo, da)

            except IndexError:
                #   Fewer than two centred paragraphs: fall back to the
                #   html <title> and leave the date empty.
                data['title'] = Match.replace_specile_chars(self.dom.title)
                data['updated_time'] = ''
            data['title'] = str(data['title']).strip()

            article_body = ""

            #   Paragraphs 0 and 1 were consumed as title/date above, so the
            #   article body starts at index 2.
            content = self.dom.find_all('p')
            for ii in range(2, len(content)):
                x = content[ii]
                # print x
                xxt = u'<p>{}</p>'.format(x.text)
                article_body += str(xxt)

            data['content'] = str(article_body)

            #   The source page exposes no vote/comment counts.
            data['voteup_count'] = ""
            data['comment_count'] = ""

            data['image_url'] = ''

            #   Hard-coded author placeholders -- the site provides no
            #   author information for these articles.
            data['author_id'] = 'meng-qing-xue-81'

            data['author_name'] = '   '
            data['author_headline'] = ''
            data[
                'author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
            data['author_gender'] = '0'
        except Exception as e:
            print e.message
            return []

        return data
Example #31
0
 def parse_collection_id(self):
     u"""
     Parse the collection id out of the follower-count anchor's href.
     """
     anchor = self.node.select(
         u'div.zg-gray-normal a[data-za-l="collection_followers_count"]')[0]
     href = Tag.get_attr(anchor, u"href")
     self.set_attr("collection_id", Match.match_collection_id(href))
     return
Example #32
0
    def create_book(self):
        u"""Assemble the epub from the collected task results."""
        #   Derive the output file name (volume number appended when split).
        title = Match.fix_filename(self.book_title)
        if self.is_split:
            title = self.book_title + u'_卷{}'.format(self.chapter_no)

        #   Work inside the temporary book resource directory.
        Path.chdir(Path.book_pool_path)
        epub = Epub(title)
        #   task type -> info-page generator (question and answer share one).
        info_page_maker = {
            Type.question: self.generate_question_info_page,
            Type.answer: self.generate_question_info_page,
            Type.collection: self.generate_collection_info_page,
            Type.topic: self.generate_topic_info_page,
            Type.author: self.generate_author_info_page,
            Type.column: self.generate_column_info_page,
            Type.article: self.generate_article_info_page,
        }
        for task_result in self.task_result_list:
            maker = info_page_maker.get(task_result.task.task_type)
            chapter_src = maker(task_result.info_page) if maker else ''
            epub.create_chapter(chapter_src, task_result.get_title())
            for question in task_result.question_list:
                #   Register the question's image files with the epub.
                for filename in question.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                question_src = self.generate_question_page(question)
                epub.add_html(question_src, question.question_info.title)

            for column in task_result.column_list:
                #   Register the column's image files with the epub.
                for filename in column.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                for article in column.article_list:
                    article_src = self.generate_article_page(article)
                    epub.add_html(article_src, article.title)
            epub.finish_chapter()

        epub.set_creator(u'ZhihuHelp1.8.0')
        epub.set_language(u'zh-cn')
        epub.set_book_id()
        epub.set_output_path(Path.result_path)
        for sheet in (u'markdown', u'customer', u'normalize', u'bootstrap'):
            epub.add_css(Path.base_path + u'/www/css/' + sheet + u'.css')
        epub.create()

        Path.reset_path()
        return
Example #33
0
 def parse_column(command):
     u"""Build the SingleTask describing a zhihu column crawl."""
     column_id = Match.column(command).group('column_id')
     task = SingleTask()
     task.kind = 'column'
     task.book.kind = 'column'
     task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
     task.book.sql.info = 'select * from ColumnInfo where column_id = "{}" '.format(column_id)
     task.book.sql.question = ''
     task.book.sql.answer = 'select * from Article where column_id = "{}" '.format(column_id)
     return task
Example #34
0
 def set_dom(self, dom):
     u"""Bind the answer dom and split it into header/body/footer parts."""
     self.info = {}
     #   Answers carrying an answer-status node (folded etc.) are skipped.
     if dom and not dom.select('div.answer-status'):
         self.header = dom.find('div', class_='zm-item-vote-info')
         self.body = dom.find('textarea', class_='content')
         self.footer = dom.find('div', class_='zm-meta-panel')
         if self.body:
             raw = self.get_tag_content(self.body)
             self.content = BeautifulSoup(Match.fix_html(raw), 'html.parser')
         self.author_parser.set_dom(dom)
     return
Example #35
0
    def create_work_set(self, target_url):
        u"""
        From the blog's home-page url: extract the blog id with a regex,
        scrape the "about me" page to fill SinaBlog_Info (this arguably
        does not belong in this function and could be moved out), then walk
        the article-list pages and put every article url into work_set.

        :param target_url: url of the blog home page
        :return:
        """
        Debug.logger.debug(u"target_url是:" + str(target_url))
        if target_url in self.task_complete_set:
            return
        result = Match.SinaBlog(target_url)
        SinaBlog_author_id = int(result.group('SinaBlog_people_id'))

        #   One article-list page holds 50 article links.
        href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(
            SinaBlog_author_id)
        href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(
            SinaBlog_author_id)

        # ############ The part below belongs in SinaBlogAuthorWorker (it
        # writes SinaBlog_Info); kept here for now, to be refactored later.
        content_profile = Http.get_content(href_profile)

        parser = SinaBlogParser(content_profile)
        self.question_list += parser.get_SinaBlog_info_list()
        # Debug.logger.debug(u"create_work_set中的question_list是什么??" + str(self.question_list))
        # ############ End of the SinaBlogAuthorWorker part.

        # content_index = Http.get_content(href_index)
        content_article_list = Http.get_content(href_article_list)

        article_num = int(self.parse_article_num(content_article_list))
        Debug.logger.debug(u"article_num:" + str(article_num))
        #   Python-2 integer division: round the page count up.
        if article_num % 50 != 0:
            page_num = article_num / 50 + 1  # one list page holds 50 links
        else:
            page_num = article_num / 50

        #   NOTE: storing article_num on question_list[0] means each input
        #   line can carry only ONE sina-blog address; the "about me" page
        #   does not expose the article count, hence this workaround.
        self.question_list[0][
            'article_num'] = article_num

        self.task_complete_set.add(target_url)

        #   Visit every article-list page and collect the article urls.
        for page in range(page_num):
            url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(
                SinaBlog_author_id, page + 1)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
            # self.work_set.add(article_list[0])
        return
Example #36
0
 def parse_author(command):
     u"""Build the SingleTask describing a zhihu author crawl."""
     author_id = Match.author(command).group('author_id')
     task = SingleTask()
     task.kind = 'author'
     task.book.kind = 'author'
     task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
     task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id)
     task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where author_id = "{}")'.format(
         author_id)
     task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id)
     return task
Example #37
0
 def parse_article(command):
     u"""Build the SingleTask for a single zhihu column article."""
     matched = Match.article(command)
     column_id = matched.group('column_id')
     article_id = matched.group('article_id')
     task = SingleTask()
     task.kind = 'article'
     task.book.kind = 'article'
     task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(column_id, article_id)
     task.book.sql.info = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
     task.book.sql.question = ''
     task.book.sql.answer = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
     return task
Example #38
0
        def parse_question(command):
            u"""Build the SingleTask for a single zhihu question."""
            question_id = Match.question(command).group('question_id')
            task = SingleTask()
            task.kind = 'question'
            task.book.kind = 'question'

            task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)
            task.book.sql.info = ' question_id = "{}" '.format(question_id)
            task.book.sql.question = 'question_id = "{}"'.format(question_id)
            task.book.sql.answer = 'question_id = "{}"'.format(question_id)
            return task
Example #39
0
 def parse_jianshu_collection(command):
     u"""Build the SingleTask for a jianshu collection crawl."""
     collection_id = Match.jianshu_collection(command).group('collection_id')
     task = SingleTask()
     task.kind = 'jianshu_collection'
     task.book.kind = 'jianshu_collection'
     task.spider.href = 'http://www.jianshu.com/collection/{}'.format(collection_id)
     task.book.sql.info = 'select * from jianshu_collection_info where collection_fake_id = "{}"'.format(
         collection_id
     )
     task.book.sql.answer = ('select * from jianshu_article where href in (select href from '
                             'jianshu_collection_index where collection_fake_id = "{}")').format(collection_id)
     return task
        def parse_answer(command):
            u"""
            Build the SingleTask for a single zhihu answer url.

            NOTE(review): the sibling parsers assign task.book.sql.*, but
            this one writes task.book.property.sql.* -- confirm which
            attribute path the book builder actually reads.
            """
            result = Match.answer(command)
            question_id = result.group('question_id')
            answer_id = result.group('answer_id')
            task = SingleTask()
            task.kind = 'answer'
            task.spider.href = 'http://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)

            task.book.kind = 'answer'
            task.book.property.sql.info = ''
            task.book.property.sql.question = 'question_id = "{}"'.format(question_id)
            task.book.property.sql.answer = 'question_id = "{}" and answer_id = "{}"'.format(question_id, answer_id)
            return task
Example #41
0
 def parse_jianshu_notebooks(command):
     u"""Build the SingleTask for a jianshu notebook crawl."""
     notebooks_id = Match.jianshu_notebooks(command).group('notebooks_id')
     task = SingleTask()
     task.kind = 'jianshu_notebooks'
     task.book.kind = 'jianshu_notebooks'
     task.spider.href = 'http://www.jianshu.com/notebooks/{}/latest'.format(notebooks_id)  # config file???
     task.book.sql.info = 'select * from jianshu_notebooks_info where notebooks_id = "{}"'.format(
         notebooks_id
     )
     task.book.sql.answer = ('select * from jianshu_article where href in (select href from '
                             'jianshu_notebooks_index where notebooks_id = "{}")').format(notebooks_id)
     return task
Example #42
0
        def parse_question(command):
            u"""Build the SingleTask for a single zhihu question url."""
            question_id = Match.question(command).group('question_id')
            task = SingleTask()
            task.kind = 'question'

            task.spider.href = 'https://www.zhihu.com/question/{}'.format(
                question_id)
            task.book.kind = 'question'
            #   Info/question/answer queries all key on the question id.
            task.book.sql.info = ' question_id = "{}" '.format(question_id)
            task.book.sql.question = 'question_id = "{}"'.format(question_id)
            task.book.sql.answer = 'question_id = "{}"'.format(question_id)
            return task
Example #43
0
 def parse_topic(command):
     u"""Build the SingleTask describing a zhihu topic crawl."""
     topic_id = Match.topic(command).group('topic_id')
     task = SingleTask()
     task.kind = 'topic'
     task.book.kind = 'topic'
     task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
     task.book.sql.info = 'select * from TopicInfo where topic_id = "{}"'.format(topic_id)
     task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from TopicIndex where topic_id = "{}"))'.format(
         topic_id)
     task.book.sql.answer = 'select * from Answer where href in (select href from TopicIndex where topic_id = "{}")'.format(
         topic_id)
     return task
Example #44
0
    def fix_image(self, content, recipe):
        u"""
        Rewrite every <img> in *content* to reference a locally-downloaded
        copy, applying per-recipe quirks, and wrap each in a duokan div.

        :param content: raw html of the page
        :param recipe: recipe kind (a Type.* value / group)
        :return: rewritten html
        """
        content = Match.fix_html(content=content, recipe_kind=recipe)
        for img in re.findall(r'<img[^>]*', content):
            if recipe not in [Type.sinablog_author, Type.cnblogs_author]:
                #   Drop a trailing '/' from self-closing tags so a plain
                #   '>' can be appended below.
                if img[-1] == '/':
                    img = img[:-1]
            img += '>'
            src = re.search(r'(?<=src=").*?(?=")', img)
            #   Images without a usable src just get a closing tag added.
            if not src:
                new_image = img + '</img>'
                content = content.replace(img, new_image)
                continue
            else:
                src = src.group(0)
                if src.replace(' ', '') == '':
                    new_image = img + '</img>'
                    content = content.replace(img, new_image)
                    continue
            src_download = HtmlCreator.fix_image_src(src)
            if src_download:
                if recipe in Type.zhihu and not src_download.startswith('http'):
                    #   Relative zhuanlan image href: rebuild the absolute
                    #   pic2.zhimg.com url from the bare image hash.
                    src_download = src_download.split('.')[0]
                    filename = self.image_container.add('https://pic2.zhimg.com/'+src_download+'_b.jpg')
                elif recipe in Type.generic:
                    filename = ''    # TODO
                else:
                    filename = self.image_container.add(src_download)
            else:
                filename = ''
            new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))

            if recipe in Type.jianshu:
                #   Swap src and data-original-src via a temporary name so
                #   the real image url ends up in src.  (Could be cleaner.)
                new_image = new_image.replace('data-original-src', 'temppicsr')
                new_image = new_image.replace('src', 'falsesrc')
                new_image = new_image.replace('temppicsr', 'src')
                new_image += '</img>'
            elif recipe in Type.sinablog:
                #   Hard-coded placeholder gif; could move into fix_html.
                new_image = new_image.replace('http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',\
                                          '../images/{}'.format(filename))
            elif recipe in Type.zhihu:
                #   Replace zhihu's lazy-load whitedot placeholder.
                new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                                              '../images/{}'.format(filename))
                new_image += '</img>'
            elif recipe in Type.cnblogs:
                pass
            content = content.replace(img, '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content
Example #45
0
 def parse_collection(command):
     u"""Build the SingleTask describing a zhihu collection crawl."""
     collection_id = Match.collection(command).group('collection_id')
     task = SingleTask()
     task.kind = 'collection'
     task.book.kind = 'collection'
     task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
     task.book.sql.info = 'select * from CollectionInfo where collection_id = "{}"'.format(
         collection_id)
     task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from CollectionIndex where collection_id = "{}"))'.format(
         collection_id)
     task.book.sql.answer = 'select * from Answer where href in (select href from CollectionIndex where collection_id = "{}")'.format(
         collection_id)
     return task
Example #46
0
    def worker(self, target_url):
        u"""Fetch one url, repair its html and record it as completed."""
        if target_url in self.work_complete_set:
            #   Already fetched successfully; skip.
            return

        Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
        raw_html = Http.get_content(target_url)
        if not raw_html:
            return
        #   <br> tags must be normalised to avoid a parser stack overflow.
        self.content_list.append(Match.fix_html(raw_html))
        Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
        self.work_complete_set.add(target_url)
        return
Example #47
0
    def worker(self, target_url):
        u"""Download one target url, fix its html and mark it complete."""
        #   Skip urls that were already fetched successfully.
        if target_url in self.work_complete_set:
            return

        Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
        page = Http.get_content(target_url)
        if not page:
            return
        #   Repair <br> tags so the html parser does not blow its stack.
        fixed = Match.fix_html(page)
        self.content_list.append(fixed)
        Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
        self.work_complete_set.add(target_url)
        return
Example #48
0
 def parse_column(command):
     u"""Build the SingleTask for a zhihu column url."""
     column_id = Match.column(command).group('column_id')
     task = SingleTask()
     task.kind = 'column'
     task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(
         column_id)
     task.book.kind = 'column'
     #   The column's articles live in the Article table.
     task.book.sql.info = 'select * from ColumnInfo where column_id = "{}" '.format(
         column_id)
     task.book.sql.question = ''
     task.book.sql.answer = 'select * from Article where column_id = "{}" '.format(
         column_id)
     return task
Example #49
0
    def parse_creator_profile_id(self, dom):
        u"""
        dom => div#zh-single-answer-author-info
        Extract the creator's profile_id from the list-title anchor.

        :param dom: user info node
        :type dom: bs4.Tag
        :return:
        :rtype: None
        """
        title_anchor = dom.select(u"h2.zm-list-content-title a")[0]
        href = Tag.get_attr(title_anchor, u"href")
        self.set_attr("profile_id", Match.match_author_id(href))
        return
Example #50
0
 def parse_collection(command):
     u"""Build the SingleTask for a zhihu collection url."""
     collection_id = Match.collection(command).group('collection_id')
     task = SingleTask()
     task.kind = 'collection'
     task.spider.href = 'https://www.zhihu.com/collection/{}'.format(
         collection_id)
     task.book.kind = 'collection'
     #   Questions/answers are joined through the CollectionIndex table.
     task.book.sql.info = 'select * from CollectionInfo where collection_id = "{}"'.format(
         collection_id)
     task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from CollectionIndex where collection_id = "{}"))'.format(
         collection_id)
     task.book.sql.answer = 'select * from Answer where href in (select href from CollectionIndex where collection_id = "{}")'.format(
         collection_id)
     return task
Example #51
0
    def parse_creator_profile_id(self, dom):
        u"""
        dom => div#zh-single-answer-author-info
        Pull the creator's profile_id out of the title anchor's href.

        :param dom: user info node
        :type dom: bs4.Tag
        :return:
        :rtype: None
        """
        anchor = dom.select(u"h2.zm-list-content-title a")[0]
        profile_link = Tag.get_attr(anchor, u"href")
        profile_id = Match.match_author_id(profile_link)
        self.set_attr("profile_id", profile_id)
        return
Example #52
0
 def parse_article(command):
     u"""Build the SingleTask for a single zhihu column article url."""
     matched = Match.article(command)
     column_id = matched.group('column_id')
     article_id = matched.group('article_id')
     task = SingleTask()
     task.kind = 'article'
     task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(
         column_id, article_id)
     task.book.kind = 'article'
     #   Both queries key on the (column_id, article_id) pair.
     task.book.sql.info = ' column_id = "{}" and article_id = "{}" '.format(
         column_id, article_id)
     task.book.sql.question = ''
     task.book.sql.answer = ' column_id = "{}" and article_id = "{}" '.format(
         column_id, article_id)
     return task
Example #53
0
    def parse_raw_creator_avatar(self):
        u"""
        dom => self.node
        Extract the creator's avatar url (raw and normalised forms).

        :return:
        :rtype: None
        """
        avatar_link = self.node.select(u"div.zm-side-section-inner a.zm-list-avatar-link")[0]
        avatar_img = avatar_link.select(u".zm-list-avatar-medium")[0]
        raw_src = Tag.get_attr(avatar_img, u"src")
        self.set_attr("raw_avatar", raw_src)
        self.set_attr("avatar", Match.format_avatar(raw_src))
        return
Example #54
0
 def parse_topic(command):
     u"""Build the SingleTask for a zhihu topic url."""
     topic_id = Match.topic(command).group('topic_id')
     task = SingleTask()
     task.kind = 'topic'
     task.spider.href = 'https://www.zhihu.com/topic/{}'.format(
         topic_id)
     task.book.kind = 'topic'
     #   Questions/answers are joined through the TopicIndex table.
     task.book.sql.info = 'select * from TopicInfo where topic_id = "{}"'.format(
         topic_id)
     task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where href in (select href from TopicIndex where topic_id = "{}"))'.format(
         topic_id)
     task.book.sql.answer = 'select * from Answer where href in (select href from TopicIndex where topic_id = "{}")'.format(
         topic_id)
     return task
Example #55
0
 def parse_author(command):
     u"""Build the SingleTask for a zhihu author (people) url."""
     author_id = Match.author(command).group('author_id')
     task = SingleTask()
     task.kind = 'author'
     task.spider.href = 'https://www.zhihu.com/people/{}'.format(
         author_id)
     task.book.kind = 'author'
     #   Questions are derived from the author's answers.
     task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(
         author_id)
     task.book.sql.question = 'select * from Question where question_id in (select question_id from Answer where author_id = "{}")'.format(
         author_id)
     task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(
         author_id)
     return task
Example #56
0
    def generate_book_title(self):
        """
        Build and store the combined book title.
        :return:
        :rtype:str
        """
        titles = [result.get_title() for result in self.task_result_list]
        title = u'_'.join(titles)
        if len(title) > 50:
            #   Overlong titles are truncated and annotated with the count.
            title = title[:50] + u'。。。等' + str(len(titles)) + u'本电子书'
        title = Match.replace_danger_char_for_filesystem(title)

        self.book_title = title
        return title
Example #57
0
    def parse_raw_creator_avatar(self):
        u"""
        dom => self.node
        Parse the creator's avatar url and store both forms.

        :return:
        :rtype: None
        """
        link_node = self.node.select(
            u"div.zm-side-section-inner a.zm-list-avatar-link")[0]
        img_node = link_node.select(u".zm-list-avatar-medium")[0]
        src = Tag.get_attr(img_node, u"src")
        self.set_attr("raw_avatar", src)
        #   Store the normalised form alongside the raw src.
        self.set_attr("avatar", Match.format_avatar(src))
        return
Example #58
0
    def generate_book_title(self):
        """
        生成并设置
        :return:
        :rtype:str
        """
        title_list = []
        for task_result in self.task_result_list:
            title_list.append(task_result.get_title())
        title = u'_'.join(title_list)
        if len(title) > 50:
            title = title[:50] + u'。。。等' + str(len(title_list)) + u'本电子书'
        title = Match.replace_danger_char_for_filesystem(title)

        self.book_title = title
        return title