Example #1
 def init_config(recipe_kind):
     if recipe_kind == 'zhihu':      # TODO: once another site needs a login, replace this hard-coded check
         login = Login(recipe_kind='zhihu')
     else:
         return
     # !!!!! Set Config.remember_account to false before releasing !!!!! Login is only needed the first time; a cookie is reused afterwards
     # after a successful login the account is recorded automatically
     if Config.remember_account_set:
         Debug.logger.info(u'检测到有设置文件,直接使用之前的设置')
         # if raw_input():
         # login.start()
         # Config.picture_quality = guide.set_picture_quality()
         Config.picture_quality = 1
         # else:
         try:
             Http.set_cookie()   # sinablog, jianshu: DontNeed
         except TypeError:
             print u"没有找到登录成功的cookie记录, 请重新登录"
             login.start()
     else:
         log.warning_log(u"Please login...")
         login.start()
         # Config.picture_quality = guide.set_picture_quality()
         Config.picture_quality = 1
         Config.remember_account_set = True
     # save config
     Config._save()
     return
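The TODO in the example above asks for the hard-coded 'zhihu' check to go away once another site needs a login. A minimal sketch of one way to do that, assuming Login keeps accepting a recipe_kind argument (the LOGIN_REQUIRED registry is hypothetical):

    LOGIN_REQUIRED = {'zhihu'}   # hypothetical registry; add new recipe kinds here as they gain login support

    def make_login(recipe_kind):
        # return a Login object only for sites that actually require logging in
        if recipe_kind in LOGIN_REQUIRED:
            return Login(recipe_kind=recipe_kind)
        return None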
Example #2
    def login(self, account, password, captcha=''):
        content = Http.get_content('https://www.zhihu.com/')
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'登陆失败')
            Debug.logger.info(u'敲击回车重新发送登陆请求')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True,
                         'captcha': captcha}
        else:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: as long as this is present, Zhihu treats the request as coming from a browser rather than a script
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
        if not result:
            Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
            return False
        response = json.loads(result)

        if response['r'] == 0:
            print u'登陆成功!'
            print u'登陆账号:', account
            print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
            if raw_input() == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'帐号密码已保存,可通过修改config.json修改设置'
            else:
                Config.account, Config.password, Config.remember_account = '', '', False
                print u'跳过保存环节,进入下一流程'
            Config._save()
            cookie = self.get_cookie()
            DB.execute('delete from LoginRecord')  # after a successful login, clear the old login records so a stale record is not picked up next time
            data = {}
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'登陆失败'
            Debug.print_dict(response)
            return False
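A hedged sketch of how the login call above might be driven by a caller, retrying once with a captcha when the password-only attempt fails (the retry policy and the helper name are assumptions; login and get_captcha are the methods shown in these examples):

    def login_with_retry(login_obj, account, password):
        # first try without a captcha; if Zhihu rejects it, fetch a captcha and try once more
        if login_obj.login(account, password):
            return True
        captcha = login_obj.get_captcha()   # get_captcha is shown in Example #10 below
        return login_obj.login(account, password, captcha=captcha)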
Example #3
    def create_work_set(self, target_url):
        u"""
        根据博客首页的url, 首先通过re获得博客id, 然后根据博客"关于我"的页面的内容获得写入SinaBlog_Info
        的数据(这部分理应不在这个函数中, 可以改进), 最后通过博客目录页面的内容, 获得每篇博文的地址,
        放入work_set中

        :param target_url: 博客首页的url
        :return:
        """
        Debug.logger.debug(u"target_url是:" + str(target_url))
        if target_url in self.task_complete_set:
            return
        result = Match.SinaBlog(target_url)
        SinaBlog_author_id = int(result.group('SinaBlog_people_id'))

        href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(
            SinaBlog_author_id)
        href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(
            SinaBlog_author_id)

        # ############ the part below really belongs to SinaBlogAuthorWorker (it writes SinaBlog_Info); kept here for now, to be refactored later
        content_profile = Http.get_content(href_profile)

        parser = SinaBlogParser(content_profile)
        self.question_list += parser.get_SinaBlog_info_list()
        # Debug.logger.debug(u"create_work_set中的question_list是什么??" + str(self.question_list))
        # #############上面这部分应该是SinaBlogAuthorWorker的内容, 写到SinaBlog_Info, 暂时写在这, 以后再优化

        # content_index = Http.get_content(href_index)
        content_article_list = Http.get_content(href_article_list)

        article_num = int(self.parse_article_num(content_article_list))
        Debug.logger.debug(u"article_num:" + str(article_num))
        if article_num % 50 != 0:
            page_num = article_num / 50 + 1  # article-list pages hold 50 post links each
        else:
            page_num = article_num / 50

        self.question_list[0][
            'article_num'] = article_num  # this means only one Sina blog address can be given per line!!!
        # the line above has to stay like this for now, because the "About me" page does not expose the article count

        self.task_complete_set.add(target_url)

        for page in range(page_num):
            url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(
                SinaBlog_author_id, page + 1)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
            # self.work_set.add(article_list[0])
        return
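The page-count branch above relies on Python 2 integer division (article_num / 50 truncates). A minimal equivalent using ceiling division, which gives the same result and also survives a move to Python 3:

    def page_count(article_num, per_page=50):
        # ceiling division: identical to the if/else above for any non-negative article_num
        return (article_num + per_page - 1) // per_page

    assert page_count(0) == 0 and page_count(50) == 1 and page_count(51) == 2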
Example #4
    def __init__(self, task_list):
        self.task_set = set(task_list)
        self.work_set = set()  # pool of URLs waiting to be crawled
        self.answer_list = []
        self.question_list = []
        self.thread_pool = ThreadPool(Config.max_thread)

        self.info_list = []
        self.extra_index_list = []
        self.info_url_set = self.task_set.copy()

        self.add_property()  # add extended attributes
        Http.set_cookie()
 def get_sinablog_question_list(self, author_id):
     u"""
     get sinablog_info, article_num
     :param author_id:
     :return:
     """
     href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(author_id)
     href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(author_id)
     content_profile = Http.get_content(href_profile)
     parser = SinaBlogParser(content_profile)
     self.question_list += parser.get_extra_info()
     content_article_list = Http.get_content(href_article_list)
     article_num = int(self.parse_article_num(content_article_list))
     return article_num
Example #6
    def create_work_set(self, target_url):
        u"""
        根据博客首页的url, 首先通过re获得博客id, 然后根据博客"关于我"的页面的内容获得写入SinaBlog_Info
        的数据(这部分理应不在这个函数中, 可以改进), 最后通过博客目录页面的内容, 获得每篇博文的地址,
        放入work_set中

        :param target_url: 博客首页的url
        :return:
        """
        Debug.logger.debug(u"target_url是:" + str(target_url))
        if target_url in self.task_complete_set:
            return
        result = Match.SinaBlog(target_url)
        SinaBlog_author_id = int(result.group('SinaBlog_people_id'))

        href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(SinaBlog_author_id)
        href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(SinaBlog_author_id)

        # ############ the part below really belongs to SinaBlogAuthorWorker (it writes SinaBlog_Info); kept here for now, to be refactored later
        content_profile = Http.get_content(href_profile)

        parser = SinaBlogParser(content_profile)
        self.question_list += parser.get_SinaBlog_info_list()
        # Debug.logger.debug(u"create_work_set中的question_list是什么??" + str(self.question_list))
        # #############上面这部分应该是SinaBlogAuthorWorker的内容, 写到SinaBlog_Info, 暂时写在这, 以后再优化

        # content_index = Http.get_content(href_index)
        content_article_list = Http.get_content(href_article_list)

        article_num = int(self.parse_article_num(content_article_list))
        Debug.logger.debug(u"article_num:" + str(article_num))
        if article_num % 50 != 0:
            page_num = article_num/50 + 1      # article-list pages hold 50 post links each
        else:
            page_num = article_num / 50

        self.question_list[0]['article_num'] = article_num  # this means only one Sina blog address can be given per line!!!
        # the line above has to stay like this for now, because the "About me" page does not expose the article count

        self.task_complete_set.add(target_url)

        for page in range(page_num):
            url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(SinaBlog_author_id, page+1)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
            # self.work_set.add(article_list[0])
        return
Example #7
 def worker(self, target_url):
     content = Http.get_content(target_url)
     if not content:
         return
     self.work_set.discard(target_url)
     self.parse_content(content)
     return
Example #8
    def create_work_set(self, target_url):
        u"""
        From the content of target_url (e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles),
        first obtain the creator_id, then derive the page count from the number of articles, open each
        page in turn and put every article address into work_set.
        :param target_url:
        :return:
        """
        if target_url in self.task_complete_set:
            return
        id_result = Match.jianshu_author(target_url)
        jianshu_id = id_result.group('jianshu_id')
        article_num, article_list = self.get_jianshu_question_list(target_url)
        self.task_complete_set.add(target_url)
        if article_num % 9 != 0:
            page_num = article_num/9 + 1      # 9 href on one page
        else:
            page_num = article_num / 9

        for item in article_list:
            self.work_set.add(item)
        for page in range(page_num-1):          # page+2, don't need to get the first page
            url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page+2)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
        return
Example #9
    def create_work_set(self, target_url):
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        self.column_id = result.group("column_id")
        content = Http.get_content("https://zhuanlan.zhihu.com/api/columns/" + self.column_id)
        if not content:
            return
        raw_info = json.loads(content)
        info = {}
        info["creator_id"] = raw_info["creator"]["slug"]
        info["creator_hash"] = raw_info["creator"]["hash"]
        info["creator_sign"] = raw_info["creator"]["bio"]
        info["creator_name"] = raw_info["creator"]["name"]
        info["creator_logo"] = (
            raw_info["creator"]["avatar"]["template"]
            .replace("{id}", raw_info["creator"]["avatar"]["id"])
            .replace("_{size}", "")
        )

        info["column_id"] = raw_info["slug"]
        info["name"] = raw_info["name"]
        info["logo"] = (
            raw_info["creator"]["avatar"]["template"].replace("{id}", raw_info["avatar"]["id"]).replace("_{size}", "")
        )
        info["article"] = raw_info["postsCount"]
        info["follower"] = raw_info["followersCount"]
        info["description"] = raw_info["description"]
        self.info_list.append(info)
        self.task_complete_set.add(target_url)
        detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
        for i in range(info["article"] / 10 + 1):
            self.work_set.add(detect_url + str(i * 10))
        return
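The avatar handling in the column examples expands a URL template by substituting {id} and dropping the _{size} suffix. A small worked illustration with a made-up template value (the real string comes from the zhuanlan API response):

    template = 'https://pic1.zhimg.com/{id}_{size}.jpg'   # hypothetical value of raw_info['creator']['avatar']['template']
    avatar_id = 'abc123'
    logo = template.replace('{id}', avatar_id).replace('_{size}', '')
    assert logo == 'https://pic1.zhimg.com/abc123.jpg'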
Example #10
    def get_captcha():
        # the r parameter here is a 13-digit unix timestamp (in milliseconds)
        unix_time_stp = str(int(1000 * time.time()))[0:13]
        content = Http.get_content(
            'https://www.zhihu.com/captcha.gif?r={}&type=login'.format(
                unix_time_stp))  # fetch the captcha image
        captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'

        image = open(captcha_path, 'wb')
        image.write(content)
        image.close()

        print u'请输入您所看到的验证码'
        print u'验证码在助手所处的文件夹中'
        print u'验证码位置:'
        print captcha_path
        if platform.system() == "Darwin":
            os.system(u'open "{}" &'.format(captcha_path).encode(
                sys.stdout.encoding))
        else:
            webbrowser.get().open_new_tab(u'file:///' + captcha_path)

        print u'如果不需要输入验证码可点按回车跳过此步'
        captcha = raw_input()
        return captcha
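As the comment above notes, the r parameter is a millisecond Unix timestamp, which currently has 13 digits; the [0:13] slice is just a guard. A quick check of that construction:

    import time

    unix_time_stp = str(int(1000 * time.time()))[0:13]
    # milliseconds since the epoch have had 13 digits since 2001 and will until the year 2286
    assert len(unix_time_stp) == 13 and unix_time_stp.isdigit()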
Example #11
    def create_work_set(self, target_url):
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        self.column_id = result.group('column_id')
        content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + self.column_id)
        if not content:
            return
        raw_info = json.loads(content)
        info = {}
        info['creator_id'] = raw_info['creator']['slug']
        info['creator_hash'] = raw_info['creator']['hash']
        info['creator_sign'] = raw_info['creator']['bio']
        info['creator_name'] = raw_info['creator']['name']
        info['creator_logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['creator']['avatar'][
            'id']).replace('_{size}', '')

        info['column_id'] = raw_info['slug']
        info['name'] = raw_info['name']
        info['logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['avatar']['id']).replace(
            '_{size}', '')
        info['article'] = raw_info['postsCount']
        info['follower'] = raw_info['followersCount']
        info['description'] = raw_info['description']
        self.info_list.append(info)
        self.task_complete_set.add(target_url)
        detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(self.column_id)
        for i in range(info['article'] / 10 + 1):
            self.work_set.add(detect_url + str(i * 10))
        return
Example #12
    def __init__(self, task_list):
        self.task_set = set(task_list)
        self.task_complete_set = set()
        self.work_set = set()  # pool of URLs waiting to be crawled
        self.work_complete_set = set()  # pool of URLs already crawled
        self.content_list = []  # holds the content that has been fetched
        self.answer_list = []
        self.question_list = []

        self.info_list = []
        self.extra_index_list = []
        self.info_url_set = self.task_set.copy()
        self.info_url_complete_set = set()

        self.add_property()  # add extended attributes
        Http.set_cookie()
Example #13
    def create_work_set(self, target_url):
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        column_id = result.group('column_id')
        content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' +
                                   column_id)
        if not content:
            return
        raw_info = json.loads(content)
        info = {}
        info['creator_id'] = raw_info['creator']['slug']
        info['creator_hash'] = raw_info['creator']['hash']
        info['creator_sign'] = raw_info['creator']['bio']
        info['creator_name'] = raw_info['creator']['name']
        info['creator_logo'] = raw_info['creator']['avatar'][
            'template'].replace('{id}',
                                raw_info['creator']['avatar']['id']).replace(
                                    '_{size}', '')

        info['column_id'] = raw_info['slug']
        info['name'] = raw_info['name']
        info['logo'] = raw_info['creator']['avatar']['template'].replace(
            '{id}', raw_info['avatar']['id']).replace('_{size}', '')
        info['article'] = raw_info['postsCount']
        info['follower'] = raw_info['followersCount']
        info['description'] = raw_info['description']
        self.info_list.append(info)
        self.task_complete_set.add(target_url)
        detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(
            column_id)
        for i in range(info['article'] / 10 + 1):
            self.work_set.add(detect_url + str(i * 10))
        return
Example #14
 def check_update():  # forced update
     u"""
         *   功能
             *   检测更新。
             *   若在服务器端检测到新版本,自动打开浏览器进入新版下载页面
             *   网页请求超时或者版本号正确都将自动跳过
         *   输入
             *   无
         *   返回
             *   无
     """
     print u"检查更新。。。"
     if Config.debug:
         # skip the update check when running in debug mode
         return
     try:
         content = Http.get_content(u"https://www.yaozeyuan.online/zhihuhelp/upgrade.txt")
         if not content:
             raise Exception(u'HttpError')
         time, url = [x.strip() for x in content.strip('\n').split('\n')]
         if time == Config.update_time:
             return
         else:
             print u"发现新版本,\n更新日期:{} ,点按回车进入更新页面".format(time)
             print u'新版本下载地址:' + url
             raw_input()
             import webbrowser
             webbrowser.open_new_tab(url)
     except Exception:
         # on any exception, simply return
         return
    def create_work_set(self, target_url):
        u"""
        根据博客首页的url, 首先通过re获得博客id, 然后根据博客"关于我"的页面的内容获得写入sinablog_info
        的数据(这部分理应不在这个函数中, 可以改进), 最后通过博客目录页面的内容, 获得每篇博文的地址,
        放入work_set中
        :param target_url: 博客首页的url
        :return:
        """
        if target_url in self.task_complete_set:
            return
        result = Match.sinablog_author(target_url)
        sinablog_author_id = int(result.group('sinablog_people_id'))

        article_num = self.get_sinablog_question_list(sinablog_author_id)
        if article_num % 50 != 0:
            page_num = article_num/50 + 1      # 50 href on 1 page
        else:
            page_num = article_num / 50

        self.question_list[0]['article_num'] = article_num
        # the line above has to stay like this for now, because the "About me" page does not expose the article count

        self.task_complete_set.add(target_url)

        for page in range(page_num):
            url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(sinablog_author_id, page+1)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
        return
Example #16
    def check_update():  # forced update
        u"""
            *   功能
                *   检测更新。
                *   若在服务器端检测到新版本,自动打开浏览器进入新版下载页面
                *   网页请求超时或者版本号正确都将自动跳过
            *   输入
                *   无
            *   返回
                *   无
        """
        print u"检查更新。。。"
        try:
            content = Http.get_content(u"http://zhihuhelpbyyzy-zhihu.stor.sinaapp.com/ZhihuHelpUpdateTime.txt")
            if not content:
                raise Exception("HttpError")
        except:
            return
        time, url = [x.strip() for x in content.split("\n")]
        if time == Config.update_time:
            return
        else:
            print u"发现新版本,\n更新日期:{} ,点按回车进入更新页面".format(time)
            print u"新版本下载地址:" + url
            raw_input()
            import webbrowser

            webbrowser.open_new_tab(url)
        return
    def download(self, index):
        image = self.container[index]
        filename = image['filename']
        href = image['href']
        # filename=href.split('/')[-1]

        if os.path.isfile(self.save_path + '/' + filename):
            return
        print 'Downloading picture:' + href + '   filename   ' + filename

        # urllib.urlretrieve(href, self.save_path + '/' + filename, cbk)

        if len(str(href)) < 300 and Match.isUrlOk(href):
            # Debug.print_in_single_line(u'Downloading picture: {}'.format(href))
            rely_url = str(href).split('@')[0]
            content = Http.get_content(url=rely_url,
                                       timeout=Config.timeout_download_picture)
        else:
            Debug.print_in_single_line(u"Href of the Picture seems wrong...")
            content = None
        if not content:
            return
        with open(self.save_path + '/' + filename, 'wb') as image:
            image.write(content)
        return
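The commented-out urllib.urlretrieve call above passes a progress hook named cbk. urlretrieve calls its reporthook with (blocks transferred, block size, total size); a minimal sketch of such a hook (the percentage display is an assumption, Debug.print_in_single_line is the helper used elsewhere in these examples):

    def cbk(blocknum, blocksize, totalsize):
        # reporthook signature expected by urllib.urlretrieve
        if totalsize > 0:
            percent = min(100.0, 100.0 * blocknum * blocksize / totalsize)
            Debug.print_in_single_line(u'Downloaded {:.1f}%'.format(percent))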
Example #18
    def __init__(self, task_list):
        self.task_set = set(task_list)
        self.task_complete_set = set()
        self.work_set = set()  # pool of URLs waiting to be crawled
        self.work_complete_set = set()  # pool of URLs already crawled
        self.content_list = []  # holds the content that has been fetched
        self.answer_list = []
        self.question_list = []

        self.info_list = []
        self.extra_index_list = []
        self.info_url_set = self.task_set.copy()
        self.info_url_complete_set = set()

        self.add_property()  # add extended attributes
        Http.set_cookie()
Example #19
 def check_update():  # forced update
     u"""
         *   功能
             *   检测更新。
             *   若在服务器端检测到新版本,自动打开浏览器进入新版下载页面
             *   网页请求超时或者版本号正确都将自动跳过
         *   输入
             *   无
         *   返回
             *   无
     """
     print u"检查更新。。。"
     try:
         content = Http.get_content(
             u"http://zhihuhelpbyyzy-zhihu.stor.sinaapp.com/ZhihuHelpUpdateTime.txt"
         )
         if not content:
             raise Exception('HttpError')
     except:
         return
     time, url = [x.strip() for x in content.split('\n')]
     if time == Config.update_time:
         return
     else:
         print u"发现新版本,\n更新日期:{} ,点按回车进入更新页面".format(time)
         print u'新版本下载地址:' + url
         raw_input()
         import webbrowser
         webbrowser.open_new_tab(url)
     return
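All three check_update variants above expect the update file to hold exactly two lines, a date stamp followed by a download URL. A small illustration of that parsing with made-up values (Example #14 strips a trailing newline first, which is what keeps the two-element unpacking from failing on a file that ends with a newline):

    content = u'20230101\nhttps://example.com/zhihuhelp/latest.zip\n'   # made-up payload
    update_time, url = [x.strip() for x in content.strip('\n').split('\n')]
    assert update_time == u'20230101'
    assert url == u'https://example.com/zhihuhelp/latest.zip'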
Example #20
 def create_work_set(self, target_url):
     if target_url in self.task_complete_set:
         return
     content = Http.get_content(target_url)
     if not content:
         return
     self.task_complete_set.add(target_url)
     notebooks_id = self.info_list[0]['notebooks_id']
     page_num = self.parse_max_page(content)
     for page in range(page_num):
         url = 'http://www.jianshu.com/notebooks/{}/latest?page={}'.format(notebooks_id, page+1)  # was hard-coded to notebook 627726
         content = Http.get_content(url)
         article_list = JianshuNotebooksParser(content).get_article_list()
         self.add_notebooks_index(notebooks_id, article_list)
         for item in article_list:
             self.work_set.add(item)
     return
Example #21
 def catch_info(self, target_url):
     content = Http.get_content(target_url)
     if not content:
         return
     self.info_url_set.discard(target_url)
     parser = CollectionParser(content)
     self.info_list.append(parser.get_extra_info())
     return
Example #22
 def worker(self, target_url):
     Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
     content = Http.get_content(target_url)
     if not content:
         return
     self.work_set.discard(target_url)
     self.parse_content(content)
     return
Example #23
 def catch_info(self, target_url):
     content = Http.get_content(target_url + '/about')
     if not content:
         return
     self.info_url_set.discard(target_url)
     parser = AuthorParser(content)
     self.info_list.append(parser.get_extra_info())
     return
Example #24
    def init_config():
        login = Login()
        if Config.remember_account:
            print u'检测到有设置文件,是否直接使用之前的设置?(帐号、密码、图片质量)'
            print u'点按回车使用之前设置,敲入任意字符后点按回车进行重新设置'
            if raw_input():
                login.start()
                Config.picture_quality = guide.set_picture_quality()
            else:
                Http.set_cookie()
        else:
            login.start()
            Config.picture_quality = guide.set_picture_quality()

        # save settings
        Config._save()
        return
Example #25
 def catch_info(self, target_url):
     content = Http.get_content(target_url + '/top-answers')
     if not content:
         return
     self.info_url_set.discard(target_url)
     parser = TopicParser(content)
     self.info_list.append(parser.get_extra_info())
     return
Example #26
    def init_config():
        login = Login()
        if Config.remember_account:
            print u"检测到有设置文件,是否直接使用之前的设置?(帐号、密码、图片质量)"
            print u"点按回车使用之前设置,敲入任意字符后点按回车进行重新设置"
            if raw_input():
                login.start()
                Config.picture_quality = guide.set_picture_quality()
            else:
                Http.set_cookie()
        else:
            login.start()
            Config.picture_quality = guide.set_picture_quality()

        # save settings
        Config._save()
        return
Example #27
    def create_work_set(self, target_url):
        if target_url in self.task_complete_set:
            return

        self.task_complete_set.add(target_url)
        url = target_url + '?page=2'      # the max page count can be parsed from this page

        content = Http.get_content(url)
        page_num = self.parse_max_page(content)

        for item in range(int(page_num)):
            url = target_url + '?page={}'.format(str(item+1))
            content = Http.get_content(url)
            parser = CnblogsAuthorParser(content)
            article_url_list = parser.get_article_list()
            for item in article_url_list:
                self.work_set.add(item)
        return
Example #28
 def create_work_set(self, target_url):
     content = Http.get_content(target_url + '/answers?order_by=vote_num')
     if not content:
         return
     self.task_set.discard(target_url)
     max_page = self.parse_max_page(content)
     for page in range(max_page):
         url = '{}/answers?order_by=vote_num&page={}'.format(target_url, page + 1)
         self.work_set.add(url)
     return
Example #29
 def create_work_set(self, target_url):
     content = Http.get_content(target_url + '?nr=1&sort=created')
     if not content:
         return
     self.task_set.discard(target_url)
     max_page = self.parse_max_page(content)
     for page in range(max_page):
         url = '{}?nr=1&sort=created&page={}'.format(target_url, page + 1)
         self.work_set.add(url)
     return
Example #30
 def catch_info(self, target_url):
     if target_url in self.info_url_complete_set:
         return
     content = Http.get_content(target_url)
     if not content:
         return
     self.info_url_complete_set.add(target_url)
     parser = CollectionParser(content)
     self.info_list.append(parser.get_extra_info())
     return
Example #31
 def catch_info(self, target_url):
     if target_url in self.info_url_complete_set:
         return
     content = Http.get_content(target_url + '/top-answers')
     if not content:
         return
     self.info_url_complete_set.add(target_url)
     parser = TopicParser(content)
     self.info_list.append(parser.get_extra_info())
     return
Example #32
    def create_work_set(self, target_url):
        if target_url in self.task_complete_set:
            return
        content = Http.get_content(target_url)
        if not content:
            return
        self.task_complete_set.add(target_url)
        real_id = self.info_list[0]['collection_real_id']
        fake_id = self.info_list[0]['collection_fake_id']
        page_num = self.parse_max_page(content)

        for page in range(page_num):
            url = 'http://www.jianshu.com/collections/' + str(real_id) + '/notes?order_by=added_at&page={}'.format(page+1)
            content = Http.get_content(url)
            article_list = JianshuCollectionParser(content).get_article_list()
            self.add_collection_index(fake_id, article_list)
            for item in article_list:
                self.work_set.add(item)
        return
Example #33
 def catch_info(self, target_url):
     if target_url in self.info_url_complete_set:
         return
     content = Http.get_content(target_url)
     if not content:
         return
     self.info_url_complete_set.add(target_url)
     parser = YiibaiParser(content)
     self.info_list.append(parser.get_extra_info())
     return
Example #34
    def create_work_set(self, target_url):
        if target_url in self.task_complete_set:
            return

        content = Http.get_content(target_url)
        article_list = YiibaiParser(content).get_article_list()

        self.task_complete_set.add(target_url)
        for item in article_list:
            self.work_set.add(item)
        return
Example #35
 def reCreate_work_set(self, target_url, subUrl="", pageFlag="?"):
     if target_url in self.task_complete_set:
         return
     content = Http.get_content(target_url + subUrl)
     if not content:
         return
     self.task_complete_set.add(target_url)
     max_page = self.parse_max_page(content)
     for page in range(max_page):
         url = "{}{}{}page={}".format(target_url, subUrl, pageFlag, page + 1)
         self.work_set.add(url)
     return
Example #36
 def create_work_set(self, target_url):
     if target_url in self.task_complete_set:
         return
     content = Http.get_content(target_url + "?nr=1&sort=created")
     if not content:
         return
     self.task_complete_set.add(target_url)
     max_page = self.parse_max_page(content)
     for page in range(max_page):
         url = "{}?nr=1&sort=created&page={}".format(target_url, page + 1)
         self.work_set.add(url)
     return
Example #37
 def create_work_set(self, target_url):
     if target_url in self.task_complete_set:
         return
     content = Http.get_content(target_url + '/top-answers')
     if not content:
         return
     self.task_complete_set.add(target_url)
     max_page = self.parse_max_page(content)
     for page in range(max_page):
         url = '{}/top-answers?page={}'.format(target_url, page + 1)
         self.work_set.add(url)
     return
 def findPDFUrl(self, originUrl):
     '''
         The links returned by <findAnnalReports> are not the direct PDF links;
         each of those origin pages has to be fetched again to dig out the real PDF link.
     '''
     r = Http.get_content(originUrl)
     # r.encoding = "gbk"
     # result = unicode(str(result), 'GBK').encode('UTF-8')
     bsObj = BeautifulSoup(r, "html.parser")
     result = bsObj.findAll("a")
     pdf = [link["href"] for link in result if "PDF" in link["href"]][0]
     return pdf
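The single-element indexing above raises an IndexError when no anchor on the page carries 'PDF' in its href, and a KeyError when an anchor has no href at all. A hedged, slightly more defensive variant of the same lookup as a standalone helper:

    from bs4 import BeautifulSoup

    def find_first_pdf_link(html):
        # return the first href containing 'PDF', or None when the page exposes no report link
        soup = BeautifulSoup(html, 'html.parser')
        links = [a['href'] for a in soup.findAll('a') if 'PDF' in a.get('href', '')]
        return links[0] if links else None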
Example #39
 def get_captcha():
     content = Http.get_content('http://www.zhihu.com/captcha.gif')  # fetch the captcha image
     captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'
     with open(captcha_path, 'wb') as image:
         image.write(content)
     print u'请输入您所看到的验证码'
     print u'验证码在助手所处的文件夹中'
     print u'验证码位置:'
     print captcha_path
     print u'如果不需要输入验证码可点按回车跳过此步'
     captcha = raw_input()
     return captcha
    def downloadPDF(self, name, url, isDesktop=True):
        if isDesktop:
            fileName = "/ink/work/62/ink/{name}.pdf".format(name=name)
        else:
            fileName = "{name}.pdf".format(name=name)

        if not os.path.exists(fileName):
            pdf = Http.get_content(url, timeout=180)
            with open(fileName, "wb") as pdf_file:
                pdf_file.write(pdf)  # write the downloaded bytes in one go instead of byte-by-byte
            print("Done!")
Example #41
 def create_work_set(self, target_url):
     if target_url in self.task_complete_set:
         return
     content = Http.get_content(target_url + '?nr=1&sort=created')
     if not content:
         return
     self.task_complete_set.add(target_url)
     max_page = self.parse_max_page(content)
     for page in range(max_page):
         url = '{}?nr=1&sort=created&page={}'.format(target_url, page + 1)
         self.work_set.add(url)
     return
Example #42
 def get_jianshu_question_list(self, target_url):
     u"""
     get jianshu_info, article_num, article_list
     :param target_url:
     :return:
     """
     index_content = Http.get_content(target_url)
     parser = JianshuAuthorParser(index_content)
     self.question_list += parser.get_extra_info()
     article_num = self.question_list[0]['article_num']      # not collection, only one author
     article_list = self.parse_get_article_list(index_content)
     return article_num, article_list
 def findAnnalReports(self, stockid):
     url = u"http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{stockid}/page_type/ndbg.phtml".format(
             stockid=stockid)
     r = Http.get_content(url)
     r = unicode(r, 'GBK').encode('UTF-8')
     bsObj = BeautifulSoup(r, "html.parser")
     result = bsObj.findAll("div", {"class": "datelist"})[0]
     dateList = re.findall('\d{4}-\d{2}-\d{2}', result.text)
     returnList = []
     for index, value in enumerate(result.findAll("a")):
         returnList.append([dateList[index], value.text, "http://vip.stock.finance.sina.com.cn" + value["href"]])
     return returnList
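findAnnalReports above pairs each announcement link with its date purely by position: the N-th regex match in the datelist text is assumed to belong to the N-th <a> tag. A tiny self-contained illustration of that pairing (the HTML snippet is made up):

    import re
    from bs4 import BeautifulSoup

    html = u'<div class="datelist">2015-04-25 <a href="/a1">2014 annual report</a> ' \
           u'2014-04-26 <a href="/a2">2013 annual report</a></div>'
    div = BeautifulSoup(html, 'html.parser').find('div', {'class': 'datelist'})
    dates = re.findall(r'\d{4}-\d{2}-\d{2}', div.text)
    rows = [[dates[i], a.text, a['href']] for i, a in enumerate(div.findAll('a'))]
    assert rows[0] == [u'2015-04-25', u'2014 annual report', u'/a1']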
Example #44
 def get_captcha():
     content = Http.get_content(
         'https://www.zhihu.com/captcha.gif')  # fetch the captcha image
     captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'
     with open(captcha_path, 'wb') as image:
         image.write(content)
     print u'请输入您所看到的验证码'
     print u'验证码在助手所处的文件夹中'
     print u'验证码位置:'
     print captcha_path
     print u'如果不需要输入验证码可点按回车跳过此步'
     captcha = raw_input()
     return captcha
Example #45
    def create_work_set(self, target_url):
        u"""
        根据target_url(例:http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles)的内容,
        先获得creator_id, 再根据文章的数目, 获得页面数, 依次打开每个页面, 将文章的地址放入work_set中
        :param target_url:
        :return:
        """
        if target_url in self.task_complete_set:
            return
        id_result = Match.jianshu(target_url)
        jianshu_id = id_result.group('jianshu_id')

        # ############ the part below really belongs to JianshuAuthorInfo (it fills in jianshu_info); kept here for now. TODO: only one address can be given per line
        content_profile = Http.get_content(target_url)

        parser = JianshuParser(content_profile)
        self.question_list += parser.get_jianshu_info_list()
        # ############# the part above really belongs to JianshuAuthorInfo (it fills in jianshu_info); kept here for now

        self.task_complete_set.add(target_url)
        article_num = self.question_list[0][
            'article_num']  # this means only one address can be given per line  TODO: hard-coded

        if article_num % 9 != 0:
            page_num = article_num / 9 + 1  # article-list pages hold 9 article links each
        else:
            page_num = article_num / 9

        article_list = self.parse_get_article_list(content_profile)
        for item in article_list:
            self.work_set.add(item)
        for page in range(page_num - 1):  # the first page has already been fetched, no need to open it again
            url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(
                jianshu_id, page + 2)
            content_article_list = Http.get_content(url)
            article_list = self.parse_get_article_list(content_article_list)
            for item in article_list:
                self.work_set.add(item)
        return
Example #46
 def create_work_set(self, target_url):
     if target_url in self.task_complete_set:
         return
     content = Http.get_content(target_url + '/answers?order_by=vote_num')
     if not content:
         return
     self.task_complete_set.add(target_url)
     max_page = self.parse_max_page(content)
     for page in range(max_page):
         url = '{}/answers?order_by=vote_num&page={}'.format(
             target_url, page + 1)
         self.work_set.add(url)
     return
Example #47
    def worker(self, target_url):
        if target_url in self.work_complete_set:
            # automatically skip URLs that have already been fetched successfully
            return

        Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
        content = Http.get_content(target_url)
        if not content:
            return
        content = Match.fix_html(content)  # the <br> tags need fixing to avoid blowing the stack
        self.content_list.append(content)
        Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
        self.work_complete_set.add(target_url)
        return
Example #48
    def download(self, index):
        image = self.container[index]
        filename = image['filename']
        href = image['href']

        if os.path.isfile(self.save_path + '/' + filename):
            return
        Debug.logger.debug(u'开始下载图片{}'.format(href))
        content = Http.get_content(url=href,
                                   timeout=Config.timeout_download_picture)
        if not content:
            return
        with open(self.save_path + '/' + filename, 'wb') as image:
            image.write(content)
        return
Example #49
    def get_captcha():
        content = Http.get_content(
            'https://www.zhihu.com/captcha.gif')  # fetch the captcha image
        captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'

        with open(captcha_path, 'wb') as image:
            image.write(content)
        print u'请输入您所看到的验证码'
        print u'验证码在助手所处的文件夹中'
        print u'验证码位置:'
        print captcha_path
        if platform.system() == "Darwin":
            os.system(u'open "{}" &'.format(captcha_path).encode(
                sys.stdout.encoding))
        else:
            webbrowser.get().open_new_tab(u'file:///' + captcha_path)

        print u'如果不需要输入验证码可点按回车跳过此步'
        captcha = raw_input()
        return captcha
Example #50
    def download(self, index):
        image = self.container[index]
        filename = image['filename']
        href = image['href']
        #   switch https to http when downloading pictures, to speed up the download
        href = href.replace('https://', 'http://')

        if os.path.isfile(self.save_path + '/' + filename):
            return
        Debug.print_in_single_line(u'开始下载图片{}'.format(href))
        if href:
            content = Http.get_content(url=href,
                                       timeout=Config.timeout_download_picture)
            if not content:
                Debug.logger.debug(u'图片『{}』下载失败'.format(href))
                content = ''
            else:
                Debug.print_in_single_line(u'图片{}下载完成'.format(href))
        else:
            #   when the download address is empty there is nothing to download
            content = ''
        with open(self.save_path + '/' + filename, 'wb') as image:
            image.write(content)
        return
    def catch(account_id):
        # this is where the real work happens

        mock_sleep_time = 0.5
        base_sleep_time = 1
        max_sleep_time = 1

        article_url_index_list = []
        #   determine the maximum page number

        url = 'http://www.jintiankansha.me/tag/{}?page=1'.format(account_id)

        column_info = JinWanKanSaEmptColumnParser('').get_column_info()

        column_info[u'column_id'] = account_id
        dt = datetime.datetime.now()
        column_info[u'title'] = u"AI_{}".format(dt.strftime("%Y-%m-%d"))
        max_page = 1

        typeToTry = 'tag'

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    dt = datetime.datetime.now()
                    column_info[u'title'] = u"{}_{}".format(
                        line.split('#')[1], dt.strftime("%Y-%m-%d"))

                    max_page = int(line.split('#')[2])

                    typeToTry = str(int(line.split('#')[-1])).strip('\n')

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   collect the article URLs from every index page
        for raw_front_page_index in range(0, max_page + 1):
            # request_url = u'http://www.jintiankansha.me/column/{}?page={}'.format(account_id, raw_front_page_index)
            request_url = u'http://www.jintiankansha.me/{}/{}?page={}'.format(
                typeToTry, account_id, raw_front_page_index)
            print request_url
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'html.parser')
                list_p_list = soup.find_all('span', class_="item_title")

                for tgo_right in list_p_list:
                    for link in tgo_right.findAll('a'):
                        ttt = str(link.get('href'))
                        print ttt
                        if not (ttt is None):
                            article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        # article_url_index_list.append('http://www.jintiankansha.me/t/u8MygoqKI8')

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取  {countert} 号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if len(request_url_content) == 0:
                    random_sleep_time = base_sleep_time + random.randint(
                        0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue
                article_info = JinWanKanSaArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
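The ReadList.txt handling above appears to expect '#'-separated lines of the form url#title#max_page#type; a small hedged illustration of how one such line is taken apart (the sample line is made up):

    line = 'http://www.jintiankansha.me/tag/some_tag#My Column#3#1\n'   # made-up ReadList.txt entry
    split_url = line.split('#')[0]                              # column URL, matched against account_id
    title = line.split('#')[1]                                  # human-readable column title
    max_page = int(line.split('#')[2])                          # number of index pages to crawl
    type_to_try = str(int(line.split('#')[-1])).strip('\n')     # becomes the first path segment of the request URL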
    def catch(account_id):
        # this is where the real work happens

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   determine the maximum page number

        star_page = 1
        max_page = 1
        column_info = Todo1ColumnParser("").get_column_info()
        column_info[u'column_id'] = account_id

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if str(split_url).__contains__(account_id):
                    # Config.now_id_likeName = line.split('#')[1]
                    max_page = int(line.split('#')[-1]) + 1
                    column_info[u'title'] = str(line.split('#')[1])

                    # max_page = 1
                    print max_page

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the article URLs from every index page
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'https://www.guancha.cn/{}/list_{}.shtml'.format(
                account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('h4', class_="module-title")
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        ttt = li.get('href')
                        print ttt
                        if not (ttt is None):

                            ss = str(ttt).split('.')
                            article_url_index_list.append(
                                u"https://www.guancha.cn{}_s.{}".format(
                                    ss[0], ss[1]))

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo1ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def start(self):
        print 'start JRJ_Report'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            read_list = read_list.readlines()

            resultsL = read_list.__len__()
            for x in range(0, resultsL):
                line = read_list[x]
                splits = line.split('#')
                code = str(splits[0])
                fieName = str(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:

            for raw_front_page_index in range(1, 8):

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)

                Path.mkdir(sdPath)

                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('td', class_="left")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:

                            time.sleep(1)
                            result = Http.get_content(xxurl)
                            result = unicode(str(result),
                                             'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            # title_tationl = xxsoup.find_all('h1')
                            # tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('p',
                                                            class_='title')[0]
                            xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                            realu = str(xxlist_p_list).replace(
                                str(xxlist_ds), '', 1)

                            realuxsoup = BeautifulSoup(realu, 'html.parser')

                            sp = str(realuxsoup.text).split(' ')

                            ttime = sp[1]

                            if ttime.__contains__('发表于'):
                                ttime = sp[2]

                            # print (sp[2]).text
                            # print (sp[3]).text

                            # print ttime

                            all_main = xxsoup.find_all('div', class_='main')[0]

                            realuxsoup = BeautifulSoup(str(all_main),
                                                       'html.parser')

                            reaupp = realuxsoup.find_all('p')

                            for pp in reaupp:
                                list_pcyc_li = pp.find_all('a')

                                for li in list_pcyc_li:
                                    print li.text
                                    ttt = li.get('href')

                                    print ttt

                                    fileName = u"{}_{}.pdf".format(
                                        ttime,
                                        str(li.text).replace('/', ""))

                                    print fileName

                                    basePath = '/ink/work/62/ink/{}/{}'.format(
                                        fileN, fileName)

                                    Path.mkdirAndPath(basePath)

                                    Debug.print_in_single_line(
                                        u'开始下载   {}'.format(ttt))
                                    if ttt:
                                        content = Http.get_content(url=ttt,
                                                                   timeout=180)
                                        if not content:
                                            Debug.logger.debug(
                                                u'pdf『{}』下载失败'.format(ttt))
                                            content = ''
                                        else:
                                            Debug.print_in_single_line(
                                                u'pdf {} 下载完成'.format(ttt))
                                    else:
                                        #   when the download address is empty there is nothing to download
                                        content = ''
                                    if content.__len__() > 10:
                                        with open(basePath, "wb") as pdf:
                                            pdf.write(content)
    def catch(account_id):
        # this is where the real work happens

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   determine the maximum page number
        url = 'https://www.wuxiareview.com/category/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        column_info = WuXiaColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        max_page = 2
        if account_id == 'daidai':

            column_info[u'title'] = "吃瓜群众岱岱"
            max_page = 1
        elif account_id == 'gzmdzst':

            column_info[u'title'] = "顾子明的政事堂"
            max_page = 1
        else:

            column_info[u'title'] = "时文"
            max_page = 2

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   collect the article URLs from every index page
        for raw_front_page_index in range(0, max_page):
            request_url = u'https://www.wuxiareview.com/category/{}/{}/'.format(
                account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('article', class_="excerpt")
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        # print li.text
                        tarUrl = li.get('href')
                        ttt = str(tarUrl).split("#")[-1]
                        print ttt
                        if not (ttt is None):
                            article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = WuXiaArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
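The duplicate check in the loop above (counting rows in the Article table before queueing a URL) is repeated verbatim in every `catch` implementation in this listing. Below is a minimal sketch of how it could be factored out; `article_already_catched` is a hypothetical helper name, and it assumes `DB.query_row` accepts raw SQL and returns a dict-like row exactly as used in the examples.

def article_already_catched(article_id):
    # True when the Article table already holds a row with this article_id.
    # Assumes DB.query_row behaves as in the catch() examples above.
    row = DB.query_row(
        'select count(*) as article_count from Article where article_id = "{}"'.format(article_id))
    return row['article_count'] > 0

# usage inside catch(), replacing the inline query:
# if article_already_catched(article_url_index):
#     continue
# index_work_set[article_url_index] = article_url_index
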
    def catch(account_id):
        # This is where the main crawling work happens

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Determine the maximum page number
        url = 'https://www.huxiu.com/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        # Config.now_id_likeName = account_id
        # Config.save()

        column_info = HuXiuColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        # URL-encode the account name; stdin encoding can be None when input is piped
        u_result = urllib.quote(
            account_id.decode(sys.stdin.encoding or 'utf-8').encode('utf8'))
        print account_id
        max_page = 2

        idds = ''
        #
        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    # Config.now_id_likeName = line.split('#')[1]
                    max_page = int(line.split('#')[-1]) + 1
                    idds = str(line.split('#')[1])
                    print max_page
        max_page = -1  # NOTE: overrides the value read from ReadList.txt, so the search-page loop below never runs
        #   Parse each page and store the results in the database

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   Collect the URL of every article on each listing page
        for raw_front_page_index in range(0, max_page + 1):
            #https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
            request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(
                u_result, raw_front_page_index)
            #request_url = u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds,raw_front_page_index)
            # request_url = 'https://www.huxiu.com/member/1872007.html'
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in list(index_work_set):  # iterate over a snapshot so entries can be deleted inside the loop
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, "lxml")

                list_pcyc_l_ = soup.find_all('li')
                # list_pcyc_l_ = soup.find_all('div',class_='mob-ctt')
                for tgo_right in list_pcyc_l_:
                    for link in tgo_right.findAll('a'):
                        hre = str(link.get('href'))
                        if hre.startswith('/article/'):  # keep only links to articles
                            print u'https://www.huxiu.com{}'.format(
                                link.get('href'))
                            article_url_index_list.append(
                                'https://www.huxiu.com{}'.format(
                                    link.get('href')))

                del index_work_set[raw_front_page_index]

        # manually queue one specific article in addition to any search results
        article_url_index_list.append(
            'https://www.huxiu.com/article/299355.html')

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in list(index_work_set):  # iterate over a snapshot so entries can be deleted inside the loop
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = HuXiuArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
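The `ReadList.txt` handling above splits each line on `#` to recover the column URL, a member id, and a page count. A minimal sketch of that parsing as a standalone helper is shown below, assuming the same `url#member_id#page_count` line format; `parse_read_list_line` is a hypothetical name and the example line is illustrative only.

def parse_read_list_line(line):
    # Expected line format, as consumed above: <column url>#<member id>#<page count>
    parts = line.strip().split('#')
    column_url = parts[0]
    account_id = column_url.rstrip('/').split('/')[-1]
    member_id = parts[1] if len(parts) > 2 else ''
    page_count = int(parts[-1]) if parts[-1].strip().isdigit() else 0
    return account_id, member_id, page_count

# parse_read_list_line('https://www.huxiu.com/member/1872007#1872007#5')
# -> ('1872007', '1872007', 5)
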
Beispiel #56
0
    def catch(account_id):
        # This is where the main crawling work happens

        mock_sleep_time = 0.5
        base_sleep_time = 1
        max_sleep_time = 1

        article_url_index_list = []
        #   Determine the maximum page number

        column_info = WeiXinColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id
        column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
        max_page = 1
        # with open('ReadList.txt', 'r') as read_list:
        #     read_list = read_list.readlines()
        #     for line in read_list:
        #         split_url = line.split('#')[0]
        #         if str(split_url).__contains__(account_id):
        #             # Config.now_id_likeName = line.split('#')[1]
        #             max_page = int(line.split('#')[-1]) + 1
        #             column_info[u'title'] = str(line.split('#')[1])
        #
        #             # max_page = 1
        #             print max_page

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))


        # article_url_index_list.append('https://mp.weixin.qq.com/s?__biz=MjM5MjczNDc0Mw==&mid=2650847984&idx=2&sn=b7b111e5964d2f2fb568ba0d419e3edf&chksm=bd55d1888a22589e2f3bab0613b346427079efc6b82fac869d4f78244a500c3e5cc8cb8402ed&scene=21#wechat_redirect')
        # article_url_index_list.append('https://mp.weixin.qq.com/s/yj1BT3jWyxLjlEnzz0vEtQ')

        with open('/Users/0/Desktop/list.txt', 'r') as read_list:  # hard-coded local file: one article URL per line
            read_list = read_list.readlines()
            for line in read_list:
                article_url_index_list.append(str(line).strip('\n'))

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                    'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in list(index_work_set):  # iterate over a snapshot so entries can be deleted inside the loop
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"开始抓取  {countert} 号文章,剩余{article_count}篇".format(countert=article_url_index,
                                                                                    article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if not request_url_content:  # empty or failed download: back off and retry this URL later
                    random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue
                #article_info = Todo2ArticleParser(request_url_content).get_article_info()
                # article_info = HuXiuArticleParser(request_url_content).get_article_info()
                article_info = WeiXinArticleParser(request_url_content).get_article_info()
                # article_info = WallStreetArticleParser(request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
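The polite-crawling pattern in the loop above (a fixed pause after every request, plus a short random back-off and another attempt whenever the download comes back empty) could be wrapped in one helper. Below is a minimal sketch under the assumption that the downloader returns an empty or falsy value on failure, as `Http.get_content` appears to above; `fetch_with_retry` and its parameters are hypothetical names, and the downloader is passed in as a callable so the sketch stays self-contained.

import random
import time


def fetch_with_retry(get_content, url, mock_sleep_time=0.5,
                     base_sleep_time=1, max_sleep_time=1, max_tries=20):
    # Download url with get_content, pausing mock_sleep_time after each attempt
    # and sleeping base_sleep_time plus a small random component whenever the
    # body comes back empty, for at most max_tries attempts.
    for _ in range(max_tries):
        content = get_content(url)
        time.sleep(mock_sleep_time)
        if content:
            return content
        random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
        time.sleep(random_sleep_time)
    return ''

# usage: request_url_content = fetch_with_retry(Http.get_content, request_url)
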
Beispiel #57
0
    def catch(account_id):
        # This is where the main crawling work happens

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Determine the maximum page number

        column_info = Todo3ColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "新能源汽车"
        column_info['article_count'] = 0
        column_info['follower_count'] = 0
        column_info['description'] = ''
        column_info['image_url'] = ''

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        start_page = 1
        max_page = 1  # NOTE: range(start_page, max_page) below is empty with these values; raise max_page to crawl listing pages

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the URL of every article on each listing page
        for raw_front_page_index in range(start_page, max_page):
            request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in list(index_work_set):  # iterate over a snapshot so entries can be deleted inside the loop
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('div',
                                            class_='list-border clearfix')
                for p in list_p_list:
                    list_pcyc_li = p.find_all('a')
                    if not list_pcyc_li:  # no links in this entry
                        continue
                    tarUrl = list_pcyc_li[0].get('href')
                    if tarUrl is None:
                        continue
                    article_url = str(tarUrl).split("#")[-1]
                    print article_url
                    article_url_index_list.append(article_url)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in list(index_work_set):  # iterate over a snapshot so entries can be deleted inside the loop
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo3ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
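Every `catch` above repeats the same listing-page step: parse the HTML with BeautifulSoup, find the entry containers, and collect the href of each link with the same split("#")[-1] handling. A minimal sketch of that step as one helper, assuming BeautifulSoup with the lxml parser as in the examples; `collect_article_links` is a hypothetical name.

from bs4 import BeautifulSoup


def collect_article_links(html, container_name, container_class=None):
    # Return, for every link inside the matching containers, the last
    # '#'-separated part of its href (the whole href when it has no '#'),
    # mirroring the split("#")[-1] handling in the catch() examples above.
    soup = BeautifulSoup(html, 'lxml')
    find_kwargs = {}
    if container_class is not None:
        find_kwargs['class_'] = container_class
    links = []
    for container in soup.find_all(container_name, **find_kwargs):
        for anchor in container.find_all('a'):
            href = anchor.get('href')
            if href is None:
                continue
            links.append(href.split('#')[-1])
    return links

# e.g. for the wuxiareview listing: collect_article_links(page_html, 'article', 'excerpt')
# e.g. for the smzdm listing:       collect_article_links(page_html, 'div', 'list-border clearfix')
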
Beispiel #58
0
    def login(self, account, password, captcha=''):
        content = Http.get_content('https://www.zhihu.com/')
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'登陆失败')
            Debug.logger.info(u'敲击回车重新发送登陆请求')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf',
                                  value=xsrf,
                                  domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True,
                'captcha': captcha
            }
        else:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True
            }

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: with it present, Zhihu treats the request as browser traffic rather than a script
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email',
                                  data=post_data,
                                  extra_header=header)
        if not result:
            Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
            return False
        response = json.loads(result)

        if response['r'] == 0:
            print u'登陆成功!'
            print u'登陆账号:', account
            print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
            if raw_input() == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'帐号密码已保存,可通过修改config.json修改设置'
            else:
                Config.account, Config.password, Config.remember_account = '', '', False
                print u'跳过保存环节,进入下一流程'
            Config._save()
            cookie = self.get_cookie()
            DB.execute(
                'delete from LoginRecord')  # after a successful login, clear old records so a stale cookie is not reused next time
            data = {}
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'登陆失败'
            Debug.print_dict(response)
            return False
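After a successful login the code above stores the cookie string in the LoginRecord table, so a later run can reuse it instead of logging in again. Below is a minimal sketch of reading that record back, assuming `DB.query_row` accepts raw SQL as in the snippets above and returns a falsy value when the table is empty; `load_saved_cookie` is a hypothetical name.

def load_saved_cookie():
    # Return the cookie string from the most recent LoginRecord row,
    # or '' when no successful login has been recorded yet.
    row = DB.query_row(
        'select cookieStr from LoginRecord order by recordDate desc limit 1')
    if not row:
        return ''
    return row['cookieStr']
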
    def catch(account_id):
        # This is where the main crawling work happens

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Determine the maximum page number
        url = 'http://www.gushequ.com/{}/'.format(account_id)
        front_page_content = Http.get_content(url)

        column_info = TodoColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "股社区"

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        start_page = 0
        max_page = 24
        if account_id == '2018':
            start_page = 0
            max_page = 24

        elif account_id == '2017':
            start_page = 24
            max_page = 58

        elif account_id == '2016':
            start_page = 58
            max_page = 92

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the URL of every article on each listing page
        for raw_front_page_index in range(start_page, max_page):
            request_url = u'http://www.gushequ.com/page/{}/'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in list(index_work_set):  # iterate over a snapshot so entries can be deleted inside the loop
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('article')
                for p in list_p_list:
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        tarUrl = li.get('href')
                        if tarUrl is None:  # skip anchors without an href
                            continue
                        article_url = str(tarUrl).split("#")[-1]
                        print article_url
                        article_url_index_list.append(article_url)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in list(index_work_set):  # iterate over a snapshot so entries can be deleted inside the loop
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = TodoArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
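The if/elif chain above maps an archive year ('2018', '2017', '2016') to a start/end page on gushequ.com. A minimal, equivalent sketch using a lookup table; the ranges mirror the values hard-coded above, and `gushequ_page_urls` is a hypothetical name.

# page ranges per archive year, copied from the branches above
GUSHEQU_PAGE_RANGES = {
    '2018': (0, 24),
    '2017': (24, 58),
    '2016': (58, 92),
}


def gushequ_page_urls(account_id):
    # Unknown years fall back to the default range, as in the original code.
    start_page, max_page = GUSHEQU_PAGE_RANGES.get(account_id, (0, 24))
    return [u'http://www.gushequ.com/page/{}/'.format(page)
            for page in range(start_page, max_page)]

# gushequ_page_urls('2017') -> 34 listing-page URLs, p24 through p57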