Example #1
    def __get_question_list(self):
        question_list = [DB.wrap('question', x) for x in DB.get_result_list(self.sql.question)]
        answer_list = [DB.wrap('answer', x) for x in DB.get_result_list(self.sql.get_answer_sql())]

        def merge_answer_into_question():
            question_dict = {x['question_id']: {'question': x.copy(), 'answer_list': [], 'agree': 0} for x in
                             question_list}
            for answer in answer_list:
                question_dict[answer['question_id']]['answer_list'].append(answer)
            return question_dict.values()

        def add_property(question):
            agree_count = 0
            char_count = 0
            for answer in question['answer_list']:
                answer['char_count'] = len(answer['content'])
                answer['agree_count'] = answer['agree']
                answer['update_date'] = answer['edit_date']
                agree_count += answer['agree']
                char_count += answer['char_count']
            question['answer_count'] = len(question['answer_list'])
            question['agree_count'] = agree_count
            question['char_count'] = char_count
            return question

        question_list = [add_property(x) for x in merge_answer_into_question() if len(x['answer_list'])]
        return question_list
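
The example above merges flat question and answer rows by question_id and then aggregates per-question counts. Below is a minimal, self-contained sketch of the same merge-and-aggregate idea, using hypothetical in-memory rows in place of DB.get_result_list; the dictionary keys mirror the columns the example assumes.

    question_rows = [{'question_id': 1, 'title': 'first question'},
                     {'question_id': 2, 'title': 'unanswered question'}]
    answer_rows = [{'question_id': 1, 'content': 'abc', 'agree': 3, 'edit_date': '2016-01-01'},
                   {'question_id': 1, 'content': 'de', 'agree': 1, 'edit_date': '2016-01-02'}]

    def merge_and_aggregate(question_rows, answer_rows):
        merged = {q['question_id']: {'question': q.copy(), 'answer_list': [], 'agree': 0}
                  for q in question_rows}
        for answer in answer_rows:
            merged[answer['question_id']]['answer_list'].append(answer)
        result = []
        for item in merged.values():
            if not item['answer_list']:
                continue    # questions without answers are dropped, as in the example
            item['answer_count'] = len(item['answer_list'])
            item['agree_count'] = sum(a['agree'] for a in item['answer_list'])
            item['char_count'] = sum(len(a['content']) for a in item['answer_list'])
            result.append(item)
        return result

    # merge_and_aggregate(question_rows, answer_rows) -> one entry with
    # answer_count == 2, agree_count == 4, char_count == 5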
Example #3
    def login(self, account, password, captcha=''):
        content = Http.get_content('https://www.zhihu.com/')
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'登陆失败')
            Debug.logger.info(u'敲击回车重新发送登陆请求')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True,
                         'captcha': captcha}
        else:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: as long as it is present, Zhihu treats the request as non-script traffic
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
        if not result:
            Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
            return False
        response = json.loads(result)

        if response['r'] == 0:
            print u'登陆成功!'
            print u'登陆账号:', account
            print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
            if raw_input() == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'帐号密码已保存,可通过修改config.json修改设置'
            else:
                Config.account, Config.password, Config.remember_account = '', '', False
                print u'跳过保存环节,进入下一流程'
            Config._save()
            cookie = self.get_cookie()
            data = {}
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'登陆失败'
            Debug.print_dict(response)
            return False
Example #4
 def clear_index(self):
     topic_id_tuple = tuple(
         set(x['topic_id'] for x in self.topic_index_list))
     sql = 'DELETE  from TopicIndex where topic_id in ({})'.format(
         (' ?,' * len(topic_id_tuple))[:-1])
     DB.cursor.execute(sql, topic_id_tuple)
     DB.commit()
     return
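
clear_index builds one '?' placeholder per topic_id and passes the tuple as bound parameters, so the ids are never interpolated into the SQL string. A standalone sqlite3 sketch of the same placeholder construction (the table name and data here are illustrative):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE TopicIndex (topic_id INTEGER)')
    conn.executemany('INSERT INTO TopicIndex VALUES (?)', [(1,), (2,), (3,)])

    topic_id_tuple = (1, 3)
    placeholders = (' ?,' * len(topic_id_tuple))[:-1]    # ' ?, ?'
    sql = 'DELETE from TopicIndex where topic_id in ({})'.format(placeholders)
    conn.execute(sql, topic_id_tuple)
    conn.commit()
    # only topic_id 2 is left in TopicIndex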
Example #5
 def save_record_list(table_name, record_list):
     """
     Save the given records into the database
     :return:
     """
     for record in record_list:
         DB.save(record, table_name)
     DB.commit()
     return
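
save_record_list writes each record dict through DB.save and commits once at the end. A self-contained sqlite3 sketch of the same batch-then-single-commit idea, using executemany with named placeholders in place of DB.save (the Article table and its columns are illustrative):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE Article (article_id TEXT, column_id TEXT)')

    record_list = [{'article_id': 'a1', 'column_id': 'c1'},
                   {'article_id': 'a2', 'column_id': 'c1'}]
    conn.executemany('INSERT INTO Article (article_id, column_id) '
                     'VALUES (:article_id, :column_id)', record_list)
    conn.commit()    # a single commit after the whole batch, as in save_record_list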
Example #6
 def save(self):
     self.clear_index()
     save_config = self.create_save_config()
     for key in save_config:
         for item in save_config[key]:
             if item:
                 DB.save(item, key)
     DB.commit()
     return
 def query_answer_list_by_author_page_id(self, author_page_id):
     # look up the corresponding author_id first
     author_info = DB.query_row(u'select author_id from Author where author_page_id="{author_page_id}"'.format(author_page_id=author_page_id))
     author_id = author_info[u'author_id']
     raw_answer_list = DB.query_all(u'select * from Answer where author_id="{author_id}"  {order_by} '.format(author_id=author_id, order_by=Config.answer_order_by))
     answer_list = []
     for raw_answer in raw_answer_list:
         answer_list.append(self.format_answer(raw_answer))
     return answer_list
Example #8
 def __init__(self):
     #   Initialize the directory structure
     Path.init_base_path()
     Path.init_work_directory()
     #   Initialize the database connection
     DB.init_database()
     #   Initialize the configuration
     Config.init_config()
     return
Example #12
    def __get_article_list(self):
        def add_property(article):
            article['char_count'] = len(article['content'])
            article['agree_count'] = article['agree']
            article['update_date'] = article['publish_date']
            article['answer_count'] = 1
            return article

        article_list = [DB.wrap(Type.article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
        article_list = [add_property(x) for x in article_list]
        return article_list
Example #14
    def __get_article_list(self):
        def add_property(article):
            article['char_count'] = len(article['content'])
            article['answer_count'] = 1
            # TODO
            if self.kind in [Type.jianshu_author, Type.jianshu_collection, Type.jianshu_notebooks,
                             Type.sinablog_author, Type.csdnblog_author]:
                article['agree_count'] = "没有赞同数"     # article['agree']
            else:
                article['agree_count'] = article['agree']

            article['update_date'] = article['publish_date']

            return article

        if self.kind in [Type.jianshu_author, Type.jianshu_collection, Type.jianshu_notebooks]:
            article_list = [DB.wrap(Type.jianshu_article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
        elif self.kind == Type.sinablog_author:
            article_list = [DB.wrap(Type.sinablog_article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
        elif self.kind == Type.csdnblog_author:
            article_list = [DB.wrap(Type.csdnblog_article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
        else:
            article_list = [DB.wrap(Type.article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
        article_list = [add_property(x) for x in article_list]
        return article_list
Example #15
 def query_answer_list_by_question_id(self, question_id):
     raw_answer_list = DB.query_all(
         u'select * from Answer where question_id="{question_id}" {order_by} '.format(question_id=question_id, order_by=Config.answer_order_by))
     answer_list = []
     for raw_answer in raw_answer_list:
         answer_list.append(self.format_answer(raw_answer))
     return answer_list
Example #16
 def catch_article_book_info(self, sql):
     info_list = DB.cursor.execute(self.sql.info).fetchall()
     info_list = [DB.wrap(Type.article, item) for item in info_list]
     info = {}
     info['title'] = '_'.join([str(item['title']) for item in info_list])
     info['id'] = '_'.join([str(item['article_id']) for item in info_list])
     return info
Example #17
 def query_question(self, question_id):
     """
     :rtype: Question_Info
     """
     question = DB.query_row(u'select * from Question where question_id in ({question_id})'.format(question_id=question_id))
     question = self.format_question(question)  # wrap it in the standard info format
     return question
 def query_question_list(self, question_id_list):
     raw_question_list = DB.query_all(
             u'select * from Question where question_id in ({question_id_list})'.format(question_id_list=','.join(question_id_list)))
     question_list = []
     for raw_question in raw_question_list:
         question_list.append(self.format_question(raw_question))
     return question_list
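
query_question_list joins the ids straight into the IN (...) clause. The same lookup can also be written with bound placeholders instead of string interpolation; a standalone sqlite3 sketch of that variant (table and data are illustrative):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE Question (question_id TEXT, title TEXT)')
    conn.executemany('INSERT INTO Question VALUES (?, ?)',
                     [('10', 'first'), ('20', 'second'), ('30', 'third')])

    question_id_list = ['10', '30']
    placeholders = ','.join('?' * len(question_id_list))    # '?,?'
    raw_question_list = conn.execute(
        'select * from Question where question_id in ({})'.format(placeholders),
        question_id_list).fetchall()
    # raw_question_list -> [('10', 'first'), ('30', 'third')]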
Example #21
 def catch_info(self):
     info = {}
     if self.property.sql.info:
         info = DB.cursor.execute(self.property.sql.info).fetchone()
         info = DB.wrap(Type.info_table[self.kind], info)
     self.set_info(info)
     return
Example #22
 def catch_question_book_info(self, sql):
     info_list = DB.cursor.execute(self.sql.info).fetchall()
     info_list = [DB.wrap(Type.question, item) for item in info_list]
     info = dict()
     info['title'] = '_'.join([str(item['title']) for item in info_list])   # there may be several questions, with their ids joined together
     info['id'] = '_'.join([str(item['question_id']) for item in info_list])
     return info
Example #24
 def query_question_list(self, question_id_list):
     raw_question_list = DB.query_all(
         u'select * from Question where question_id in ({question_id_list})'.format(question_id_list=','.join(question_id_list)))
     question_list = []
     for raw_question in raw_question_list:
         question_list.append(self.format_question(raw_question))
     return question_list
Example #25
 def query_article_list_by_column_id(self, column_id):
     #   Fetch the article list in ascending publish-date order, which makes browsing easier
     raw_article_list = DB.query_all(u'select * from Article where column_id="{column_id}"  {order_by}'.format(column_id=column_id, order_by=Config.article_order_by))
     article_list = []
     for raw_article in raw_article_list:
         article = self.format_article(raw_article)
         article_list.append(article)
     return article_list
 def query_article_list_by_column_id(self, column_id):
     #   Fetch the article list in ascending publish-date order, which makes browsing easier
     print u'select * from Article where column_id = "{column_id}"    {order_by}'.format(column_id=column_id,  order_by=Config.article_order_by)
     raw_article_list = DB.query_all(u'select * from Article where column_id = "{column_id}"    {order_by}'.format(column_id=column_id,  order_by=Config.article_order_by))
     article_list = []
     for raw_article in raw_article_list:
         article = self.format_article(raw_article)
         article_list.append(article)
     return article_list
Example #27
    def query_answer(self, answer_id):
        """

        :type answer_id:int
        :return:
        """
        answer = DB.query_row(u'select * from Answer where answer_id in ({answer_id})'.format(answer_id=answer_id))
        answer = self.format_answer(answer)
        return answer
Example #29
    def catch_csdnblog_book_info(self):
        u"""

        :return:
        """
        info_list = DB.cursor.execute(self.sql.info).fetchall()
        info_list = [DB.wrap(Type.csdnblog_info, item) for item in info_list]
        info = dict()
        info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])
        info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
        return info
Example #30
    def catch_jianshu_book_info(self):
        u"""

        :return: info
        """
        info_list = DB.cursor.execute(self.sql.info).fetchall()
        info_list = [DB.wrap(Type.jianshu_info, item) for item in info_list]
        info = dict()
        # several blogs may be combined together  TODO: remove this???
        info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])
        info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
        return info
Example #31
 def catch_info(self):
     info = {}
     if self.sql.info:
         if self.kind in [Type.question, Type.answer]:
             info = self.catch_question_book_info(self.sql.info)
         elif self.kind == Type.article:
             info = self.catch_article_book_info(self.sql.info)
         else:
             info = DB.cursor.execute(self.sql.info).fetchone()
             info = DB.wrap(Type.info_table[self.kind], info)
     self.set_info(info)
     return
Example #32
    def catch_jianshu_book_info(self):
        u"""

        :param
        :return: info
        """
        info_list = DB.cursor.execute(self.sql.info).fetchall()
        info_list = [DB.wrap(Type.jianshu_info, item) for item in info_list]
        info = {}
        info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])  # several blogs may be combined together
        info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
        return info
Example #33
    def catch_sinablog_book_info(self):
        u"""

        :param
        :return: info
        """
        info_list = DB.cursor.execute(self.sql.info).fetchall()
        info_list = [DB.wrap(Type.sinablog_info, item) for item in info_list]
        info = dict()
        info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])  # several blogs may be combined together
        info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
        return info
Example #35
 def init_database():
     if Path.is_file(Path.db_path):
         DB.set_conn(sqlite3.connect(Path.db_path))
     else:
         DB.set_conn(sqlite3.connect(Path.db_path))
         with open(Path.sql_path) as sql_script:
             DB.cursor.executescript(sql_script.read())
         DB.commit()
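
init_database reuses the database file when it already exists and otherwise creates it and runs the schema script once. A self-contained sketch of that create-or-connect pattern; DB_PATH and SCHEMA here are illustrative stand-ins for Path.db_path and the script stored at Path.sql_path:

    import os
    import sqlite3

    DB_PATH = 'example.db'    # stand-in for Path.db_path
    SCHEMA = 'CREATE TABLE LoginRecord (account TEXT, cookieStr TEXT);'    # stand-in for the Path.sql_path script

    if os.path.isfile(DB_PATH):
        conn = sqlite3.connect(DB_PATH)      # reuse the existing database
    else:
        conn = sqlite3.connect(DB_PATH)      # connect() creates the file
        conn.cursor().executescript(SCHEMA)  # build the schema only on first run
        conn.commit()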
Example #38
    def extract_topic(self):
        raw_topic = DB.query_row(u'select * from Topic where topic_id="{topic_id}"'.format(topic_id=self.task.topic_id))
        self.info_page = Topic_Info(raw_topic)

        answer_list = self.query_answer_list(self.info_page.best_answer_id_list.split(','))
        question_id_dict = OrderedDict()
        #   Fetch the corresponding Question object for each answer in turn
        for answer in answer_list:
            if answer.question_id not in question_id_dict:
                question_id_dict[answer.question_id] = Question(self.query_question(answer.question_id))
            question_id_dict[answer.question_id].append_answer(answer)
        for question_id in question_id_dict:
            self.question_list.append(question_id_dict[question_id])
        return
Example #39
    def extract_collection(self):
        raw_collection = DB.query_row(
            u'select * from Collection where collection_id="{collection_id}"'.format(collection_id=self.task.collection_id))
        self.info_page = Collection_Info(raw_collection)

        answer_list = self.query_answer_list(self.info_page.collected_answer_id_list.split(','))
        question_id_dict = OrderedDict()
        #   Fetch the corresponding Question object for each answer in turn
        for answer in answer_list:
            if answer.question_id not in question_id_dict:
                question_id_dict[answer.question_id] = Question(self.query_question(answer.question_id))
            question_id_dict[answer.question_id].append_answer(answer)
        for question_id in question_id_dict:
            self.question_list.append(question_id_dict[question_id])
        return
Example #41
 def init_database():
     if Path.is_file(Path.db_path):
         Debug.logger.debug(u"Connect to the database...")
         Debug.logger.debug(u"db_path: " + str(Path.db_path))
         DB.set_conn(sqlite3.connect(Path.db_path))
     else:
         Debug.logger.debug(u"Create db file...")
         DB.set_conn(sqlite3.connect(Path.db_path))
         with open(Path.sql_path) as sql_script:
             DB.cursor.executescript(sql_script.read())
         DB.commit()
Example #42
    def extract_author(self):
        raw_author = DB.query_row(u'select * from Author where author_page_id="{author_page_id}" '.format(author_page_id=self.task.author_page_id))
        self.info_page = Author_Info(raw_author)

        answer_list = self.query_answer_list_by_author_page_id(self.info_page.author_page_id)
        question_id_dict = OrderedDict()
        #   Fetch the corresponding Question object for each answer in turn
        for answer in answer_list:
            if answer.question_id not in question_id_dict:
                db_question_info = self.query_question(answer.question_id)
                if not db_question_info:
                    #   when the query returns nothing, just skip this answer
                    continue
                question_id_dict[answer.question_id] = Question(db_question_info)
            question_id_dict[answer.question_id].append_answer(answer)
        for question_id in question_id_dict:
            self.question_list.append(question_id_dict[question_id])
        return
Example #44
 def catch_info(self):
     u"""
     Get the blog's info and pass it to set_info as the info argument
     :return:
     """
     info = {}
     if self.sql.info:
         if self.kind == Type.csdnblog_author:
             info = self.catch_csdnblog_book_info()
         elif self.kind == Type.jianshu_author:
             info = self.catch_jianshu_book_info()
         elif self.kind == Type.sinablog_author:
             info = self.catch_sinablog_book_info()
         elif self.kind in [Type.question, Type.answer]:
             info = self.catch_question_book_info(self.sql.info)
         elif self.kind == Type.article:
             info = self.catch_article_book_info(self.sql.info)
         else:
             info = DB.cursor.execute(self.sql.info).fetchone()
             info = DB.wrap(Type.info_table[self.kind], info)
     self.set_info(info)
     return
Example #45
 def query_article(self, article_id):
     raw_article = DB.query_row(u'select * from Article where article_id="{article_id}" '.format(article_id=article_id))
     article = self.format_article(raw_article)
     return article
Example #46
 def query_column(self, column_id):
     raw_column = DB.query_row(u'select * from Column where column_id="{column_id}"'.format(column_id=column_id))
     column = self.format_column(raw_column)  # wrap it in the standard info format
     return column
Example #47
    def catch(account_id):
        # the key part is right here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number
        url = 'http://www.gushequ.com/{}/'.format(account_id)
        front_page_content = Http.get_content(url)

        column_info = TodoColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "股社区"

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 0
        max_page = 24
        if account_id == '2018':
            star_page = 0
            max_page = 24

        elif account_id == '2017':
            star_page = 24
            max_page = 58

        elif account_id == '2016':
            star_page = 58
            max_page = 92

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the article URLs from each index page
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'http://www.gushequ.com/page/{}/'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('article')
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:

                        tarUrl = li.get('href')
                        ttt = str(tarUrl).split("#")[-1]
                        print ttt
                        if not (ttt is None):
                            article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = TodoArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
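
The crawl above keeps an OrderedDict work set and makes up to 21 passes over it, removing an entry only once its article has been fetched and saved. A self-contained sketch of that re-catch loop, with fetch_article as a hypothetical stand-in for Http.get_content plus the parser; it iterates over a snapshot of the keys so entries can be deleted safely while looping:

    from collections import OrderedDict

    def fetch_article(url):
        # stand-in for Http.get_content + the article parser; None simulates a failed fetch
        return None if url.endswith('/bad') else {'article_id': url}

    index_work_set = OrderedDict(
        (url, url) for url in ['http://example.com/1', 'http://example.com/2', 'http://example.com/bad'])

    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in list(index_work_set):    # snapshot of the keys
            article_info = fetch_article(index_work_set[article_url_index])
            if article_info is None:
                continue    # keep it in the work set for the next pass
            # Worker.save_record_list(u'Article', [article_info]) would go here
            del index_work_set[article_url_index]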
Example #48
    def catch(account_id):
        # the key part is right here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number
        url = 'https://www.huxiu.com/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        # Config.now_id_likeName = account_id
        # Config.save()

        column_info = HuXiuColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        u_result = urllib.quote(
            account_id.decode(sys.stdin.encoding).encode('utf8'))
        print account_id
        max_page = 2

        idds = ''
        #
        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    # Config.now_id_likeName = line.split('#')[1]
                    max_page = int(line.split('#')[-1]) + 1
                    idds = str(line.split('#')[1])
                    print max_page
        max_page = -1
        #   Parse the page content and store it in the database

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   collect the article URLs from each index page
        for raw_front_page_index in range(0, max_page + 1):
            #https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
            request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(
                u_result, raw_front_page_index)
            #request_url = u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds,raw_front_page_index)
            # request_url = 'https://www.huxiu.com/member/1872007.html'
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, "lxml")

                list_pcyc_l_ = soup.find_all('li')
                # list_pcyc_l_ = soup.find_all('div',class_='mob-ctt')
                for tgo_right in list_pcyc_l_:
                    for link in tgo_right.findAll('a'):
                        hre = str(link.get('href'))
                        if hre.startswith('/article/', 0, 10):
                            print u'https://www.huxiu.com{}'.format(
                                link.get('href'))
                            article_url_index_list.append(
                                'https://www.huxiu.com{}'.format(
                                    link.get('href')))

                del index_work_set[raw_front_page_index]

        article_url_index_list.append(
            'https://www.huxiu.com/article/299355.html')

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = HuXiuArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example #49
    def catch(account_id):
        # the key part is right here

        mock_sleep_time = 0.5
        base_sleep_time = 1
        max_sleep_time = 1

        article_url_index_list = []
        #   Get the maximum page number

        column_info = WeiXinColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id
        column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
        max_page = 1
        # with open('ReadList.txt', 'r') as read_list:
        #     read_list = read_list.readlines()
        #     for line in read_list:
        #         split_url = line.split('#')[0]
        #         if str(split_url).__contains__(account_id):
        #             # Config.now_id_likeName = line.split('#')[1]
        #             max_page = int(line.split('#')[-1]) + 1
        #             column_info[u'title'] = str(line.split('#')[1])
        #
        #             # max_page = 1
        #             print max_page



        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))


        # article_url_index_list.append('https://mp.weixin.qq.com/s?__biz=MjM5MjczNDc0Mw==&mid=2650847984&idx=2&sn=b7b111e5964d2f2fb568ba0d419e3edf&chksm=bd55d1888a22589e2f3bab0613b346427079efc6b82fac869d4f78244a500c3e5cc8cb8402ed&scene=21#wechat_redirect')
        # article_url_index_list.append('https://mp.weixin.qq.com/s/yj1BT3jWyxLjlEnzz0vEtQ')

        with open('/Users/0/Desktop/list.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                article_url_index_list.append(str(line).strip('\n'))

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print  'query : ' + article_url_index
            article_db = DB.query_row(
                    'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"开始抓取  {countert} 号文章,剩余{article_count}篇".format(countert=article_url_index,
                                                                                    article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if len(request_url_content) == 0:
                    random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue
                #article_info = Todo2ArticleParser(request_url_content).get_article_info()
                # article_info = HuXiuArticleParser(request_url_content).get_article_info()
                article_info = WeiXinArticleParser(request_url_content).get_article_info()
                # article_info = WallStreetArticleParser(request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def catch(account_id):
        # the key part is right here

        article_url_index_list = []
        #   Get the maximum page number
        url = 'http://www.taoguba.com.cn/Article/' + account_id + '/1'
        front_page_content = Http.get_content(url)
        star_page = 1

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                if str(line).__contains__('#'):
                    split_url = line.split('#')[0]
                    trgId = split_url.split('/')[-2]
                    if trgId == account_id:
                        pg = (split_url.split('/')[-1])
                        print pg
                        star_page = int(pg)

                        if star_page == 0:
                            star_page = 1
                        else:
                            print star_page

        max_page = 2
        dom = BeautifulSoup(front_page_content, "lxml")
        list_pcyc_l_ = dom.find_all('div', class_="left t_page01")
        try:
            for tgo_tgo_ in list_pcyc_l_:
                linkl = tgo_tgo_.findAll('a')
                tarUrl = linkl[0].get('href')
                max_page = int(tarUrl.split('/')[3])

        except IndexError as e:
            max_page = 1
        column_info = TGBColumnParser(front_page_content).get_column_info()

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the article URLs from each index page
        # star_page = 100
        for raw_front_page_index in range(star_page, max_page + 1):
            request_url = 'http://www.taoguba.com.cn/Article/' + account_id + '/' + str(
                raw_front_page_index)
            article_url_index_list.append(request_url)

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = TGBArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example #52
 def clear_index(self):
     topic_id_tuple = tuple(set(x['topic_id'] for x in self.topic_index_list))
     sql = 'DELETE  from TopicIndex where topic_id in ({})'.format((' ?,' * len(topic_id_tuple))[:-1])
     DB.cursor.execute(sql, topic_id_tuple)
     DB.commit()
     return
Example #53
    def login(self, account, password, captcha=''):
        content = Http.get_content('https://www.zhihu.com/')
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'登陆失败')
            Debug.logger.info(u'敲击回车重新发送登陆请求')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf',
                                  value=xsrf,
                                  domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True,
                'captcha': captcha
            }
        else:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True
            }

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: as long as it is present, Zhihu treats the request as non-script traffic
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email',
                                  data=post_data,
                                  extra_header=header)
        if not result:
            Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
            return False
        response = json.loads(result)

        if response['r'] == 0:
            print u'登陆成功!'
            print u'登陆账号:', account
            print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
            if raw_input() == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'帐号密码已保存,可通过修改config.json修改设置'
            else:
                Config.account, Config.password, Config.remember_account = '', '', False
                print u'跳过保存环节,进入下一流程'
            Config._save()
            cookie = self.get_cookie()
            DB.execute(
                'delete from LoginRecord')  # after a successful login, clear the old login records so a stale one is not picked up next time
            data = {}
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'登陆失败'
            Debug.print_dict(response)
            return False
Example #54
    def catch(account_id):
        # the key part is right here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number

        column_info = Todo3ColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "新能源汽车"
        column_info['article_count'] = 0
        column_info['follower_count'] = 0
        column_info['description'] = ''
        column_info['image_url'] = ''

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 1
        max_page = 1

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the article URLs from each index page
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('div',
                                            class_='list-border clearfix')
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    li = list_pcyc_li[0]

                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if not (ttt is None):
                        article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo3ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example #55
    def catch(account_id):
        # the key part is right here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number
        url = 'https://www.wuxiareview.com/category/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        column_info = WuXiaColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        max_page = 2
        if account_id == 'daidai':

            column_info[u'title'] = "吃瓜群众岱岱"
            max_page = 1
        elif account_id == 'gzmdzst':

            column_info[u'title'] = "顾子明的政事堂"
            max_page = 1
        else:

            column_info[u'title'] = "时文"
            max_page = 2

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   collect the article URLs from each index page
        for raw_front_page_index in range(0, max_page):
            request_url = u'https://www.wuxiareview.com/category/{}/{}/'.format(
                account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('article', class_="excerpt")
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        # print li.text
                        tarUrl = li.get('href')
                        ttt = str(tarUrl).split("#")[-1]
                        print ttt
                        if not (ttt is None):
                            article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = WuXiaArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
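
The index pass above pulls hrefs out of every article block with class "excerpt" using BeautifulSoup. A standalone sketch of that extraction on a small inline snippet (the markup is illustrative); it uses the stdlib html.parser so it runs without lxml:

    from bs4 import BeautifulSoup

    html = '''
    <article class="excerpt"><a href="https://www.wuxiareview.com/daidai/101/">post 1</a></article>
    <article class="excerpt"><a href="https://www.wuxiareview.com/daidai/102/">post 2</a></article>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    article_url_index_list = []
    for block in soup.find_all('article', class_='excerpt'):
        for link in block.find_all('a'):
            href = link.get('href')
            if href:
                article_url_index_list.append(href)
    # article_url_index_list now holds the two post URLs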
Example #56
    def login(self, account, password, captcha=''):
        if self.recipe_kind == 'zhihu':
            # here Zhihu's r parameter is a 13-digit unix timestamp
            unix_time_stp = str(int(1000 * time.time()))[0:13]
            content = Http.get_content('https://www.zhihu.com/')
        else:
            Debug.logger.error(u"登录中...未知网站类型错误")
            return
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'登陆失败')
            Debug.logger.info(u'敲击回车重新发送登陆请求')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True,
                'captcha': captcha
            }
        else:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True
            }

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: as long as it is present, Zhihu treats the request as non-script traffic
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
        if not result:
            Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
            return False
        response = json.loads(result)

        if response['r'] == 0:
            print u'\nlogin successfully...'
            print u'account:', account
            if self.from_ui is not True:
                print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
                remenber_account = raw_input()      # logins from the GUI remember the password by default
            else:
                remenber_account = 'yes'
            if remenber_account == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'帐号密码已保存,可通过修改config.json修改设置'
            else:
                Config.account, Config.password, Config.remember_account_set = '', '', False
                print u'跳过保存环节,进入下一流程'
            Config._save()
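            # replace any previous login record with the fresh cookie so later runs can reuse this session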
            cookie = self.get_cookie()
            DB.execute('delete from LoginRecord')  # after a successful login, clear the old login records so stale data is not picked up next time
            data = dict()
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'登陆失败'
            Debug.print_dict(response)
            return False
Ejemplo n.º 57
0
    def catch(account_id):
        # the key logic starts here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10
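        # note: these sleep settings are not referenced later in this snippet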

        article_url_index_list = []
        #   determine the maximum page number

        star_page = 1
        max_page = 1
        column_info = Todo1ColumnParser("").get_column_info()
        column_info[u'column_id'] = account_id

        with open('ReadList.txt', 'r') as read_list:
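            # each line is expected to look like <url>#<title>#<max_page> (format inferred from the parsing below)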
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if str(split_url).__contains__(account_id):
                    # Config.now_id_likeName = line.split('#')[1]
                    max_page = int(line.split('#')[-1]) + 1
                    column_info[u'title'] = str(line.split('#')[1])

                    # max_page = 1
                    print max_page

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the URL of every article on each listing page
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'https://www.guancha.cn/{}/list_{}.shtml'.format(
                account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            # iterate over a copy of the keys so entries can be removed from index_work_set inside the loop
            for raw_front_page_index in list(index_work_set):
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
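                # article links on the listing page sit inside <h4 class="module-title"> headings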
                list_p_list = soup.find_all('h4', class_="module-title")
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        ttt = li.get('href')
                        print ttt
                        if ttt is not None:
                            # turn the relative href into an absolute URL, inserting the "_s"
                            # suffix (presumably guancha.cn's single-page, full-text view)
                            ss = str(ttt).split('.')
                            article_url_index_list.append(
                                u"https://www.guancha.cn{}_s.{}".format(
                                    ss[0], ss[1]))

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            # iterate over a copy of the keys so entries can be removed from index_work_set inside the loop
            for article_url_index in list(index_work_set):
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo1ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def query_article(self, article_id):
        print u"query_article {}".format(article_id)
        raw_article = DB.query_row(u'select * from Article where article_id="{article_id}" '.format(article_id=article_id))
        article = self.format_article(raw_article)
        return article
    def catch(account_id):
        # the key logic starts here

        mock_sleep_time = 0.5
        base_sleep_time = 1
        max_sleep_time = 1

        article_url_index_list = []
        #   determine the maximum page number

        url = 'http://www.jintiankansha.me/tag/{}?page=1'.format(account_id)

        column_info = JinWanKanSaEmptColumnParser('').get_column_info()

        column_info[u'column_id'] = account_id
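        # default title and page count; both are overwritten below when account_id is found in ReadList.txt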
        dt = datetime.datetime.now()
        column_info[u'title'] = u"AI_{}".format(dt.strftime("%Y-%m-%d"))
        max_page = 1

        typeToTry = 'tag'

        with open('ReadList.txt', 'r') as read_list:
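            # each line is expected to look like <url>#<title>#<max_page>#<type> (format inferred from the parsing below)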
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    dt = datetime.datetime.now()
                    column_info[u'title'] = u"{}_{}".format(
                        line.split('#')[1], dt.strftime("%Y-%m-%d"))

                    max_page = int(line.split('#')[2])

                    typeToTry = str(int(line.split('#')[-1])).strip('\n')

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   collect the URL of every article on each listing page
        for raw_front_page_index in range(0, max_page + 1):
            # request_url = u'http://www.jintiankansha.me/column/{}?page={}'.format(account_id, raw_front_page_index)
            request_url = u'http://www.jintiankansha.me/{}/{}?page={}'.format(
                typeToTry, account_id, raw_front_page_index)
            print request_url
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            # iterate over a copy of the keys so entries can be removed from index_work_set inside the loop
            for raw_front_page_index in list(index_work_set):
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'html.parser')
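                # article links on the listing page are wrapped in <span class="item_title"> elements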
                list_p_list = soup.find_all('span', class_="item_title")

                for tgo_right in list_p_list:
                    for link in tgo_right.findAll('a'):
                        ttt = link.get('href')
                        print ttt
                        # check for None before converting to str, otherwise the guard can never trigger
                        if ttt is not None:
                            article_url_index_list.append(str(ttt))

                del index_work_set[raw_front_page_index]

        # article_url_index_list.append('http://www.jintiankansha.me/t/u8MygoqKI8')

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            # iterate over a copy of the keys so entries can be removed from index_work_set inside the loop
            for article_url_index in list(index_work_set):
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取  {countert} 号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
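                # throttle: short fixed pause after each fetch, plus a randomized back-off when the page came back empty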
                time.sleep(mock_sleep_time)
                if not request_url_content:  # also covers the case where get_content returns None
                    random_sleep_time = base_sleep_time + random.randint(
                        0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue
                article_info = JinWanKanSaArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return