Beispiel #1
0
        def parse_question(command):
            """
            Build the SingleTask describing a single Zhihu question.

            :param command: text handed to Match.question; the returned
                object must expose a 'question_id' group
            :return: SingleTask with kind, spider.href, book.kind and the
                book.sql info/question/answer filters filled in
            """
            question_id = Match.question(command).group('question_id')

            task = SingleTask()
            task.kind = 'question'
            task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)

            task.book.kind = 'question'
            # The info filter is padded with a space on each side, unlike the
            # question/answer filters below; keep the two literals distinct.
            task.book.sql.info = ' question_id = "{}" '.format(question_id)

            # Question and answer lookups use exactly the same filter text.
            id_filter = 'question_id = "{}"'.format(question_id)
            for field in ('question', 'answer'):
                setattr(task.book.sql, field, id_filter)
            return task
Beispiel #2
0
 def parse_column(command):
     """
     Build the SingleTask describing a Zhihu column (zhuanlan).

     :param command: text handed to Match.column; the returned object
         must expose a 'column_id' group
     :return: SingleTask with kind, spider.href, book.kind and the
         book.sql queries filled in (the question query is left empty)
     """
     kind = 'column'
     href_template = 'https://zhuanlan.zhihu.com/{}'
     # Both query templates end with a trailing space, as in the stored text.
     info_template = 'select * from ColumnInfo where column_id = "{}" '
     article_template = 'select * from Article where column_id = "{}" '

     column_id = Match.column(command).group('column_id')

     task = SingleTask()
     task.kind = kind
     task.spider.href = href_template.format(column_id)
     task.book.kind = kind
     task.book.sql.info = info_template.format(column_id)
     # No question query is set for a column; the answer slot holds the Article query.
     task.book.sql.question = ''
     task.book.sql.answer = article_template.format(column_id)
     return task
Beispiel #3
0
 def parse_article(command):
     """
     Build the SingleTask describing one article inside a Zhihu column.

     :param command: text handed to Match.article; the returned object
         must expose 'column_id' and 'article_id' groups
     :return: SingleTask with kind, spider.href, book.kind and the
         book.sql filters filled in (the question filter is left empty)
     """
     matched = Match.article(command)
     ids = (matched.group('column_id'), matched.group('article_id'))

     task = SingleTask()
     task.kind = 'article'
     task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(*ids)
     task.book.kind = 'article'

     # info and answer share one filter (padded with a space on each side).
     article_filter = ' column_id = "{}" and article_id = "{}" '.format(*ids)
     task.book.sql.info = article_filter
     task.book.sql.question = ''
     task.book.sql.answer = article_filter
     return task
Beispiel #4
0
 def parse_topic(command):
     """
     Build the SingleTask describing a Zhihu topic.

     :param command: text handed to Match.topic; the returned object
         must expose a 'topic_id' group
     :return: SingleTask with kind, spider.href, book.kind and the three
         book.sql queries (info, question, answer) filled in
     """
     topic_id = Match.topic(command).group('topic_id')

     task = SingleTask()
     task.kind = 'topic'
     task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
     task.book.kind = 'topic'

     # (attribute on task.book.sql, query template), in assignment order.
     # Answers are selected through TopicIndex.href; questions are selected
     # through the question_id of those same answers.
     query_templates = (
         ('info',
          'select * from TopicInfo where topic_id = "{}"'),
         ('question',
          'select * from Question where question_id in (select question_id from Answer where href in (select href from TopicIndex where topic_id = "{}"))'),
         ('answer',
          'select * from Answer where href in (select href from TopicIndex where topic_id = "{}")'),
     )
     for field, template in query_templates:
         setattr(task.book.sql, field, template.format(topic_id))
     return task
Beispiel #5
0
 def parse_collection(command):
     """
     Build the SingleTask describing a Zhihu collection.

     :param command: text handed to Match.collection; the returned object
         must expose a 'collection_id' group
     :return: SingleTask with kind, spider.href, book.kind and the three
         book.sql queries (info, question, answer) filled in
     """
     kind = 'collection'
     info_template = 'select * from CollectionInfo where collection_id = "{}"'
     # Questions are found via the answers whose href is listed in CollectionIndex.
     question_template = 'select * from Question where question_id in (select question_id from Answer where href in (select href from CollectionIndex where collection_id = "{}"))'
     answer_template = 'select * from Answer where href in (select href from CollectionIndex where collection_id = "{}")'

     collection_id = Match.collection(command).group('collection_id')

     task = SingleTask()
     task.kind = kind
     task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
     task.book.kind = kind
     task.book.sql.info = info_template.format(collection_id)
     task.book.sql.question = question_template.format(collection_id)
     task.book.sql.answer = answer_template.format(collection_id)
     return task
Beispiel #6
0
 def parse_author(command):
     """
     Build the SingleTask describing a Zhihu author (people page).

     :param command: text handed to Match.author; the returned object
         must expose an 'author_id' group
     :return: SingleTask with kind, spider.href, book.kind and the three
         book.sql queries (info, question, answer) filled in
     """
     author_id = Match.author(command).group('author_id')

     task = SingleTask()
     task.kind = 'author'
     task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
     task.book.kind = 'author'

     # (attribute on task.book.sql, query template), in assignment order.
     query_templates = (
         ('info',
          'select * from AuthorInfo where author_id = "{}"'),
         ('question',
          'select * from Question where question_id in (select question_id from Answer where author_id = "{}")'),
         ('answer',
          'select * from Answer where author_id = "{}"'),
     )
     for field, template in query_templates:
         setattr(task.book.sql, field, template.format(author_id))
     return task
Beispiel #7
0
        def parse_jianshu(command):
            u"""
            Build the SingleTask describing a Jianshu user's articles.

            :param command: text handed to Match.jianshu; the returned object
                must expose a 'jianshu_id' group. Presumably the address of a
                Jianshu user's page (the previous comment described a Sina
                blog address, which looks like a copy-paste slip - confirm).
            :return: task: SingleTask with author_id, kind, spider.href,
                book.kind, book.sql.info_extra, book.sql.article_extra and
                book.author_id filled in
            """
            user_id = Match.jianshu(command).group('jianshu_id')
            kind = 'jianshu'

            task = SingleTask()
            task.author_id = user_id
            task.kind = kind
            task.spider.href = 'http://www.jianshu.com/users/{}/latest_articles'.format(user_id)

            book = task.book
            book.kind = kind
            # The same id is stored under two column names: creator_id for the
            # info filter and author_id for the article filter.
            book.sql.info_extra = 'creator_id = "{}"'.format(user_id)
            book.sql.article_extra = 'author_id = "{}"'.format(user_id)
            book.author_id = user_id
            return task
Beispiel #8
0
        def parse_SinaBlog(command):
            u"""
            Build the SingleTask describing a Sina blog author.

            :param command: text handed to Match.SinaBlog; the returned object
                must expose a 'SinaBlog_people_id' group. Per the original
                comment, this is the home page address of a Sina blog author.
            :return: task: SingleTask with author_id, kind, the three spider
                hrefs (article list, home page, profile), book.kind,
                book.sql.info_extra, book.sql.article_extra and
                book.author_id filled in
            """
            author_id = Match.SinaBlog(command).group('SinaBlog_people_id')
            Debug.logger.debug(u"SinaBlog_people_id:" + str(author_id))

            task = SingleTask()
            task.author_id = author_id
            task.kind = 'SinaBlog'

            # (attribute on task.spider, URL template), in assignment order.
            spider_links = (
                ('href_article_list', 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'),
                ('href', 'http://blog.sina.com.cn/u/{}'),
                ('href_profile', 'http://blog.sina.com.cn/s/profile_{}.html'),
            )
            for attr, template in spider_links:
                setattr(task.spider, attr, template.format(author_id))

            task.book.kind = 'SinaBlog'
            # The same id is stored under two column names: creator_id for the
            # info filter and author_id for the article filter.
            task.book.sql.info_extra = 'creator_id = "{}"'.format(author_id)
            task.book.sql.article_extra = 'author_id = "{}"'.format(author_id)
            task.book.author_id = author_id
            # Log the value read back from the task rather than the local.
            Debug.logger.debug(u"在parse_SinaBlog中, task.book.author_id为" + str(task.book.author_id))
            return task