Beispiel #1
0
    def parser(self, url, logger):
        """Request *url* and keep the parsed page on ``self.content``.

        Loops until a terminal status code arrives: 404 marks the
        target as deleted via ``self.is_del``, 200 stores a
        BeautifulSoup tree on ``self.content``, 410 just logs the
        loss.  Network failures rotate the proxy (``get_IP``) and
        retry; any other status code retries with the same proxy.
        """
        while True:
            try:
                response = requests.get(url,
                                        headers=self.headers,
                                        timeout=5,
                                        proxies=self.current_proxy)
                time.sleep(3)
                status = response.status_code
                logger.info('请求状态码' + str(status))
                if status == 404:
                    # Account gone -- caller is expected to check is_del.
                    logger.warning('该用户被删!无法获得用户信息!!!')
                    self.is_del = True
                    return
                if status == 200:
                    self.content = BeautifulSoup(response.content, "lxml")
                    return
                if status == 410:
                    logger.warning('资源丢失')
                    return
            except Exception as err:
                logger.error('请求出错!' + str(err))
                self.current_proxy = get_IP()
                logger.warning('切换ip代理!中断3秒!')
                time.sleep(3)
Beispiel #2
0
 def parser(self, i, url, logger):
     """Fetch *url* and store the parsed HTML on ``self.content``.

     Args:
         i: index of the item being crawled (unused in this variant;
            kept for signature compatibility with sibling parsers).
         url: page to request.
         logger: logger receiving progress / error messages.

     Side effects: sets ``self.is_del`` on 404, ``self.content`` on
     200, swaps cookies on 401, and terminates the process when no
     spare cookies remain or an unexpected status code arrives.
     Network errors rotate the proxy and retry indefinitely.
     """
     while 1:
         try:
             r = requests.get(url,
                              headers=self.headers,
                              timeout=5,
                              proxies=self.current_proxy)
             time.sleep(3)
             logger.info('请求状态码' + str(r.status_code))
             if r.status_code == 404:
                 logger.warning('该用户被删!无法获得用户信息!!!')
                 self.is_del = True
                 return
             elif r.status_code == 200:
                 self.content = BeautifulSoup(r.content, "lxml")
                 return
             elif r.status_code == 401:
                 logger.info('Cookie过期,正在更换')
                 # Fix: use a context manager so the cookie file is
                 # closed (the original bare open() leaked the handle).
                 with open('Cookies/ask_topics_cookies.txt', "r") as f:
                     Lines = f.readlines()
                 if not Lines:
                     logger.info('备用Cookies用完!')
                     self.delLogger(logger)
                     sys.exit(0)
                 else:
                     self.change_cookie()
             else:
                 # Unexpected status: give up instead of looping forever.
                 self.delLogger(logger)
                 sys.exit(0)
         except Exception as e:
             logger.error('请求出错!' + str(e))
             self.current_proxy = get_IP()
             logger.warning('切换ip代理!中断3秒!')
             time.sleep(3)
             continue
Beispiel #3
0
    def get_askTopic(self):
        """Crawl the topic tags of each user's question page and save
        them to the ``Ask_Topics`` Mongo collection.

        Iterates indices ``self.start``..``self.end`` over hrefs loaded
        from the ``asktopics_href`` collection, writing a progress line
        to ``self.file`` per item.  Fix: replaced non-idiomatic
        ``== None`` / ``== True`` singleton comparisons with ``is None``
        and truthiness (PEP 8); behavior is unchanged.
        """
        self.copycookies()
        self.get_createpoint()
        # Walk users starting from the User_Asks collection.
        items = self.mongo.db.asktopics_href.find()
        for item in items:
            self.href_list.append(item.get('href'))
        self.current_proxy = get_IP()
        self.get_cookie()

        # Digits-only timestamp used in the log file name.
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.is_del = False
            self.content = None
            # Append a progress record (type, index, end, date) to the
            # checkpoint file.
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)

            user_url = self.href_list[i].replace('questions', 'question')

            logfielname = '/log/' + dt + '_ask_Topics' + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) +
                            '个用户的提问所属话题').getlog()

            if self.content is None:
                self.parser(i, user_url, logger)

            if self.is_del:
                self.delLogger(logger)
                continue
            else:
                soup = self.content

            # Topics attached to the question.
            topics = []
            if soup.find('div', {'class': 'Tag QuestionTopic'}) is None:
                logger.warning('该提问问题没有添加所属话题标签!')
            else:
                items = soup.findAll('div', {'class': 'Tag QuestionTopic'})

                for item in items:
                    topics.append(item.get_text())

            data_plus = {"href": user_url, "topics": topics}

            self.mongo.db.Ask_Topics.insert(data_plus)

            logger.info('已获取用户的记录信息')
            self.delLogger(logger)
            # NOTE(review): closing the Mongo client inside the loop looks
            # wrong -- later iterations rely on an implicit reconnect.
            # Left as-is to preserve behavior; confirm with the driver docs.
            self.mongo.client.close()
Beispiel #4
0
 def get_log(self):
     """Crawl the edit log of every question in ``self.questionUrl_list``
     between indices ``self.start`` and ``self.end`` and store each
     log in the ``question_log`` Mongo collection.
     """
     self.copycookies()
     self.get_createpoint()
     self.questionUrl_list = extract_questionUrl()
     self.current_proxy = get_IP()
     self.get_cookie()
     # Digits-only timestamp used in the log file name.
     dt = re.sub(r'[^0-9]', '',
                 str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
     for i in xrange(self.start, self.end):
         self.log = []
         self.is_del = False
         self.content = None
         # Append a progress record (type, index, end, date) to the
         # checkpoint file.
         self.file.seek(0, 2)
         dt1 = re.sub(r'[^0-9]', '',
                      str(datetime.datetime.now().strftime('%Y-%m-%d')))
         News = self.type + ',' + str(i + 1) + ',' + str(
             self.end) + ',' + str(dt1) + '\n'
         self.file.write(News)
         self.question_url = self.questionUrl_list[i]
         self.id = self.question_url.replace('/question/', '')
         self.log_url = self.url_domain + self.question_url + '/log'
         # Log file is named after this function via sys._getframe(),
         # so this statement must stay inside get_log.
         logfielname = '/log/' + dt + sys._getframe(
         ).f_code.co_name + '.log'
         log_questionCount = '正在爬第' + str(i + 1) + '项问题的日志'
         logger = Logger(logfilename=logfielname,
                         logname=log_questionCount).getlog()
         self.parser(i, self.log_url, logger)
         if self.is_del == True:
             continue
         soup = self.content
         items = soup.findAll('div', class_='zm-item')
         for item in items:
             # Editor profile path, e.g. /people/devymex
             question_editor_url = item.find('a').get('href')
             # Full text of the log entry (utf-8 bytes).
             did = item.get_text().encode('utf-8')
             # Edit timestamp (str).
             did_time = item.find('time').get_text()
             data = {
                 'question_editor': question_editor_url,
                 'did': did,
                 'did_time': did_time
             }
             self.log.append(data)
         data_plus = {"question_id": self.id, "log": self.log}
         self.mongo.db.question_log.insert(data_plus)
         logger.info('成功保存数据!')
         self.delLogger(logger)
    def parser(self, i, url, logger):
        """Fetch *url* for the question-log crawler and store the parsed
        page on ``self.content``.

        Args:
            i: index of the item being crawled; on a cookie swap the
               1-based index is appended to a ``loseuser`` file so the
               item can be re-crawled.
            url: page to request.
            logger: logger receiving progress / error messages.

        Side effects: sets ``self.is_del`` on 404, ``self.content`` on
        200, swaps cookies on 401, and terminates the process when no
        spare cookies remain or an unexpected status code arrives.
        Network errors rotate the proxy and retry indefinitely.
        """
        while 1:
            try:
                r = requests.get(url,
                                 headers=self.headers,
                                 timeout=5,
                                 proxies=self.current_proxy)
                # Randomized delay to look less like a bot.
                time.sleep(random.randint(3, 5))
                logger.info('请求状态码' + str(r.status_code))
                if r.status_code == 404:
                    self.is_del = True
                    logger.warning('!!!该问题被删!!!')
                    self.delLogger(logger)
                    return
                elif r.status_code == 200:
                    self.content = BeautifulSoup(r.content, "lxml")
                    return
                elif r.status_code == 401:
                    logger.info('Cookie过期,正在更换')
                    # Fix: use a context manager so the cookie file is
                    # closed (the original bare open() leaked the handle).
                    with open('Cookies/question_content_cookies.txt',
                              "r") as f:
                        Lines = f.readlines()
                    if not Lines:
                        logger.info('备用Cookies用完!')
                        self.delLogger(logger)
                        sys.exit(0)
                    else:
                        self.change_cookie()
                        # Record which item lost its session for re-crawling.
                        with open(
                                'User/question_content_loseuser_' +
                                str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                else:
                    # Unexpected status: give up instead of looping forever.
                    self.delLogger(logger)
                    sys.exit(0)
            except Exception as e:
                logger.error(str(e))
                self.current_proxy = get_IP()
                logger.warning('切换ip代理!中断3秒!')
                time.sleep(3)
                continue
Beispiel #6
0
    def get_userinfo(self):
        """Crawl profile statistics (achievements, answer/ask counts,
        following/follower counts) for every commenter id loaded from
        the ``commenters_new`` collection and save them to
        ``commenters_info``.
        """
        self.copycookies()
        self.get_createpoint()
        items = self.mongo.db.commenters_new.find()
        for item in items:
            self.userID_list.append(item.get('user_id'))
        self.current_proxy = get_IP()
        self.get_cookie()
        # self.user_info = extract_commenters_info()
        # Digits-only timestamp used in the log file name.
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.is_del = False
            self.content = None
            # Append a progress record (type, index, end, date) to the
            # checkpoint file.
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.user_id = self.userID_list[i]
            # NOTE(review): 'url_damin' looks like a typo for 'url_domain'
            # -- the attribute is defined outside this view; confirm.
            user_url = self.url_damin + str(self.user_id) + '/following'
            logfielname = '/log/' + dt + 'commenters_' + sys._getframe(
            ).f_code.co_name + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) +
                            '个用户的记录信息').getlog()

            if self.content == None:
                self.parser(i, user_url, logger)

            if self.is_del == True:
                self.delLogger(logger)
                continue
            else:
                soup = self.content

            # Verification badges and personal achievements.
            achievements = []
            if soup.find('div', {'class': 'Profile-sideColumnItems'}) == None:
                # User has no achievements yet.
                logger.warning('ta还没有成就')
            else:
                items = soup.find('div', {
                    'class': 'Profile-sideColumnItems'
                }).findAll('div', class_='Profile-sideColumnItem')

                for ach in items:
                    achievements.append(ach.get_text())

            try:
                # Number of answers posted by the user.
                answer_count = soup.find('li', {
                    'aria-controls': 'Profile-answers'
                }).find('span', class_='Tabs-meta').get_text()

                # Number of questions asked by the user.
                ask_count = soup.find('li', {
                    'aria-controls': 'Profile-asks'
                }).find('span', class_='Tabs-meta').get_text()

                # Number of users this user follows.
                following_count = soup.findAll(
                    'a',
                    class_='Button NumberBoard-item Button--plain')[0].find(
                        'strong', class_='NumberBoard-itemValue').get_text()
                # Number of users following this user.
                follower_count = soup.findAll(
                    'a',
                    class_='Button NumberBoard-item Button--plain')[1].find(
                        'strong', class_='NumberBoard-itemValue').get_text()
            except Exception, e:
                # Scrape failed (layout change / deleted element): save the
                # bare id so the user is not silently dropped.
                logger.error('信息find失败!' + str(e))
                data_plus = {"user_id": self.user_id}
            else:
                data_plus = {
                    "user_id": self.user_id,
                    "achievement": achievements,
                    # Counts like "1,234" -- strip thousands separators.
                    "answer_count": int(answer_count.replace(',', '')),
                    "ask_count": int(ask_count.replace(',', '')),
                    "following_count": int(following_count.replace(',', '')),
                    "follower_count": int(follower_count.replace(',', ''))
                }

            self.mongo.db.commenters_info.insert(data_plus)

            logger.info('已获取用户的记录信息')
            self.delLogger(logger)
            # NOTE(review): closing the Mongo client inside the loop looks
            # wrong -- later iterations rely on an implicit reconnect; confirm.
            self.mongo.client.close()
Beispiel #7
0
    def get_following(self):
        """Crawl the followee list of every user id loaded from the
        ``followers`` collection (via the Zhihu v4 API, 20 per page)
        and save the pages to ``FR_followers_following``.
        """
        self.copycookies()
        self.get_createpoint()
        items = self.mongo.db.followers.find()
        for item in items:
            self.userID_list.append(item.get('user_id'))
        self.current_proxy = get_IP()
        self.get_cookie()
        # self.user_id_list = extract_last_followers()
        # Digits-only timestamp used in the log file name.
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.state = False
            self.user_id = self.userID_list[i]
            # Append a progress record (type, index, end, date) to the
            # checkpoint file.
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            logfielname = '/log/' + dt + 'followers_' + sys._getframe(
            ).f_code.co_name + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) + '个用户的关注了').getlog()

            # Paged API endpoint; '{0}' is the offset placeholder filled
            # in by .format() below.
            following_url = 'https://www.zhihu.com/api/v4/members/' + str(
                self.user_id
            ) + '/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20'
            following_count = 0

            # self.following_id_list = extract_followers_following(self.user_id, self.user_id_list)
            # if len(self.following_id_list) == 0:
            #     self.following_type = 0
            # else:
            #     self.following_type = 1

            # First request: fetch the total followee count.
            # NOTE(review): the URL still contains the literal '{0}'
            # placeholder here (no .format) -- apparently tolerated by
            # the API; confirm.
            while 1:
                try:
                    r = requests.get(following_url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 200:
                        following_count = j['paging']['totals']
                    elif r.status_code == 404:
                        self.is_del = True
                        logger.info('!!!该用户被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 401:
                        # NOTE(review): this file handle is never closed
                        # (leak) -- should use a `with` block.
                        f = open('Cookies/followers_following_cookies.txt',
                                 "r")
                        Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            self.delLogger(logger)
                            return
                        else:
                            self.change_cookie()
                            # Record which item lost its session for
                            # re-crawling.
                            with open(
                                    'User/followers_following_loseuser_' +
                                    str(self.fileNum) + '.txt', 'a+') as f1:
                                f1.write(str(i + 1) + '\n')
                    else:
                        self.delLogger(logger)
                        return

                except Exception, e:
                    logger.error('查看回答数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(3)
                    continue

                else:
                    # Runs after ANY request that did not raise.
                    # NOTE(review): after a 401 cookie swap this branch
                    # still runs with following_count == 0 and records
                    # the user as having no followees -- confirm intended.
                    # Users with no followees are saved as well.
                    if following_count == 0:
                        logger.warning('用户没有关注者!')
                        self.delLogger(logger)
                        data_plus = {
                            'user_id': self.user_id,
                            "following_count": following_count
                        }
                        self.mongo.db.FR_followers_following.insert(data_plus)
                        break
                    # NOTE(review): self.following_type is only assigned in
                    # the commented-out block above -- unless it is set
                    # elsewhere in the class this raises AttributeError.
                    elif self.following_type == 0 and following_count >= 4000:
                        logger.warning('用户关注了数大于4000!')
                        self.delLogger(logger)
                        data_plus = {
                            'user_id': self.user_id,
                            "following_count": following_count
                        }
                        self.mongo.db.FR_followers_following.insert(data_plus)
                        break
                    else:
                        # Page through all followees, 20 at a time, saving
                        # one document per page.
                        offset = 0
                        while 1:
                            try:
                                soup = requests.get(following_url.format(
                                    str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception, e:
                                logger.error('请求关注者出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(3)
                                continue
                            else:
                                following_data = soup.json()
                                data = following_data.get('data')
                                logger.info(
                                    'is_end?' +
                                    str(following_data['paging']['is_end']))
                                if following_data['paging']['is_end']:
                                    # Final page: save and stop paging.
                                    following_list = []
                                    # NOTE(review): this inner `i` shadows
                                    # the outer user index.
                                    for i in range(0, len(data)):
                                        following_id = data[i][
                                            'url_token']  # user ID
                                        following_info = data[i]  # full record

                                        info = {
                                            "following_id": following_id,
                                            "following_info": following_info
                                        }
                                        following_list.append(info)
                                    data_plus = {
                                        'user_id': self.user_id,
                                        "following_count": following_count,
                                        # "follower_type":self.following_type,
                                        "following": following_list
                                    }
                                    self.mongo.db.FR_followers_following.insert(
                                        data_plus)

                                    logger.info('已获得所有关注了用户!')
                                    logger.info('成功保存数据!')
                                    self.delLogger(logger)
                                    break
                                else:
                                    # Intermediate page: save it and advance
                                    # the offset.
                                    offset = offset + 20
                                    following_list = []
                                    for i in range(0, len(data)):
                                        following_id = data[i][
                                            'url_token']  # user ID
                                        # if following_id in self.following_id_list:
                                        #     self.state = True
                                        following_info = data[i]  # full record

                                        info = {
                                            "following_id": following_id,
                                            "following_info": following_info
                                        }
                                        following_list.append(info)
                                    data_plus = {
                                        'user_id': self.user_id,
                                        "following_count": following_count,
                                        # "follower_type":self.following_type,
                                        "following": following_list
                                    }
                                    self.mongo.db.FR_followers_following.insert(
                                        data_plus)
                                    # if self.state:
                                    #     self.delLogger(logger)
                                    #     break
                        self.delLogger(logger)
                        # NOTE(review): closing the Mongo client inside the
                        # loop relies on implicit reconnect; confirm.
                        self.mongo.client.close()
                        break
Beispiel #8
0
    def get_voters(self):
        """Crawl the voters (upvoters) of every answer id returned by
        ``extract_answerID`` (Zhihu v4 API, 10 per page) and save the
        pages to ``answer_voters``.  Crawling of an answer stops early
        when a voter already present in ``self.voter_id_list`` is seen
        (incremental update).
        """
        self.copycookies()
        self.answerID_list = extract_answerID()
        print len(self.answerID_list)
        self.get_createpoint()
        self.current_proxy = get_IP()
        self.get_cookie()
        # Digits-only timestamp used in the log file name.
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.state = False
            # Append a progress record (type, index, end, date) to the
            # checkpoint file.
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.answer_id = self.answerID_list[i]
            logfielname = '/log/' + dt + 'answer_' + sys._getframe(
            ).f_code.co_name + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) + '项回答的点赞者').getlog()
            # Paged API endpoint; '{0}' is the offset placeholder filled
            # in by .format() below.
            voters_url = 'https://www.zhihu.com/api/v4/answers/' + str(
                self.answer_id
            ) + '/voters?include=data%5B%2A%5D.answer_count%2Carticles_count%2Cfollower_count%2Cgender%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=10&offset={0}'
            voter_num = 0
            # Voters already stored from a previous crawl of this answer.
            self.voter_id_list = extract_answer_voters(self.answer_id)
            # if len(self.voter_id_list) == 0:
            #     self.answer_type = 0
            # else:
            #     self.answer_type = 1
            # First request: fetch the total voter count.
            # NOTE(review): the URL still contains the literal '{0}'
            # placeholder here (no .format) -- apparently tolerated by
            # the API; confirm.
            while 1:
                try:
                    r = requests.get(voters_url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 404:
                        self.is_del = True
                        logger.info('!!!该回答被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 200:
                        j = r.json()
                        voter_num = j['paging']['totals']
                    elif r.status_code == 401:
                        logger.info('Cookie过期,正在更换')
                        # NOTE(review): this file handle is never closed
                        # (leak) -- should use a `with` block.
                        f = open('Cookies/answer_voters_cookies.txt', "r")
                        Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            self.delLogger(logger)
                            return
                        else:
                            self.change_cookie()
                            # Record which item lost its session for
                            # re-crawling.
                            with open(
                                    'User/answer_voters_loseuser_' +
                                    str(self.fileNum) + '.txt', 'a+') as f1:
                                f1.write(str(i + 1) + '\n')
                    else:
                        self.delLogger(logger)
                        return
                except Exception, e:
                    logger.error('获取点赞者数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(3)
                    continue

                else:
                    # Runs after ANY request that did not raise.
                    # NOTE(review): after a 401 cookie swap this branch
                    # still runs with voter_num == 0 and records the
                    # answer as having no voters -- confirm intended.
                    if voter_num == 0:
                        logger.warning('回答没有点赞者!')
                        data_plus = {
                            'answer_id': self.answer_id,
                            "voter_num": 0
                        }
                        self.mongo.db.answer_voters.insert(data_plus)
                        self.delLogger(logger)
                        break
                    # elif self.answer_type == 0 and voter_num >= 4000:
                    #     logger.warning('回答点赞数大于4000!')
                    #     self.delLogger(logger)
                    #     data_plus = {'user_id': self.answer_id, "voter_num": voter_num, "answer_type":self.answer_type}
                    #     self.mongo.db.answer_voters.insert(data_plus)
                    #     break
                    else:
                        # Page through the voters, 10 at a time, saving one
                        # document per page.
                        offset = 0
                        while 1:
                            try:
                                soup = requests.get(voters_url.format(
                                    str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception, e:
                                logger.error('请求点赞者出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(5)
                                continue
                            else:
                                voters_data = soup.json()
                                data = voters_data['data']
                                logger.info(
                                    'is_end?' +
                                    str(voters_data['paging']['is_end']))
                                if voters_data['paging']['is_end']:
                                    # Final page: collect new voters until an
                                    # already-known one appears.
                                    voter_list = []
                                    # NOTE(review): this inner `i` shadows
                                    # the outer answer index.
                                    for i in range(0, len(data)):
                                        voter_url = data[i]['url_token']  # user ID
                                        voter_info = data[i]  # full record

                                        info = {
                                            "voter_id": voter_url,
                                            "voter_info": voter_info
                                        }
                                        if voter_url in self.voter_id_list:
                                            break
                                        voter_list.append(info)
                                    data_plus = {
                                        'answer_id': self.answer_id,
                                        "voter_num": voter_num,
                                        # "answer_type":self.answer_type,
                                        "voters": voter_list
                                    }
                                    self.mongo.db.answer_voters.insert(
                                        data_plus)

                                    logger.info('已获得所有新增点赞者!')
                                    break
                                else:
                                    # Intermediate page: save new voters,
                                    # advance the offset, and stop paging if
                                    # an already-known voter was reached.
                                    voter_list = []
                                    offset = offset + 10
                                    for i in range(0, len(data)):
                                        voter_url = data[i]['url_token']  # user ID
                                        voter_info = data[i]  # full record

                                        info = {
                                            "voter_id": voter_url,
                                            "voter_info": voter_info
                                        }
                                        if voter_url in self.voter_id_list:
                                            self.state = True
                                            break
                                        voter_list.append(info)
                                    data_plus = {
                                        'answer_id': self.answer_id,
                                        "voter_num": voter_num,
                                        # "answer_type":self.answer_type,
                                        "voters": voter_list
                                    }
                                    self.mongo.db.answer_voters.insert(
                                        data_plus)
                                    if self.state:
                                        self.delLogger(logger)
                                        break

                        logger.info('所有数据成功保存!')
                        self.delLogger(logger)
                        break
Beispiel #9
0
    def get_Topics(self):
        self.copycookies()
        self.get_createpoint()
        items = self.mongo.db.followers.find()
        for item in items:
            self.userID_list.append(item.get('user_id'))
        self.current_proxy = get_IP()
        self.get_cookie()
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.user_id = self.userID_list[i]
            logfielname = '/log/' + dt + 'followers_' + sys._getframe(
            ).f_code.co_name + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) +
                            '个用户的关注话题').getlog()
            topics_url = 'https://www.zhihu.com/api/v4/members/' + str(
                self.user_id
            ) + '/following-topic-contributions?include=data%5B*%5D.topic.introduction&offset={0}&limit=20'
            topics_count = 0

            while 1:
                try:
                    r = requests.get(topics_url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 200:
                        j = r.json()
                        topics_count = j['paging']['totals']
                    elif r.status_code == 404:
                        self.is_del = True
                        logger.info('!!!该用户被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 401:
                        logger.info('Cookie过期,正在更换')
                        f = open('Cookies/followers_topics_cookies.txt', "r")
                        Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            self.delLogger(logger)
                            return
                        else:
                            self.change_cookie()
                            with open(
                                    'User/followers_topics_loseuser_' +
                                    str(self.fileNum) + '.txt', 'a+') as f1:
                                f1.write(str(i + 1) + '\n')
                    else:
                        self.delLogger(logger)
                        return

                except Exception, e:
                    logger.error('查看回答数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(3)
                    continue

                else:
                    # 没有关注者的用户也要保存一下
                    if topics_count == 0:
                        logger.warning('用户没有关注话题!')
                        self.delLogger(logger)
                        data_plus = {
                            'user_id': self.user_id,
                            "topics_count": 0
                        }
                        self.mongo.db.followers_topics.insert(data_plus)
                        break
                    else:
                        offset = 0
                        while 1:
                            try:
                                soup = requests.get(topics_url.format(
                                    str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception, e:
                                logger.error('请求关注话题出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(3)
                                continue
                            else:
                                topics_data = soup.json()
                                data = topics_data.get('data')
                                logger.info(
                                    'is_end?' +
                                    str(topics_data['paging']['is_end']))
                                if topics_data['paging']['is_end']:
                                    topic_list = []
                                    for i in range(0, len(data)):
                                        info = {
                                            "name":
                                            data[i]['topic']['name'],
                                            "contributions_count":
                                            data[i]['contributions_count']
                                        }
                                        topic_list.append(info)
                                    data_plus = {
                                        'user_id': self.user_id,
                                        "topics_count": topics_count,
                                        "topic": topic_list
                                    }

                                    self.mongo.db.followers_topics.insert(
                                        data_plus)

                                    logger.info('已获得所有用户关注话题!')
                                    logger.info('成功保存数据!')
                                    self.delLogger(logger)
                                    break
                                else:
                                    offset = offset + 20
                                    topic_list = []
                                    for i in range(0, len(data)):
                                        info = {
                                            "name":
                                            data[i]['topic']['name'],
                                            "contributions_count":
                                            data[i]['contributions_count']
                                        }
                                        topic_list.append(info)
                                    data_plus = {
                                        'user_id': self.user_id,
                                        "topics_count": topics_count,
                                        "topic": topic_list
                                    }

                                    self.mongo.db.followers_topics.insert(
                                        data_plus)

                        self.mongo.client.close()
                        break
Beispiel #10
0
    def get_question_content(self):
        """Crawl title, topics and counters for each question in the work range.

        Iterates over self.questionUrl_list[self.start:self.end].  Each
        question page is fetched through self.parser() (which retries and
        rotates proxies/cookies); the title, its topics, the answer count,
        the follower count and the visit count are then scraped from the
        resulting BeautifulSoup document and stored as one record in the
        question_content mongo collection.  Deleted questions (404) are
        skipped.  A progress checkpoint line is appended to self.file before
        every question so an interrupted run can resume.
        """
        self.copycookies()
        self.get_createpoint()
        self.questionUrl_list = extract_questionUrl()
        self.current_proxy = get_IP()
        self.get_cookie()
        # run timestamp, digits only, used to build a unique log file name
        dt = re.sub(r'[^0-9]', '', str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in range(self.start, self.end):
            self.topic_list = []
            self.is_del = False
            self.content = None
            # checkpoint line: "<type>,<1-based index>,<end>,<yyyymmdd>"
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '', str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.question_url = self.questionUrl_list[i]
            self.id = self.question_url.replace('/question/', '')
            logfielname = '/log/' + dt + sys._getframe().f_code.co_name + '.log'
            log_questionCount = '正在爬第' + str(i + 1) + '项问题的内容'
            logger = Logger(logfilename=logfielname, logname=log_questionCount).getlog()
            self.url = self.url_domain + self.question_url
            if self.content is None:
                self.parser(i, self.url, logger)
            if self.is_del:
                # question was deleted on the site -- nothing to scrape
                continue

            soup = self.content

            # question title
            # BUG FIX: attrs must be a dict; the original passed the set
            # {'class', '...'}, which only worked through BeautifulSoup's
            # lenient handling of non-dict attrs and also matched the literal
            # class name "class".
            self.title = soup.find('h1', {'class': 'QuestionHeader-title'}).get_text()

            # topics the question is tagged with
            items = soup.find_all('div', class_='Tag QuestionTopic')
            for item in items:
                self.topic_list.append(item.get_text())

            # answer count; the header element is absent when there are no
            # answers (hoisted the duplicated find() into a single lookup)
            header = soup.find('h4', {'class': 'List-headerText'})
            if header is None:
                self.answer_num = 0
            else:
                self.answer_num = int(re.search(r'^\d+', header.get_text()).group())

            # visit count and follower count (comma-grouped digit strings)
            self.visit_num = int(soup.findAll('strong', class_='NumberBoard-itemValue')[1].get_text().replace(',', ''))
            self.followers_num = int(soup.findAll('strong', class_='NumberBoard-itemValue')[0].get_text().replace(',', ''))

            data = {
                "question_url": self.id,
                "title": self.title,
                "belong_topics": self.topic_list,
                "answer_num": self.answer_num,
                "followers_num": self.followers_num,
                "visit_num": self.visit_num
            }

            self.mongo.db.question_content.insert(data)
            logger.info('成功保存数据!')
            self.delLogger(logger)
Beispiel #11
0
    def get_Answer(self):
        """Crawl every answer posted by each user in self.userID_list[start:end].

        The answers API is probed once per user to learn the total answer
        count, then answers are paged 20 at a time.  For every answer the
        question page is fetched as well to scrape the question's topics and
        its answer count.  Each page of answers is stored as one document in
        the User_Answer mongo collection; users with zero answers get a
        placeholder document.  Deleted users (404) and gone resources (410)
        are skipped; on 401 a spare cookie is consumed and the user index is
        appended to a "loseuser" file so it can be re-crawled later.
        """
        self.copycookies()
        self.get_createpoint()
        items = self.mongo.db.answerers_1.find()
        for item in items:
            self.userID_list.append(item.get('user_id'))
        self.current_proxy = get_IP()
        self.get_cookie()
        # run timestamp, digits only, used in the log file name
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            # append a progress checkpoint so an interrupted run can resume
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.user_id = self.userID_list[i]
            logfielname = '/log/' + dt + '_user_Topics' + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) + '个用户的回答').getlog()
            answer_url = 'https://www.zhihu.com/api/v4/members/' + str(
                self.user_id
            ) + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
            answer_count = 0

            # first request: probe the total answer count for this user
            while 1:
                try:
                    r = requests.get(answer_url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 200:
                        j = r.json()
                        answer_count = j['paging']['totals']
                    elif r.status_code == 404:
                        # the user account was deleted
                        self.is_del = True
                        logger.info('!!!该用户被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 401:
                        # cookie expired -- switch to a spare one.
                        # NOTE(review): this handle is never closed, and after
                        # changing the cookie control falls through to the
                        # "else" below with answer_count == 0, so the user is
                        # stored as having no answers; the index written to
                        # the loseuser file is presumably meant for re-crawling.
                        logger.info('Cookie过期,正在更换')
                        f = open('Cookies/user_answer_cookies.txt', "r")
                        Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            self.delLogger(logger)
                            return
                        else:
                            self.change_cookie()
                            with open(
                                    'User/user_answer_loseuser_' +
                                    str(self.fileNum) + '.txt', 'a+') as f1:
                                f1.write(str(i + 1) + '\n')
                    elif r.status_code == 410:
                        # resource gone
                        self.is_del = True
                        logger.info('※※※资源丢失※※※')
                        self.delLogger(logger)
                        break
                    else:
                        self.delLogger(logger)
                        return

                except Exception, e:
                    # network / proxy failure: rotate the proxy and retry
                    logger.error('查看回答数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(3)
                    continue

                else:
                    # users with zero answers are recorded too
                    if answer_count == 0:
                        logger.warning('用户没有回答!')
                        self.delLogger(logger)
                        data_plus = {'user_id': self.user_id}
                        self.mongo.db.User_Answer.insert(data_plus)
                        break
                    else:
                        offset = 0
                        # page through the answers, 20 per request
                        while 1:
                            try:
                                soup = requests.get(answer_url.format(
                                    str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception, e:
                                logger.error('请求回答出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(3)
                                continue
                            else:
                                answer_data = soup.json()
                                # print(answer_data)
                                data = answer_data.get('data')
                                logger.info(
                                    'is_end?' +
                                    str(answer_data['paging']['is_end']))
                                if answer_data['paging']['is_end']:
                                    # last page: save it, then leave the loop.
                                    # NOTE(review): the inner "for i" shadows
                                    # the outer user index i.
                                    answer_list = []
                                    for i in range(0, len(data)):
                                        # answer creation time
                                        created_time = data[i]['created_time']
                                        # last update time
                                        updated_time = data[i]['updated_time']
                                        # upvote count (int)
                                        vote_count = data[i]['voteup_count']
                                        # answer id (int)
                                        answer_id = data[i]['id']
                                        # answer body text
                                        answer_content = data[i]['content']
                                        # number of comments
                                        comment_count = data[i][
                                            'comment_count']
                                        # raw author json
                                        author_json = data[i]['author']
                                        # author's url_token
                                        author_url = data[i]['author'][
                                            'url_token']

                                        # rewrite the API urls into page urls so
                                        # the question page can be scraped for
                                        # its topics and answer count
                                        answer_herf = data[i]['url']
                                        question_url = data[i]['question'][
                                            'url']
                                        question_url = question_url.replace(
                                            'questions', 'question')
                                        answer_herf = answer_herf.replace(
                                            'answers', 'answer')

                                        r1 = requests.get(
                                            question_url,
                                            headers=self.headers,
                                            timeout=5,
                                            proxies=self.current_proxy)
                                        self.content1 = BeautifulSoup(
                                            r1.content, "lxml")
                                        soup1 = self.content1
                                        # topics of the question
                                        self.topic_list = []
                                        items = soup1.find_all(
                                            'div', class_='Tag QuestionTopic')
                                        for item in items:
                                            self.topic_list.append(
                                                item.get_text())

                                        # answer count of the question (the
                                        # header is absent when there is none)
                                        if soup1.find(
                                                'h4',
                                            {'class', 'List-headerText'
                                             }) == None:
                                            self.answer_num = 0
                                        else:
                                            temp = soup1.find(
                                                'h4',
                                                {'class', 'List-headerText'
                                                 }).get_text().replace(
                                                     ',', '')

                                            self.answer_num = int(
                                                re.search(r'^\d+',
                                                          temp).group())

                                        info = {
                                            "belong_topics": self.topic_list,
                                            "answer_num": self.answer_num,
                                            "created_time": created_time,
                                            "updated_time": updated_time,
                                            "vote_count": vote_count,
                                            "answer_id": answer_id,
                                            "answer_content": answer_content,
                                            "comment_count": comment_count,
                                            "author_json": author_json,
                                            "author_url": author_url,
                                            "answer_url": answer_herf,
                                            "question_url": question_url
                                        }
                                        answer_list.append(info)
                                    data_plus = {
                                        "question_id":
                                        data[i]['question']['id'],
                                        "answer_id": data[i]['id'],
                                        'user_id': self.user_id,
                                        # "topics_count": topics_count,
                                        "answer": answer_list
                                    }

                                    self.mongo.db.User_Answer.insert(data_plus)

                                    logger.info('已获得所有用户回答!')
                                    logger.info('成功保存数据!')
                                    self.delLogger(logger)
                                    break
                                else:
                                    # intermediate page: save it and advance
                                    offset = offset + 20
                                    answer_list = []
                                    for i in range(0, len(data)):
                                        # answer creation time
                                        created_time = data[i]['created_time']
                                        # last update time
                                        updated_time = data[i]['updated_time']
                                        # upvote count (int)
                                        vote_count = data[i]['voteup_count']
                                        # answer id (int)
                                        answer_id = data[i]['id']
                                        # answer body text
                                        answer_content = data[i]['content']
                                        # number of comments
                                        comment_count = data[i][
                                            'comment_count']
                                        # raw author json
                                        author_json = data[i]['author']
                                        # author's url_token
                                        author_url = data[i]['author'][
                                            'url_token']
                                        # rewrite the API urls into page urls so
                                        # the question page can be scraped for
                                        # its topics and answer count
                                        answer_herf = data[i]['url']
                                        question_url = data[i]['question'][
                                            'url']
                                        answer_herf = answer_herf.replace(
                                            'answers', 'answer')
                                        question_url = question_url.replace(
                                            'questions', 'question')
                                        try:
                                            r1 = requests.get(
                                                question_url,
                                                headers=self.headers,
                                                timeout=5,
                                                proxies=self.current_proxy)
                                            time.sleep(3)
                                            logger.info('请求状态码' +
                                                        str(r1.status_code))
                                        except Exception, e:
                                            logger.error('请求回答出错!' + str(e))
                                            self.current_proxy = get_IP()
                                            logger.warning('切换ip代理!中断3秒!')
                                            time.sleep(3)
                                            continue

                                        self.content1 = BeautifulSoup(
                                            r1.content, "lxml")
                                        soup1 = self.content1
                                        # topics of the question
                                        self.topic_list = []
                                        items = soup1.find_all(
                                            'div', class_='Tag QuestionTopic')
                                        for item in items:
                                            self.topic_list.append(
                                                item.get_text())

                                        # answer count of the question (the
                                        # header is absent when there is none)
                                        if soup1.find(
                                                'h4',
                                            {'class', 'List-headerText'
                                             }) == None:
                                            self.answer_num = 0
                                        else:
                                            temp = soup1.find(
                                                'h4',
                                                {'class', 'List-headerText'
                                                 }).get_text().replace(
                                                     ',', '')

                                            self.answer_num = int(
                                                re.search(r'^\d+',
                                                          temp).group())
                                        info = {
                                            "belong_topics": self.topic_list,
                                            "answer_num": self.answer_num,
                                            "created_time": created_time,
                                            "updated_time": updated_time,
                                            "vote_count": vote_count,
                                            "answer_id": answer_id,
                                            "answer_content": answer_content,
                                            "comment_count": comment_count,
                                            "author_json": author_json,
                                            "author_url": author_url,
                                            "answer_url": answer_herf,
                                            "question_url": question_url
                                        }
                                        answer_list.append(info)
                                    data_plus = {
                                        "question_id":
                                        data[i]['question']['id'],
                                        "answer_id": data[i]['id'],
                                        'user_id': self.user_id,
                                        # "topics_count": topics_count,
                                        "answer": answer_list
                                    }

                                    self.mongo.db.User_Answer.insert(data_plus)

                        self.mongo.client.close()
                        break
    def get_comment(self):
        """Crawl the comments of every answer in self.answerID_list[start:end].

        For each answer id the comments API is probed once to read the total
        comment count, then comments are paged 20 at a time (order=reverse).
        Only comments created after epoch 1500566400 are kept; paging stops
        early as soon as an older comment is seen.  Each page is stored in
        the answer_comments mongo collection; answers with zero comments get
        a placeholder document.  On HTTP 401 a spare cookie is consumed and
        the answer index is appended to a "loseuser" file for re-crawling;
        when the spare cookies run out the method returns.
        """
        self.copycookies()
        # answer ids extracted from the two merged databases
        self.answerID_list = extract_answerID()
        print(len(self.answerID_list))  # size of the answer-id work list
        self.get_createpoint()
        self.current_proxy = get_IP()
        self.get_cookie()
        # run timestamp, digits only, used in the log file name
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            # BUG FIX: reset the per-answer flags.  self.state previously kept
            # its value from the previous answer, so once one answer stopped
            # early every later answer saved only its first page of comments
            # (get_followers resets these the same way at the top of its loop).
            self.state = False
            self.is_del = False
            # append a progress checkpoint so an interrupted run can resume
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.answer_id = self.answerID_list[i]
            logfielname = '/log/' + dt + 'answer_' + sys._getframe(
            ).f_code.co_name + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) + '项回答的评论').getlog()

            url = r'https://www.zhihu.com/api/v4/answers/' + str(
                self.answer_id
            ) + r'/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author&order=reverse&limit=20&offset={0}&status=open'
            comment_num = 0

            # first request: probe the total comment count for this answer
            while 1:
                try:
                    r = requests.get(url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 404:  # answer no longer exists
                        self.is_del = True
                        logger.info('!!!该回答被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 200:  # probe succeeded
                        j = r.json()
                        comment_num = j['paging']['totals']
                    elif r.status_code == 401:  # login cookie expired
                        logger.info('Cookie过期,正在更换')
                        # close the cookie file deterministically (the
                        # original leaked this handle)
                        with open('Cookies/answer_comments_cookies.txt', "r") as f:
                            Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            self.delLogger(logger)
                            return
                        else:
                            self.change_cookie()
                            # remember this index for a later re-crawl, since
                            # control falls through below with comment_num == 0
                            with open(
                                    'User/answer_comments_loseuser_' +
                                    str(self.fileNum) + '.txt', 'a+') as f1:
                                f1.write(str(i + 1) + '\n')
                    else:
                        self.delLogger(logger)
                        return

                except Exception as e:
                    # network / proxy failure: rotate the proxy and retry
                    logger.error('查看评论数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(5)
                    continue

                else:
                    # answers with zero comments are recorded too
                    if comment_num == 0:
                        logger.info('回答没有评论!')
                        self.delLogger(logger)
                        data_plus = {
                            'answer_id': self.answer_id,
                            "comment_num": 0
                        }
                        self.mongo.db.answer_comments.insert(data_plus)
                        break
                    else:
                        offset = 0
                        # page through the comments, 20 per request
                        while 1:
                            try:
                                soup = requests.get(url.format(str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception as e:
                                logger.error('请求评论出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(5)
                                continue
                            else:
                                comments_data = soup.json()
                                data = comments_data['data']
                                logger.info(
                                    'is_end?' +
                                    str(comments_data['paging']['is_end']))
                                if comments_data['paging']['is_end']:
                                    # last page: keep the fresh comments only
                                    comment_list = []
                                    # k avoids shadowing the outer answer index i
                                    for k in range(0, len(data)):
                                        commenter_id = data[k]['author'][
                                            'member']['url_token']  # commenter id
                                        comment_info = data[k]  # full raw json
                                        info = {
                                            "commenter_id": commenter_id,
                                            "comment_info": comment_info
                                        }
                                        # drop comments at/before the cutoff
                                        if comment_info[
                                                'created_time'] > 1500566400:
                                            comment_list.append(info)
                                        else:
                                            break

                                    data_plus = {
                                        'answer_id': self.answer_id,
                                        "comment_num": comment_num,
                                        "comments": comment_list
                                    }

                                    self.mongo.db.answer_comments.insert(
                                        data_plus)

                                    logger.info('已获得该回答下所有评论!')
                                    break
                                else:
                                    # intermediate page: save it and advance
                                    offset = offset + 20
                                    comment_list = []
                                    for k in range(0, len(data)):
                                        commenter_id = data[k]['author'][
                                            'member']['url_token']
                                        comment_info = data[k]
                                        info = {
                                            "commenter_id": commenter_id,
                                            "comment_info": comment_info
                                        }
                                        if comment_info[
                                                'created_time'] > 1500566400:
                                            comment_list.append(info)
                                        else:
                                            # older comment reached: stop
                                            # paging after this page is saved
                                            self.state = True
                                            break

                                    data_plus = {
                                        'answer_id': self.answer_id,
                                        "comment_num": comment_num,
                                        "comments": comment_list
                                    }
                                    self.mongo.db.answer_comments.insert(
                                        data_plus)
                                    if self.state:
                                        self.delLogger(logger)
                                        break
                        logger.info('所有数据成功保存!')
                        self.delLogger(logger)
                        break
Beispiel #13
0
    def get_followers(self):
        self.copycookies()
        self.get_createpoint()
        self.questionUrl_list = extract_questionUrl()
        self.current_proxy = get_IP()
        self.get_cookie()
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.is_del = False
            self.state = False
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.question_url = self.questionUrl_list[i]
            self.id = self.question_url.replace('/question/', '')
            logfielname = '/log/' + dt + sys._getframe(
            ).f_code.co_name + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) + '项问题的关注者').getlog()

            followers_url = u'https://www.zhihu.com/api/v4/questions/' + str(
                self.id
            ) + u'/followers?include=data[*].gender,answer_count,articles_count,follower_count,is_following,is_followed&limit=10&offset={0}'
            follower_num = 0
            self.follower_id_list = extract_question_followers(self.id)
            while 1:
                try:
                    r = requests.get(followers_url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 404:
                        self.is_del = True
                        logger.info('!!!该问题被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 200:
                        j = r.json()
                        follower_num = j['paging']['totals']
                    elif r.status_code == 401:
                        logger.info('Cookie过期,正在更换')
                        f = open('Cookies/question_followers_cookies.txt', "r")
                        Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            return
                        else:
                            self.change_cookie()
                            with open(
                                    'User/question_followers_loseuser_' +
                                    str(self.fileNum) + '.txt', 'a+') as f1:
                                f1.write(str(i + 1) + '\n')
                    else:
                        return
                except Exception, e:
                    logger.error('获取关注者数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(3)
                    continue
                else:
                    # 没有关注者的问题也要保存一下
                    if follower_num == 0:
                        logger.warning('问题没有关注者!')
                        data_plus = {'question_id': self.id, "follower_num": 0}
                        self.mongo.db.question_followers.insert(data_plus)
                        self.delLogger(logger)
                        break
                    else:
                        offset = 0
                        while 1:
                            try:
                                soup = requests.get(followers_url.format(
                                    str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception, e:
                                logger.error('请求关注者出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(3)
                                continue
                            else:
                                followers_data = soup.json()
                                data = followers_data.get('data')
                                #print 'is_end:' + str(followers_data['paging']['is_end'])
                                logger.info(
                                    'is_end?' +
                                    str(followers_data['paging']['is_end']))

                                if followers_data['paging']['is_end']:
                                    follower_list = []
                                    for i in range(0, len(data)):
                                        follower_url = data[i][
                                            'url_token']  #用户ID
                                        follower_info = data[i]  #全部信息

                                        info = {
                                            "follower_url": follower_url,
                                            "follower_info": follower_info
                                        }
                                        if follower_url in self.follower_id_list:
                                            break
                                        follower_list.append(info)
                                    data_plus = {
                                        "question_id": self.id,
                                        "follower_num": follower_num,
                                        "followers": follower_list
                                    }
                                    self.mongo.db.question_followers.insert(
                                        data_plus)
                                    logger.info('已经获得所有新增关注者!')
                                    logger.info('成功保存数据!')
                                    self.delLogger(logger)
                                    break
                                else:
                                    offset = offset + 10
                                    follower_list = []
                                    for i in range(0, len(data)):
                                        follower_url = data[i][
                                            'url_token']  #用户ID
                                        follower_info = data[i]  #全部信息
                                        info = {
                                            "follower_url": follower_url,
                                            "follower_info": follower_info
                                        }
                                        if follower_url in self.follower_id_list:
                                            self.state = True
                                            break
                                        follower_list.append(info)
                                    data_plus = {
                                        "question_id": self.id,
                                        "follower_num": follower_num,
                                        "followers": follower_list
                                    }
                                self.mongo.db.question_followers.insert(
                                    data_plus)
                                if self.state:
                                    self.delLogger(logger)
                                    break

                        self.delLogger(logger)
                        break
Beispiel #14
0
    def get_Asks(self):
        self.copycookies()
        self.get_createpoint()
        items = self.mongo.db.answerers_1.find()
        for item in items:
            self.userID_list.append(item.get('user_id'))
        self.current_proxy = get_IP()
        self.get_cookie()
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.user_id = self.userID_list[i]
            logfielname = '/log/' + dt + '_user_Asks' + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) +
                            '个用户的提问信息').getlog()
            #https://www.zhihu.com/api/v4/members/cang-hai-qing-yue/questions?include=data%5B*%5D.created%2Canswer_count%2Cfollower_count%2Cauthor%2Cadmin_closed_comment&offset={0}&limit=20
            asks_url = 'https://www.zhihu.com/api/v4/members/' + str(
                self.user_id
            ) + '/questions?include=data%5B*%5D.created%2Canswer_count%2Cfollower_count%2Cauthor%2Cadmin_closed_comment&offset={0}&limit=20'
            asks_count = 0

            while 1:
                try:
                    r = requests.get(asks_url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 200:
                        j = r.json()
                        asks_count = j['paging']['totals']
                    elif r.status_code == 404:
                        self.is_del = True
                        logger.info('!!!该用户被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 401:
                        logger.info('Cookie过期,正在更换')
                        f = open('Cookies/user_asks_cookies.txt', "r")
                        Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            self.delLogger(logger)
                            return
                        else:
                            self.change_cookie()
                    elif r.status_code == 410:
                        self.is_del = True
                        logger.info('※※※资源丢失※※※')
                        self.delLogger(logger)
                        break
                    else:
                        self.delLogger(logger)
                        return

                except Exception, e:
                    logger.error('查看提问数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(3)
                    continue

                else:
                    # 没有提问的用户也要保存一下
                    if asks_count == 0:
                        logger.warning('用户没有提问问题!')
                        self.delLogger(logger)
                        data_plus = {'user_id': self.user_id, "asks_count": 0}
                        self.mongo.db.User_Asks.insert(data_plus)
                        break
                    else:
                        offset = 0
                        while 1:
                            try:
                                soup = requests.get(asks_url.format(
                                    str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception, e:
                                logger.error('请求用户提问出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(3)
                                continue
                            else:
                                asks_data = soup.json()
                                data = asks_data.get('data')
                                logger.info('is_end?' +
                                            str(asks_data['paging']['is_end']))
                                if asks_data['paging']['is_end']:
                                    ask_list = []
                                    for i in range(0, len(data)):
                                        info = {
                                            "question_id":
                                            data[i]['id'],
                                            "content":
                                            data[i]['title'],
                                            "answer_count":
                                            data[i]['answer_count'],
                                            "follower_count":
                                            data[i]['follower_count'],
                                            "create_time":
                                            data[i]['created'],
                                            "update_time":
                                            data[i]['updated_time'],
                                            "href":
                                            data[i]['url']
                                        }
                                        ask_list.append(info)
                                    data_plus = {
                                        'user_id': self.user_id,
                                        "asks_count": asks_count,
                                        "ask_list": ask_list
                                    }
                                    self.mongo.db.User_Asks.insert(data_plus)
                                    logger.info('已获得所有用户关注话题!')
                                    logger.info('成功保存数据!')
                                    self.delLogger(logger)
                                    break
                                else:
                                    offset = offset + 20
                                    ask_list = []
                                    for i in range(0, len(data)):
                                        info = {
                                            "question_id":
                                            data[i]['id'],
                                            "content":
                                            data[i]['title'],
                                            "answer_count":
                                            data[i]['answer_count'],
                                            "follower_count":
                                            data[i]['follower_count'],
                                            "create_time":
                                            data[i]['created'],
                                            "update_time":
                                            data[i]['updated_time'],
                                            "href":
                                            data[i]['url']
                                        }
                                        ask_list.append(info)
                                    data_plus = {
                                        'user_id': self.user_id,
                                        "asks_count": asks_count,
                                        "ask_list": ask_list
                                    }
                                    self.mongo.db.User_Asks.insert(data_plus)

                        self.mongo.client.close()
                        break
Beispiel #15
0
    def get_answers(self):
        self.copycookies()
        self.get_createpoint()
        self.questionUrl_list = extract_questionUrl()
        self.current_proxy = get_IP()
        self.get_cookie()
        dt = re.sub(r'[^0-9]', '',
                    str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
        for i in xrange(self.start, self.end):
            self.is_del = False
            self.file.seek(0, 2)
            dt1 = re.sub(r'[^0-9]', '',
                         str(datetime.datetime.now().strftime('%Y-%m-%d')))
            News = self.type + ',' + str(i + 1) + ',' + str(
                self.end) + ',' + str(dt1) + '\n'
            self.file.write(News)
            self.question_url = self.questionUrl_list[i]
            self.id = self.question_url.replace('/question/', '')
            logfielname = '/log/' + dt + sys._getframe(
            ).f_code.co_name + '.log'
            logger = Logger(logfilename=logfielname,
                            logname='正在爬取第' + str(i + 1) + '项问题的回答').getlog()
            answer_url = 'https://www.zhihu.com/api/v4/questions/' + str(
                self.id
            ) + '/answers?include=data%5B*%5D.is_normal%2Cis_collapsed%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
            answer_number = 0
            while 1:
                try:
                    r = requests.get(answer_url,
                                     headers=self.headers,
                                     timeout=5,
                                     proxies=self.current_proxy)
                    time.sleep(3)
                    logger.info('第一次请求状态码' + str(r.status_code))
                    if r.status_code == 404:
                        self.is_del = True
                        logger.info('!!!该问题被删!!!')
                        self.delLogger(logger)
                        break
                    elif r.status_code == 200:
                        j = r.json()
                        answer_number = j['paging']['totals']
                    elif r.status_code == 401:
                        logger.info('Cookie过期,正在更换')
                        f = open('Cookies/question_answers_cookies.txt', "r")
                        Lines = f.readlines()
                        if len(Lines) == 0:
                            logger.info('备用Cookies用完!')
                            self.delLogger(logger)
                            return
                        else:
                            self.change_cookie()
                            with open(
                                    'User/question_answers_loseuser_' +
                                    str(self.fileNum) + '.txt', 'a+') as f1:
                                f1.write(str(i + 1) + '\n')
                    else:
                        self.delLogger(logger)
                        return
                except Exception, e:
                    logger.error('查看回答数出错!' + str(e))
                    self.current_proxy = get_IP()
                    logger.warning('切换ip代理!中断3秒!')
                    time.sleep(3)
                else:
                    # 没有回答的问题也要保存一下
                    if answer_number == 0:
                        logger.warning('该问题没有回答!')
                        self.delLogger(logger)
                        data_plus = {'question_id': self.id, "answer_num": 0}
                        self.mongo.db.question_answers.insert(data_plus)
                        break
                    else:
                        offset = 0
                        while 1:
                            try:
                                soup = requests.get(answer_url.format(
                                    str(offset)),
                                                    headers=self.headers,
                                                    timeout=5,
                                                    proxies=self.current_proxy)
                                time.sleep(3)
                                logger.info('请求状态码' + str(soup.status_code))
                            except Exception, e:
                                logger.error('请求回答出错!' + str(e))
                                self.current_proxy = get_IP()
                                logger.warning('切换ip代理!中断3秒!')
                                time.sleep(3)
                                continue
                            else:
                                answer_data = soup.json()
                                answer_info = answer_data['data']
                                if answer_data['paging']['is_end']:
                                    answer_list = []
                                    for i in range(0, len(answer_info)):
                                        #回答时间
                                        created_time = answer_info[i][
                                            'created_time']
                                        #更新时间
                                        updated_time = answer_info[i][
                                            'updated_time']
                                        #回答的点赞数 int
                                        vote_count = answer_info[i][
                                            'voteup_count']
                                        #回答id int
                                        answer_id = answer_info[i]['id']
                                        #回答文本
                                        answer_content = answer_info[i][
                                            'content']
                                        #评论数
                                        comment_count = answer_info[i][
                                            'comment_count']
                                        #回答者
                                        author_json = answer_info[i]['author']
                                        #回答者url_token
                                        author_url = answer_info[i]['author'][
                                            'url_token']
                                        data = {
                                            "created_time": created_time,
                                            "updated_time": updated_time,
                                            "vote_count": vote_count,
                                            "answer_id": answer_id,
                                            "answer_content": answer_content,
                                            "comment_count": comment_count,
                                            "author_json": author_json,
                                            "author_url": author_url,
                                        }
                                        if updated_time > 1500393600:
                                            answer_list.append(data)
                                        else:
                                            break
                                    data_plus = {
                                        "question_id": self.id,
                                        "answer_num": answer_number,
                                        "answers": answer_list
                                    }

                                    self.mongo.db.question_answers.insert(
                                        data_plus)
                                    logger.info('已获得该问题下所有回答!')
                                    break
                                else:
                                    offset = offset + 20
                                    answer_list = []
                                    for i in range(0, 20):
                                        #回答时间
                                        created_time = answer_info[i][
                                            'created_time']
                                        #更新时间
                                        updated_time = answer_info[i][
                                            'updated_time']
                                        #回答的点赞数 int
                                        vote_count = answer_info[i][
                                            'voteup_count']
                                        #回答id int
                                        answer_id = answer_info[i]['id']
                                        #回答文本
                                        answer_content = answer_info[i][
                                            'content']
                                        #评论数
                                        comment_count = answer_info[i][
                                            'comment_count']
                                        #回答者
                                        author_json = answer_info[i]['author']
                                        #回答者url_token
                                        author_url = answer_info[i]['author'][
                                            'url_token']
                                        data = {
                                            "created_time": created_time,
                                            "updated_time": updated_time,
                                            "vote_count": vote_count,
                                            "answer_id": answer_id,
                                            "answer_content": answer_content,
                                            "comment_count": comment_count,
                                            "author_json": author_json,
                                            "author_url": author_url,
                                        }
                                        if updated_time > 1500393600:
                                            answer_list.append(data)
                                        else:
                                            self.state = True
                                            break
                                    data_plus = {
                                        "question_id": self.id,
                                        "answer_num": answer_number,
                                        "answers": answer_list
                                    }
                                    self.mongo.db.question_answers.insert(
                                        data_plus)
                                    if self.state:
                                        self.delLogger(logger)
                                        break

                        logger.info('成功保存数据!')
                        self.delLogger(logger)
                        break