Example #1
def __init__(self, path, failPath):
    self.path = path
    self.failPath = failPath
    self.failId = []
    self.helptool = HelpTool()
    # Instantiate the crawler helper and the database connection utility
    self.db_helper = DbHelper()
    self.login = GetCookies()
    # self.login = Login()
    self.start_time = datetime.datetime.now()
    self.end_time = datetime.datetime.now()
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
    }
Example #2
class storeData:
    def __init__(self):
        # Instantiate the crawler helper and the database connection utility
        self.db_helper = DbHelper()
        self.helptool = HelpTool()

    def readPersonUrl(self, path):
        data = self.db_helper.queryPersonUrl()
        # Release database resources
        self.db_helper.close_db()
        if data:
            self.helptool.storeFailData(path, data)
            print('Stored successfully!')
        else:
            print('Store failed')
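
A minimal usage sketch for storeData: it queries person URLs from the database and writes them to a text file through HelpTool.storeFailData. The file name below is hypothetical, not taken from the source.

# Hypothetical driver; 'person_urls.txt' is an assumed output path.
if __name__ == '__main__':
    store = storeData()
    store.readPersonUrl('person_urls.txt')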
Example #3
import datetime
import re
import sys

import requests
from lxml import etree

# Project-local modules; their exact module paths are not shown in the original.
# from db_helper import DbHelper
# from help_tool import HelpTool
# from get_cookies import GetCookies
# import Entity
# import constants
# import Utils


class personInfo:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, personUrl, html, name):
        personalInfo = Entity.personalInfo.copy()
        tmp = personUrl.split('/')
        if tmp[-1] == '':
            personalInfo['pid'] = tmp[-2]
        else:
            personalInfo['pid'] = tmp[-1]
        personalInfo['name'] = name[0].strip()
        personalInfo['personUrl'] = personUrl
        try:
            personalInfo['register_time'] = html.xpath(
                '//*[@id="profile"]/div/div[2]/div[1]/div/div/text()'
            )[1].strip()[:-2]
        except Exception:
            pass
        try:
            personalInfo['location'] = html.xpath(
                '//*[@id="profile"]/div/div[2]/div[1]/div/a/text()')[0].strip()
        except Exception:
            pass
        try:
            personalInfo['introduction'] = ''.join(
                html.xpath('//*[@id="intro_display"]/text()'))
        except Exception:
            pass
        try:
            temp = html.xpath('//*[@id="friend"]/h2/span/a/text()')[0].strip()
            personalInfo['follow_num'] = re.search(r'(\d+)', temp).group()
        except Exception:
            pass
        try:
            personalInfo['follow_url'] = html.xpath(
                '//*[@id="friend"]/h2/span/a/@href')[0]
        except Exception:
            pass
        try:
            temps = html.xpath('//*[@id="movie"]/h2/span/a')
            # e.g. ['https://movie.douban.com/people/153843683/do', 'https://movie.douban.com/people/153843683/wish', 'https://movie.douban.com/people/153843683/collect']
            for temp in temps:
                result = temp.xpath('@href')[0]
                num = temp.xpath('text()')[0]
                num1 = re.search(r'(\d+)', num).group()
                tmp = result.split('/')
                url = ''
                if tmp[-1] == '':
                    url = tmp[-2]
                else:
                    url = tmp[-1]
                if url == 'do':
                    personalInfo['do'] = result
                    personalInfo['do_num'] = num1
                elif url == 'wish':
                    personalInfo['wish'] = result
                    personalInfo['wish_num'] = num1
                elif url == 'collect':
                    personalInfo['collect'] = result
                    personalInfo['collect_num'] = num1
        except Exception:
            pass
        return personalInfo

    def requestInfo(self, cookies, personUrl, peopleIndex, retryTime):
        sys.stdout.flush()
        r = requests.get(personUrl, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(personUrl)
            print('---------- Failed to crawl user #{} at {} ----------'.format(
                str(peopleIndex), personUrl))
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        # Log the id we have reached
        print('Crawling the profile of user #{} at {}'.format(str(peopleIndex), personUrl))
        html = etree.HTML(r.text)
        name = html.xpath(
            '//*[@id="profile"]/div/div[2]/div[1]/div/div/text()[1]')
        if not name:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Interval before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to get cookies, exiting!')
                return 1
            return self.requestInfo(cookies, personUrl, peopleIndex, retryTime)
        else:
            personalInfo = self.parsePage(personUrl, html, name)
            # Douban data is valid; write it to the database
            if personalInfo:
                self.db_helper.insert_personalInfo(personalInfo)
                print('Inserted profile of the user at {}!'.format(personUrl))
            return 0

    def end(self):
        # Save the urls that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release database resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    # Refresh cookies periodically while crawling each user's profile

    def spider(self):
        peopleIndex = 1
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # Open the url list file
            for personUrl in f:
                personUrl = personUrl.rstrip('\n')
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                if not cookies:
                    print('Failed to get cookies, exiting!')
                    print(personUrl)
                    break
                sys.stdout.flush()
                flag = self.requestInfo(cookies, personUrl, peopleIndex, 1)
                if flag == 1:
                    print(personUrl)
                    break
                peopleIndex += 1
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
        self.end()
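
The classes in these examples lean on a project-local constants module and a Utils.delay helper that are not shown. A minimal sketch of what they would need to contain; every concrete value here is an assumption, not taken from the source.

# constants.py -- sketch only; all values are assumptions.
URL_PREFIX = 'https://movie.douban.com/subject/'  # assumed movie detail url prefix
DELAY_MIN_SECOND = 2   # assumed lower bound of the random crawl delay
DELAY_MAX_SECOND = 5   # assumed upper bound of the random crawl delay
MAX_RETRY_TIMES = 3    # assumed retry cap when a page parses empty
MAX_URL_TIMES = 50     # assumed request count before cookies are refreshed

# Utils.py -- sketch of the delay helper, assuming it sleeps a random interval.
import random
import time

def delay(min_second, max_second):
    time.sleep(random.uniform(min_second, max_second))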
Example #4
class FollowPersonUrl:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, result, personUrl):
        followPersonUrls = []
        for item in result:
            try:
                followPersonUrl = Entity.followPersonUrl.copy()
                followName = item.xpath('dd/a/text()')[0].strip()
                # Skip deactivated accounts ("已注销" marks a deleted account)
                if followName == "[已注销]" or followName == "已注销":
                    continue
                tmp = personUrl.split('/')
                if tmp[-1] == '':
                    followPersonUrl['originalId'] = tmp[-3]
                else:
                    followPersonUrl['originalId'] = tmp[-2]
                url = item.xpath('dd/a/@href')[0].strip()
                followPersonUrl['followUrl'] = url
                tmp = url.split('/')
                if tmp[-1] == '':
                    followPersonUrl['followId'] = tmp[-2]
                else:
                    followPersonUrl['followId'] = tmp[-1]
                followPersonUrls.append(followPersonUrl)
            except Exception:
                pass
        return followPersonUrls

    def request_personfollowurl(self, cookies, personUrl, peopleIndex,
                                retryTime):
        sys.stdout.flush()
        r = requests.get(personUrl, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(personUrl)
            print('---------- Failed to crawl followee links of user #{} at {} ----------'.format(
                str(peopleIndex), personUrl))
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        # Log the id we have reached
        print('Crawling followee links of user #{} at {}'.format(str(peopleIndex), personUrl))
        html = etree.HTML(r.text)
        result = html.xpath('//dl[@class="obu"]')
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Interval before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to get cookies, exiting!')
                return 1
            return self.request_personfollowurl(cookies, personUrl,
                                                peopleIndex, retryTime)
        else:
            followPersonUrls = self.parsePage(result, personUrl)
            # Douban data is valid; write it to the database
            if followPersonUrls:
                self.db_helper.insert_followPersonUrl(followPersonUrls)
                print('Inserted followee links of the user at {}!'.format(personUrl))
            return 0

    def end(self):
        # Save the urls that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release database resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        peopleIndex = 1
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # Open the url list file
            for personUrl in f:
                personUrl = personUrl.rstrip('\n')
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                if not cookies:
                    print('Failed to get cookies, exiting!')
                    print(personUrl)
                    break
                sys.stdout.flush()
                flag = self.request_personfollowurl(cookies, personUrl,
                                                    peopleIndex, 1)
                if flag == 1:
                    print(personUrl)
                    break
                peopleIndex += 1
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
        self.end()
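
The parsePage methods above all repeat the same trailing-slash check to pull an id out of a Douban url. A small helper, hypothetical and not part of the original code, captures the pattern:

# Hypothetical helper equivalent to the repeated split('/') logic above.
def extract_id(url, offset=1):
    parts = url.rstrip('/').split('/')
    # offset=1 -> last path segment, offset=2 -> second-to-last, and so on
    return parts[-offset]

# e.g. extract_id('https://www.douban.com/people/153843683/') == '153843683'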
Example #5
class Movie:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }
        # Parser for movie detail pages
        self.movie_parser = MovieParser()

    def request_movie(self, cookies, mid, movieIndex, retryTime):
        sys.stdout.flush()
        r = requests.get(constants.URL_PREFIX + mid,
                         headers=self.headers,
                         cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(mid)
            print('---------- Failed to crawl movie #{} with id {} ----------'.format(
                str(movieIndex), mid))
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        print('Crawling movie #{} with id {}'.format(str(movieIndex), mid))
        movie = self.movie_parser.extract_movie_info(r)
        if not movie:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Interval before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to get cookies, exiting!')
                return 1
            return self.request_movie(cookies, mid, movieIndex, retryTime)
        else:
            # Douban data is valid; write it to the database
            movie['douban_id'] = mid
            self.db_helper.insert_movie(movie)
            print('---------- Movie id ' + mid + ': crawled successfully ----------')
            return 0

    def end(self):
        # Save the movie ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release database resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        movieIndex = 1
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # Open the id list file
            for mid in f:
                mid = mid.rstrip('\n')
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                if not cookies:
                    print('Failed to get cookies, exiting!')
                    print(mid)
                    break
                sys.stdout.flush()
                flag = self.request_movie(cookies, mid, movieIndex, 1)
                if flag == 1:
                    print(mid)
                    break
                movieIndex += 1
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
        self.end()
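
Each spider class in these examples shares the same driver shape: construct it with an input list path and a failure-log path, then call spider(). A hypothetical entry point (both file names are assumptions):

# Hypothetical entry point; the file paths are assumed, not from the source.
if __name__ == '__main__':
    movie_spider = Movie('movie_ids.txt', 'failed_movie_ids.txt')
    movie_spider.spider()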
Example #6
class DoMovie:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, pid, do_url, result, html):
        ids = 1
        doComments = []
        star = 'allstar{}0 rating'
        for item in result:
            movieComment = Entity.movieComment.copy()
            movieComment['people_id'] = pid
            movieComment['comment_url'] = do_url
            movieComment['comment_id'] = str(
                datetime.datetime.now().timestamp()).replace('.',
                                                             '') + str(ids)
            ids += 1
            tmp = item.xpath(
                'div[@class="info"]/ul/li[@class="title"]/a/@href')
            if not tmp:
                continue
            else:
                tmp1 = tmp[0].strip().split('/')
                if tmp1[-1] == '':
                    douban_id = tmp1[-2]
                else:
                    douban_id = tmp1[-1]
                movieComment['douban_id'] = douban_id
            try:
                tmp = item.xpath(
                    'div[@class="info"]/ul/li[3]/span[1]/@class')[0].strip()
                tmp1 = re.search(r'(\d+)', tmp).group()
                movieComment['star'] = star.format(tmp1)
            except Exception:
                pass
            try:
                movieComment['time'] = item.xpath(
                    'div[@class="info"]/ul/li[3]/span[@class="date"]/text()'
                )[0].strip()
            except Exception:
                pass
            try:
                movieComment['content'] = item.xpath(
                    'div[@class="info"]/ul/li[4]/span[@class="comment"]/text()'
                )[0].strip()
            except Exception:
                pass
            # try:
            #     tmp = item.xpath('div[@class="info"]/ul/li[4]/span[@class="p1"]/text()')[0].strip()
            #     movieComment['useful_num'] = re.search('(\d+)', tmp).group()
            # except Exception:
            #     pass
            doComments.append(movieComment)
        nextUrl = html.xpath(
            '//div[@class="paginator"]/span[@class="next"]/a[1]/@href')
        return doComments, nextUrl

    def requestDo(self, cookies, pid, pageIndex, base_url, nextUrl, retryTime):
        sys.stdout.flush()
        if nextUrl:
            do_url = base_url + nextUrl
        else:
            do_url = base_url
        r = requests.get(do_url, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(pid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="item"]')
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Interval before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to get the session, exiting!')
                return 1
            return self.requestDo(cookies, pid, pageIndex, base_url, nextUrl,
                                  retryTime)
        else:
            doComments, nextUrl = self.parsePage(pid, do_url, result, html)
            # Douban data is valid; write it to the database
            if doComments:
                self.db_helper.insert_movieComments(doComments)
                print('Inserted page {} of short comments!'.format(pageIndex))
                pageIndex += 1
            if nextUrl:
                base_url = 'https://movie.douban.com'
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
                return self.requestDo(cookies, pid, pageIndex, base_url,
                                      nextUrl[0], 1)
            return 0

    def end(self):
        # Save the people ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release database resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # Open the url list file
            for doUrl in f:
                pid = ''
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                if not cookies:
                    print('Failed to get the session, exiting!')
                    print(doUrl)
                    break
                doUrl = doUrl.strip()
                tmp = doUrl.split('/')
                if tmp[-1] == '':
                    pid = tmp[-3]
                else:
                    pid = tmp[-2]
                # Log the id we have reached
                print('Crawling the movies user id {} is currently watching!'.format(pid))
                sys.stdout.flush()
                flag = self.requestDo(cookies, pid, 1, doUrl, None, 1)
                if flag == 1:
                    print(pid)
                    break
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
        self.end()
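
requestDo follows pagination by calling itself once per result page, so a user with very many pages can run into Python's default recursion limit (roughly 1000 frames). The same loop written iteratively, as a sketch against the class above (retry and cookie-refresh handling omitted):

    # Sketch: iterative variant of the page-following logic in requestDo.
    def requestDoIterative(self, cookies, pid, base_url):
        pageIndex = 1
        nextUrl = None
        while True:
            do_url = base_url + nextUrl if nextUrl else base_url
            r = requests.get(do_url, headers=self.headers, cookies=cookies)
            r.encoding = 'utf-8'
            if r.status_code != 200:
                self.failId.append(pid)
                return 2
            html = etree.HTML(r.text)
            result = html.xpath('//div[@class="item"]')
            if not result:
                return 2  # retry/cookie refresh handling omitted in this sketch
            doComments, nexts = self.parsePage(pid, do_url, result, html)
            if doComments:
                self.db_helper.insert_movieComments(doComments)
                print('Inserted page {} of short comments!'.format(pageIndex))
                pageIndex += 1
            if not nexts:
                return 0
            nextUrl = nexts[0]
            base_url = 'https://movie.douban.com'
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)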
Example #7
class WishMovie:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}

    def parsePage(self, pid, result, html):
        wishMovies = []
        for item in result:
            wishMovie = Entity.wishMovie.copy()
            wishMovie['people_id'] = pid
            tmp = item.xpath('div[@class="info"]/ul/li[@class="title"]/a/@href')
            if not tmp:
                continue
            else:
                tmp1 = tmp[0].strip().split('/')
                if tmp1[-1] == '':
                    douban_id = tmp1[-2]
                else:
                    douban_id = tmp1[-1]
                wishMovie['douban_id'] = douban_id
            try:
                wishMovie['time'] = item.xpath('div[@class="info"]/ul/li[3]/span[@class="date"]/text()')[0].strip()
            except Exception:
                pass
            wishMovies.append(wishMovie)
        nextUrl = html.xpath('//div[@class="paginator"]/span[@class="next"]/a[1]/@href')
        return wishMovies, nextUrl

    def requestWish(self, cookies, pid, pageIndex, base_url, nextUrl, retryTime):
        sys.stdout.flush()
        if nextUrl:
            wish_url = base_url + nextUrl
        else:
            wish_url = base_url
        r = requests.get(
            wish_url,
            headers=self.headers,
            cookies=cookies
        )
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(pid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="item"]')
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Interval before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to get the session, exiting!')
                return 1
            return self.requestWish(cookies, pid, pageIndex, base_url, nextUrl, retryTime)
        else:
            wishMovies, nextUrl = self.parsePage(pid, result, html)
            # Douban data is valid; write it to the database
            if wishMovies:
                self.db_helper.insert_wishMovies(wishMovies)
                print('Inserted page {} of wish-list movies!'.format(pageIndex))
                pageIndex += 1
            if nextUrl:
                base_url = 'https://movie.douban.com'
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
                return self.requestWish(cookies, pid, pageIndex, base_url, nextUrl[0], 1)
            return 0

    def end(self):
        # Save the people ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release database resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # Open the url list file
            for wishUrl in f:
                pid = ''
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                if not cookies:
                    print('Failed to get the session, exiting!')
                    print(wishUrl)
                    break
                sys.stdout.flush()
                wishUrl = wishUrl.strip()
                tmp = wishUrl.split('/')
                if tmp[-1] == '':
                    pid = tmp[-3]
                else:
                    pid = tmp[-2]
                # Log the id we have reached
                print('Crawling the movies user id {} wishes to watch!'.format(pid))
                flag = self.requestWish(cookies, pid, 1, wishUrl, None, 1)
                if flag == 1:
                    print(pid)
                    break
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
        self.end()
Example #8
class Comment:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, mid, comment_url, result, html):
        movieComments = []
        # Extract the Douban data
        for item in result:
            movieComment = Entity.movieComment.copy()
            movieComment['douban_id'] = mid
            movieComment['comment_url'] = comment_url
            try:
                # Unique id of the short comment
                movieComment['comment_id'] = \
                    item.xpath(
                        'div[@class="comment"]/h3/span[@class="comment-vote"]/input/@value')[0].strip()
            except Exception:
                pass
            try:
                # Number of people who found the comment useful
                movieComment['useful_num'] = \
                    item.xpath(
                        'div[@class="comment"]/h3/span[@class="comment-vote"]/span/text()')[0].strip()
            except Exception:
                pass
            try:
                # Rating
                movieComment['star'] = \
                    item.xpath(
                        'div[@class="comment"]/h3/span[@class="comment-info"]/span[2]/@class')[0].strip()
            except Exception:
                pass
            try:
                # Comment timestamp
                movieComment['time'] = item.xpath(
                    'div[@class="comment"]/h3/span[@class="comment-info"]/span[@class="comment-time "]/@title'
                )[0]
            except Exception:
                pass
            try:
                # Comment text
                movieComment['content'] = item.xpath(
                    'div[@class="comment"]/p/span/text()')[0]
            except Exception:
                pass
            try:
                # Commenter's name (unique)
                movieComment['people'] = item.xpath(
                    'div[@class="avatar"]/a/@title')[0]
            except Exception:
                pass
            try:
                # Commenter's profile page
                url = item.xpath('div[@class="avatar"]/a/@href')[0].strip()
                tmp = url.split('/')
                if tmp[-1] == '':
                    movieComment['people_id'] = tmp[-2]
                else:
                    movieComment['people_id'] = tmp[-1]
                movieComment['people_url'] = item.xpath(
                    'div[@class="avatar"]/a/@href')[0]
            except Exception:
                pass
            movieComments.append(movieComment)
        nextUrl = html.xpath('//a[@class="next"]/@href')
        return movieComments, nextUrl

    def requestComment(self, cookies, mid, pageIndex, base_url, nextUrl,
                       retryTime):
        # headers = {'User-Agent': random.choice(constants.USER_AGENT)}
        # Fetch the Douban page data
        sys.stdout.flush()
        if nextUrl:
            comment_url = base_url + nextUrl
        else:
            comment_url = base_url
        r = requests.get(comment_url, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(mid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="comment-item"]')
        # If no data came back, delay to ease the load on the target server and skip.
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Session expired')
            end_time1 = datetime.datetime.now()
            print('Interval before session expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to get the session, exiting!')
                return 1
            return self.requestComment(cookies, mid, pageIndex, base_url,
                                       nextUrl, retryTime)
        else:
            movieComments, nextUrl = self.parsePage(mid, comment_url, result,
                                                    html)
            # Douban data is valid; write it to the database
            if movieComments:
                self.db_helper.insert_movieComments(movieComments)
                print('Inserted page {} of short comments!'.format(pageIndex))
                pageIndex += 1
            if nextUrl:
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
                return self.requestComment(cookies, mid, pageIndex, base_url,
                                           nextUrl[0], 1)
            return 0

    def end(self):
        # Save the movie ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release database resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    # Use a new user (fresh cookies) for each movie's review crawl
    def spider(self):
        with open(self.path, "r") as f:  # Open the id list file
            for mid in f:
                cookies = self.login.getCookie()
                if not cookies:
                    print('Failed to get the session, exiting!')
                    print(mid)
                    break
                sys.stdout.flush()
                mid = mid.rstrip('\n')
                base_url = constants.URL_PREFIX + mid + "/comments"
                # Log the id we have reached
                print('Crawling reviews of movie id {}!'.format(mid))
                flag = self.requestComment(cookies, mid, 1, base_url, None, 1)
                if flag == 1:
                    print(mid)
                    break
                Utils.delay(constants.DELAY_MIN_SECOND,
                            constants.DELAY_MAX_SECOND)
        self.end()
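
Entity.movieComment serves as a row template that every parser copies and fills. Reconstructing it from the keys the code above actually assigns gives the following sketch; the empty-string defaults are assumptions.

# Entity.py -- sketch reconstructed from the keys used above; defaults assumed.
movieComment = {
    'comment_id': '',   # unique id of the short comment
    'douban_id': '',    # movie the comment belongs to
    'people_id': '',    # commenter's id
    'people': '',       # commenter's display name
    'people_url': '',   # commenter's profile url
    'comment_url': '',  # page the comment was scraped from
    'star': '',         # rating class string, e.g. 'allstar50 rating'
    'time': '',         # comment timestamp
    'content': '',      # comment text
    'useful_num': '',   # votes marking the comment useful
}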
Example #9
def __init__(self):
    # Instantiate the crawler helper and the database connection utility
    self.db_helper = DbHelper()
    self.helptool = HelpTool()