Example #1
 def requestInfo(self, cookies, personUrl, peopleIndex, retryTime):
     sys.stdout.flush()
     r = requests.get(personUrl, headers=self.headers, cookies=cookies)
     r.encoding = 'utf-8'
     if r.status_code != 200:
         self.failId.append(personUrl)
         print('---------- Failed to crawl user #{} at {} ----------'.format(
             str(peopleIndex), personUrl))
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
         return 2
     # Log the id we have reached
     print('Crawling the profile of user #{} at {}'.format(
         str(peopleIndex), personUrl))
     html = etree.HTML(r.text)
     name = html.xpath(
         '//*[@id="profile"]/div/div[2]/div[1]/div/div/text()[1]')
     if not name:
         if retryTime >= constants.MAX_RETRY_TIMES:
             return 2
         retryTime += 1
         print('cookies expired')
         end_time1 = datetime.datetime.now()
         print('Cookie lifetime: {:.0f} seconds'.format(
             (end_time1 - self.start_time).total_seconds()))
         cookies = self.login.getCookie()
         if not cookies:
             print('Failed to get cookie, exiting!')
             return 1
         return self.requestInfo(cookies, personUrl, peopleIndex, retryTime)
     else:
         personalInfo = self.parsePage(personUrl, html, name)
         # Douban data is valid; write it to the database
         if personalInfo:
             self.db_helper.insert_personalInfo(personalInfo)
             print('Inserted profile of user at {} successfully!'.format(personUrl))
         return 0
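Every example here calls into a Utils.delay helper and a constants module that are not shown. Below is a minimal sketch of what they plausibly look like, inferred only from how the snippets call them; every value is an assumption, not taken from the source:

import random
import time


class Utils:

    @staticmethod
    def delay(min_second, max_second):
        # Sleep a random number of seconds in [min_second, max_second]
        # to space requests out and ease the load on the target server.
        time.sleep(random.uniform(min_second, max_second))


# Hypothetical stand-in for the project's constants module.
DELAY_MIN_SECOND = 2   # assumed lower bound on the delay, in seconds
DELAY_MAX_SECOND = 6   # assumed upper bound on the delay, in seconds
MAX_RETRY_TIMES = 3    # assumed cap on re-login attempts per url
MAX_URL_TIMES = 50     # assumed requests served per cookie before refreshing
URL_PREFIX = 'https://movie.douban.com/subject/'  # assumed movie url prefix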
Example #2
 def spider(self):
     peopleIndex = 1
     times = 0
     cookies = self.login.getCookie()
     with open(self.path, "r") as f:  # open the file of user urls
         for personUrl in f:
             if personUrl[-1] == '\n':
                 personUrl = personUrl[:-1]
             if times >= constants.MAX_URL_TIMES:
                 times = 0
                 cookies = self.login.getCookie()
             if not cookies:
                 print('Failed to get cookie, exiting!')
                 print(personUrl)
                 break
             sys.stdout.flush()
             flag = self.requestInfo(cookies, personUrl, peopleIndex, 1)
             if flag == 1:
                 print(personUrl)
                 break
             peopleIndex += 1
             times += 1
             Utils.delay(constants.DELAY_MIN_SECOND,
                         constants.DELAY_MAX_SECOND)
     self.end()
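The request methods report back to spider() with bare integers: 0 means the page was parsed and stored, 1 means a fresh cookie could not be obtained and the whole run should stop, and 2 means the url was skipped after the request failed or retries ran out. A small sketch of how those codes could be given names, purely as an illustration:

from enum import IntEnum


class CrawlResult(IntEnum):
    OK = 0        # parsed and stored successfully
    FATAL = 1     # could not obtain a fresh cookie; abort the whole run
    SKIPPED = 2   # request failed or retries exhausted; move to the next url

With this, the check in spider() would read "if flag == CrawlResult.FATAL: break" while staying compatible with the existing integer returns.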
Example #3
 def request_personfollowurl(self, cookies, personUrl, peopleIndex,
                             retryTime):
     sys.stdout.flush()
     r = requests.get(personUrl, headers=self.headers, cookies=cookies)
     r.encoding = 'utf-8'
     if r.status_code != 200:
         self.failId.append(personUrl)
         print('---------- Failed to crawl followee links of user #{} at {} ----------'.format(
             str(peopleIndex), personUrl))
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
         return 2
     # Log the id we have reached
     print('Crawling followee links of user #{} at {}'.format(
         str(peopleIndex), personUrl))
     html = etree.HTML(r.text)
     result = html.xpath('//dl[@class="obu"]')
     if not result:
         if retryTime >= constants.MAX_RETRY_TIMES:
             return 2
         retryTime += 1
         print('cookies expired')
         end_time1 = datetime.datetime.now()
         print('Cookie lifetime: {:.0f} seconds'.format(
             (end_time1 - self.start_time).total_seconds()))
         cookies = self.login.getCookie()
         if not cookies:
             print('Failed to get cookie, exiting!')
             return 1
         return self.request_personfollowurl(cookies, personUrl,
                                             peopleIndex, retryTime)
     else:
         followPersonUrls = self.parsePage(result, personUrl)
         # Douban data is valid; write it to the database
         if followPersonUrls:
             self.db_helper.insert_followPersonUrl(followPersonUrls)
             print('Inserted followee links of user at {} successfully!'.format(
                 personUrl))
         return 0
Example #4
 def request_movie(self, cookies, mid, movieIndex, retryTime):
     sys.stdout.flush()
     r = requests.get(constants.URL_PREFIX + mid,
                      headers=self.headers,
                      cookies=cookies)
     r.encoding = 'utf-8'
     if r.status_code != 200:
         self.failId.append(mid)
         print('---------- Failed to crawl movie #{} with id {} ----------'.format(
             str(movieIndex), mid))
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
         return 2
     print('Crawling movie #{} with id {}'.format(str(movieIndex), mid))
     movie = self.movie_parser.extract_movie_info(r)
     if not movie:
         if retryTime >= constants.MAX_RETRY_TIMES:
             return 2
         retryTime += 1
         print('cookies expired')
         end_time1 = datetime.datetime.now()
         print('Cookie lifetime: {:.0f} seconds'.format(
             (end_time1 - self.start_time).total_seconds()))
         cookies = self.login.getCookie()
         if not cookies:
             print('Failed to get cookie, exiting!')
             return 1
         return self.request_movie(cookies, mid, movieIndex, retryTime)
     else:
         # Douban data is valid; write it to the database
         movie['douban_id'] = mid
         self.db_helper.insert_movie(movie)
         print('---------- Movie id ' + mid + ': crawled successfully ----------')
         return 0
Example #5
 def spider(self):
     movieIndex = 1
     times = 0
     cookies = self.login.getCookie()
     with open(self.path, "r") as f:  # open the file of movie ids
         for mid in f:
             if mid[-1] == '\n':
                 mid = mid[:-1]
             if times >= constants.MAX_URL_TIMES:
                 times = 0
                 cookies = self.login.getCookie()
             if not cookies:
                 print('Failed to get cookie, exiting!')
                 print(mid)
                 break
             sys.stdout.flush()
             flag = self.request_movie(cookies, mid, movieIndex, 1)
             if flag == 1:
                 print(mid)
                 break
             movieIndex += 1
             times += 1
             Utils.delay(constants.DELAY_MIN_SECOND,
                         constants.DELAY_MAX_SECOND)
     self.end()
Example #6
 def spider(self):
     times = 0
     cookies = self.login.getCookie()
     with open(self.path, "r") as f:  # open the file of user activity urls
         for doUrl in f:
             pid = ''
             if times >= constants.MAX_URL_TIMES:
                 times = 0
                 cookies = self.login.getCookie()
             if not cookies:
                 print('Failed to get session, exiting!')
                 print(doUrl)
                 break
             # strip the trailing newline so it cannot leak into the id
             tmp = doUrl.strip().split('/')
             if tmp[-1] == '':
                 pid = tmp[-3]
             else:
                 pid = tmp[-2]
             # Log the id we have reached
             print('Crawling the movies user id {} is watching!'.format(pid))
             sys.stdout.flush()
             flag = self.requestDo(cookies, pid, 1, doUrl.strip(), None, 1)
             if flag == 1:
                 print(pid)
                 break
             times += 1
             Utils.delay(constants.DELAY_MIN_SECOND,
                         constants.DELAY_MAX_SECOND)
     self.end()
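The pid extraction above tolerates urls with and without a trailing slash by testing whether the last split segment is empty. The same logic can be written more compactly; a sketch, where extract_pid is a hypothetical helper not present in the source:

def extract_pid(doUrl):
    # e.g. 'https://www.douban.com/people/12345/do/' -> '12345'
    parts = doUrl.strip().rstrip('/').split('/')
    # .../people/<pid>/do -> the pid is the second-to-last segment
    return parts[-2]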
Example #7
 def detection(self, session):
     r = session.get(self.logined_url, headers=self.headers3)
     Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
     if r.status_code == 200:
         r.encoding = 'utf-8'
         html = etree.HTML(r.text)
         name = html.xpath(
             '//*[@id="profile"]/div/div[2]/div[1]/div/div/text()[1]')
         if not name:
             return 1
         else:
             return 0
     else:
         return 1
Example #8
def run():
    helptool = HelpTool()
    cookieList = helptool.getCookie()
    base_url = 'https://movie.douban.com/top250'
    headers = {'User-Agent': random.choice(constants.USER_AGENT)}
    doubanIds = []
    savePath = 'data/top250id.txt'
    # index of the cookie currently in use
    index = 0
    # fetch the Douban Top 250 ids
    r = requests.get(
        base_url,
        headers=headers,
        cookies=cookieList[index]
    )
    r.encoding = 'utf-8'
    while True:
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="item"]')
        # If the parse came back empty, delay to ease the load on the
        # target server, then re-fetch the same page and retry.
        if not result:
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            r = requests.get(r.url, headers=headers, cookies=cookieList[index])
            r.encoding = 'utf-8'
            continue
        for item in result:
            doubanid = item.xpath('div/div[@class="hd"]/a/@href')
            if doubanid:
                tmp = doubanid[0].strip().split('/')
                if tmp[-1] == '':
                    value = tmp[-2]
                else:
                    value = tmp[-1]
                doubanIds.append(value)
                print('---------- Movie id ' + value + ': crawled successfully ----------')
        nextUrl = html.xpath('//span[@class="next"]/a/@href')
        if not nextUrl:
            break
        url = base_url + nextUrl[0]
        index += 1
        if index >= constants.UserNum:
            index = 0
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        r = requests.get(
            url,
            headers=headers,
            cookies=cookieList[index]
        )
        r.encoding = 'utf-8'
    if doubanIds:
        helptool.storeFailData(savePath, doubanIds)
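run() rotates through the cookie pool by hand with an index that wraps at constants.UserNum. The same round-robin rotation can be expressed with itertools.cycle; a sketch, assuming cookieList is a plain list of cookie dicts:

import itertools


def rotating_cookies(cookieList):
    # Yields the cookies round-robin forever, so the caller can simply
    # call next(pool) before each request instead of tracking an index.
    return itertools.cycle(cookieList)

# usage:
#   pool = rotating_cookies(cookieList)
#   r = requests.get(url, headers=headers, cookies=next(pool))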
Example #9
 def getCookies1(self):
     index = 1
     cookieNum = 0
     cookieList = []
     for i in range(constants.UserNum):
         cookie = self.getCookie(constants.UserInfo[i][0],
                                 constants.UserInfo[i][1])
         if 'dbcl2' in cookie:
             print('Got cookie #{} successfully!'.format(index))
             cookieList.append(cookie)
             cookieNum += 1
         else:
             print('Failed to get cookie #{}!'.format(index))
         index += 1
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
     return cookieList, cookieNum
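getCookies1 treats the presence of a dbcl2 key as proof that the login succeeded, dbcl2 being the cookie Douban sets for authenticated sessions. The same check as a tiny standalone helper (is_logged_in is hypothetical, not in the source):

def is_logged_in(cookie):
    # Douban only sets the 'dbcl2' cookie for authenticated sessions.
    return 'dbcl2' in cookie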
Example #10
 def requestComment(self, cookies, mid, pageIndex, base_url, nextUrl,
                    retryTime):
     # headers = {'User-Agent': random.choice(constants.USER_AGENT)}
     # fetch the Douban page (API) data
     sys.stdout.flush()
     if nextUrl:
         comment_url = base_url + nextUrl
     else:
         comment_url = base_url
     r = requests.get(comment_url, headers=self.headers, cookies=cookies)
     r.encoding = 'utf-8'
     if r.status_code != 200:
         self.failId.append(mid)
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
         return 2
     html = etree.HTML(r.text)
     result = html.xpath('//div[@class="comment-item"]')
     # If the parse came back empty, assume the session expired
     # and retry with a fresh cookie.
     if not result:
         if retryTime >= constants.MAX_RETRY_TIMES:
             return 2
         retryTime += 1
         print('session expired')
         end_time1 = datetime.datetime.now()
         print('Session lifetime: {:.0f} seconds'.format(
             (end_time1 - self.start_time).total_seconds()))
         cookies = self.login.getCookie()
         if not cookies:
             print('Failed to get session, exiting!')
             return 1
         return self.requestComment(cookies, mid, pageIndex, base_url,
                                    nextUrl, retryTime)
     else:
         movieComments, nextUrl = self.parsePage(mid, comment_url, result,
                                                 html)
         # Douban data is valid; write it to the database
         if movieComments:
             self.db_helper.insert_movieComments(movieComments)
             print('Inserted page {} of short comments successfully!'.format(pageIndex))
             pageIndex += 1
         if nextUrl:
             Utils.delay(constants.DELAY_MIN_SECOND,
                         constants.DELAY_MAX_SECOND)
             return self.requestComment(cookies, mid, pageIndex, base_url,
                                        nextUrl[0], 1)
         return 0
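requestComment follows the next-page link by calling itself, so a movie with more comment pages than Python's default recursion limit (roughly 1000 frames) would raise RecursionError. The same control flow can be flattened into a loop; a sketch that keeps the original helpers but collapses the cookie-refresh retry branch for brevity:

 def requestCommentIter(self, cookies, mid, base_url):
     pageIndex, nextUrl = 1, None
     while True:
         comment_url = base_url + nextUrl if nextUrl else base_url
         r = requests.get(comment_url, headers=self.headers, cookies=cookies)
         r.encoding = 'utf-8'
         if r.status_code != 200:
             self.failId.append(mid)
             return 2
         html = etree.HTML(r.text)
         result = html.xpath('//div[@class="comment-item"]')
         if not result:
             return 2  # cookie-refresh retry omitted; see the original above
         movieComments, nextLink = self.parsePage(mid, comment_url, result,
                                                  html)
         if movieComments:
             self.db_helper.insert_movieComments(movieComments)
             print('Inserted page {} of short comments successfully!'.format(
                 pageIndex))
             pageIndex += 1
         if not nextLink:
             return 0
         nextUrl = nextLink[0]
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)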
Example #11
 def login(self, name, password):
     session = requests.Session()
     session.get(self.login_url, headers=self.headers1)
     post_data = {'name': name, 'password': password, 'remember': 'false'}
     response = session.post(self.post_url,
                             data=post_data,
                             headers=self.headers2)
     if response.status_code == 200:
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
         flag = self.detection(session)
         if flag == 0:
             print('Got session successfully')
             return session
         else:
             print('Failed to get session')
             return None
     else:
         print('Failed to get session')
         return None
Example #12
 def requestWish(self, cookies, pid, pageIndex, base_url, nextUrl,
                 retryTime):
     sys.stdout.flush()
     if nextUrl:
         wish_url = base_url + nextUrl
     else:
         wish_url = base_url
     r = requests.get(
         wish_url,
         headers=self.headers,
         cookies=cookies
     )
     r.encoding = 'utf-8'
     if r.status_code != 200:
         self.failId.append(pid)
         Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
         return 2
     html = etree.HTML(r.text)
     result = html.xpath('//div[@class="item"]')
     if not result:
         if retryTime >= constants.MAX_RETRY_TIMES:
             return 2
         retryTime += 1
         print('cookie expired')
         end_time1 = datetime.datetime.now()
         print('Cookie lifetime: {:.0f} seconds'.format(
             (end_time1 - self.start_time).total_seconds()))
         cookies = self.login.getCookie()
         if not cookies:
             print('Failed to get session, exiting!')
             return 1
         return self.requestWish(cookies, pid, pageIndex, base_url, nextUrl,
                                 retryTime)
     else:
         wishMovies, nextUrl = self.parsePage(pid, result, html)
         # Douban data is valid; write it to the database
         if wishMovies:
             self.db_helper.insert_wishMovies(wishMovies)
             print('Inserted page {} of wish-list movies successfully!'.format(pageIndex))
             pageIndex += 1
         if nextUrl:
             base_url = 'https://movie.douban.com'
             Utils.delay(constants.DELAY_MIN_SECOND,
                         constants.DELAY_MAX_SECOND)
             return self.requestWish(cookies, pid, pageIndex, base_url,
                                     nextUrl[0], 1)
         return 0
Example #13
 def spider(self):
     with open(self.path, "r") as f:  # open the file of movie ids
         for mid in f:
             cookies = self.login.getCookie()
             if not cookies:
                 print('Failed to get session, exiting!')
                 print(mid)
                 break
             sys.stdout.flush()
             if mid[-1] == '\n':
                 mid = mid[:-1]
             base_url = constants.URL_PREFIX + mid + "/comments"
             # Log the id we have reached
             print('Crawling the comments of movie id {}!'.format(mid))
             flag = self.requestComment(cookies, mid, 1, base_url, None, 1)
             if flag == 1:
                 print(mid)
                 break
             Utils.delay(constants.DELAY_MIN_SECOND,
                         constants.DELAY_MAX_SECOND)
     self.end()