Ejemplo n.º 1
0
class videoDay:
    """Pull yesterday's daily box-office list and pass it on for storage."""

    def __init__(self):
        """Build the HTTP client, the Redis handle and the MySQL helper."""
        self.httpClint = HTTPClient()
        redis_helper = redisUtils()
        self.redisConn = redis_helper.redis_conn()
        self.mysqlConn = MysqlConn()

    def sendVideoByDay(self):
        """
        Request the "GetMovieDayBoxOfficeList" endpoint for yesterday's date
        and forward the ``Data.Table2`` part of the response, together with
        the date string and the Redis handle, to ``insert_video_day``.

        Only a single day (yesterday, local time) is requested per call.

        :return: None
        """
        endpoint = urls.get("GetMovieDayBoxOfficeList", "")

        # Yesterday as 'YYYY-MM-DD'; the same value fills all three date fields.
        yesterday = datetime.date.today() - datetime.timedelta(days=1)
        _date = yesterday.strftime('%Y-%m-%d')

        payload = {
            "r": random.random(),
            "UserID": "",
            "DateSort": "Day",
            **dict.fromkeys(("Date", "sDate", "eDate"), _date),
            "Index": "102,201,202,205,203,211,221,222,606,225,251,801,604",
            "Line": "",
            "City": "",
            "CityLevel": "",
            "ServicePrice": 1,
            "PageIndex": 1,
            "PageSize": 40,
            "Order": 201,
            "OrderType": "DESC",
        }

        response = self.httpClint.send(urls=endpoint, data=payload)
        rows = response["Data"]["Table2"]
        self.mysqlConn.insert_video_day(rows, _date, self.redisConn)
Ejemplo n.º 2
0
class doubanVideo:
    """Walk the "new_search_subjects" endpoint and store each batch of data."""

    def __init__(self):
        """Build the HTTP client, the Redis handle and the MySQL helper."""
        self.httpClint = HTTPClient()
        self.redisConn = redisUtils().redis_conn()
        self.mysqlConn = MysqlConn()
        # Loop-control flag: despite its name, True means "keep crawling".
        # It is switched to False once a response carries no data.
        self.isDone = True

    def new_search_subjects(self, start=7895):
        """
        Request consecutive ``start`` values until a response has no data.

        Each iteration formats ``start`` into the endpoint's ``req_url``
        template, sends the request and hands a non-empty ``data`` value to
        ``insert_douban_data``. The first empty/missing ``data`` value sets
        ``self.isDone`` to False, which ends the loop (and makes later calls
        on the same instance return immediately).

        :param start: first value substituted into the request URL; it is
            increased by 1 per request. Defaults to 7895, the value that was
            previously hard-coded.
        :return: None
        """
        while self.isDone:
            print(start)
            # Shallow copy so the shared template in ``urls`` keeps its
            # unformatted "req_url" for the next iteration.
            request_spec = copy.copy(urls["new_search_subjects"])
            request_spec["req_url"] = request_spec["req_url"].format(start)
            start += 1

            response = self.httpClint.send(request_spec)
            douban_datas = response.get("data", "")
            if douban_datas:
                self.mysqlConn.insert_douban_data(douban_datas)
            else:
                self.isDone = False
Ejemplo n.º 3
0
 def __init__(self):
     """Attach the HTTP client, the redis_conn() result and the MySQL helper."""
     self.httpClint = HTTPClient()
     redis_helper = redisUtils()
     self.redisConn = redis_helper.redis_conn()
     self.mysqlConn = MysqlConn()
Ejemplo n.º 4
0
 def __init__(self, threadingName):
     """
     Initialise the thread base class, remember the given name and attach
     the HTTP client, the MySQL helper and the redis_conn() result.

     :param threadingName: label stored on the instance as ``threadingName``
     """
     threading.Thread.__init__(self)
     self.threadingName = threadingName
     self.httpClint = HTTPClient()
     self.mysqlConn = MysqlConn()
     redis_helper = redisUtils()
     self.redisConn = redis_helper.redis_conn()
Ejemplo n.º 5
0
class commentThread(threading.Thread):
    """Worker thread that pops movies from the "movice" Redis list and
    crawls their comments backwards in time."""

    # Timestamp layout used for the request URL, the Redis checkpoint and
    # the 'startTime' field of returned comments.
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, threadingName):
        """
        :param threadingName: label for this worker; only used in log output
        """
        super().__init__()
        self.threadingName = threadingName
        self.httpClint = HTTPClient()
        self.mysqlConn = MysqlConn()
        self.redisConn = redisUtils().redis_conn()

    def run(self):
        """Start the ``getProxy`` helper thread, then crawl until the queue
        is empty. Unexpected errors end this worker but are reported instead
        of being swallowed silently."""
        # daemon=True replaces the deprecated setDaemon(True) call.
        proxy_thread = threading.Thread(
            target=getProxy, args=(self, ), daemon=True)
        proxy_thread.start()
        try:
            self.getComment()
        except Exception as e:
            # Still best-effort (the thread just ends), but leave a trace.
            print(
                f"thread {self.threadingName} stopped by unexpected error: {e!r}"
            )

    @classmethod
    def _next_start_time(cls, comments):
        """
        Return the timestamp to use for the next request: the 'startTime' of
        the last comment (falling back to the 2nd/3rd last when it is empty),
        minus one second so the same comment is not fetched again.

        :param comments: non-empty list of comment dicts with a 'startTime' key
        :return: timestamp string in ``_TIME_FORMAT``
        :raises ValueError: if none of the inspected comments has a usable
            'startTime'
        :raises KeyError: if an inspected comment has no 'startTime' key
        """
        # Slicing copes with lists shorter than three entries, where plain
        # negative indexing would raise IndexError.
        newest_first = reversed(comments[-3:])
        last_seen = next(
            (c['startTime'] for c in newest_first if c['startTime']), "")
        # An empty string fails to parse, which surfaces as ValueError.
        parsed = datetime.datetime.strptime(last_seen, cls._TIME_FORMAT)
        return (parsed - datetime.timedelta(seconds=1)).strftime(
            cls._TIME_FORMAT)

    def getComment(self):
        """
        Fetch comments for every movie waiting in the "movice" Redis list.

        For each movie the crawl resumes from the timestamp stored in Redis
        under the movie's name (or from "now" when there is none / it is
        "done"), and moves backwards: every response's last comment time
        minus one second becomes the next request's start time and is also
        written back to Redis as a checkpoint.

        :return: None
        """
        while self.redisConn.llen("movice"):
            time.sleep(random.randint(0, 4))
            raw_movie = self.redisConn.rpop("movice")
            if raw_movie is None:
                # Another worker emptied the list between llen() and rpop().
                continue
            # NOTE(security): eval() executes whatever text is stored in the
            # queue. If the producer only pushes dict literals, switch to
            # ast.literal_eval (or JSON) -- confirm the queue format first.
            movie = eval(raw_movie.decode())
            print(movie)
            # The offset is read once and never advanced; paging is driven
            # by start_time instead.
            offset = movie.get("offset", 0)
            movie_name = movie["nm"]
            start_time = self.redisConn.get(movie_name)
            print("start_time", start_time)
            # Decode before comparing: a bytes value never equals "done".
            if isinstance(start_time, bytes):
                start_time = start_time.decode()
            if start_time == "done" or start_time is None:
                # No usable checkpoint: start from now and walk backwards.
                start_time = datetime.datetime.now().strftime(
                    self._TIME_FORMAT)
            while True:
                try:
                    commentUrls = copy.copy(urls["comments"])
                    commentUrls["req_url"] = commentUrls["req_url"].format(
                        movie.get("id"), offset, start_time)
                    response = self.httpClint.send(commentUrls)
                    comments = response.get("cmts", "")
                    if comments:
                        self.mysqlConn.insert_comments(comments, movie)
                        start_time = self._next_start_time(comments)
                        print(
                            f"当前线程为{self.threadingName}, 当前正在爬取的电影为{movie_name}, 下次爬取评论的时间为:{start_time}"
                        )
                        # Checkpoint; third argument is presumably the expiry
                        # in seconds (one year) -- confirm against the client.
                        self.redisConn.set(movie["nm"], start_time,
                                           60 * 60 * 24 * 365)
                    elif response.get("total", "") == 0 or comments == []:
                        # An empty page means the crawl reached the end.
                        print(
                            f"当前线程为{self.threadingName}, 当前正在爬取的电影为{movie_name}, 当前页面返回数据为0,判断爬取完成"
                        )
                        break
                    # NOTE(review): any other response shape is retried
                    # immediately and without limit, as before.
                except ValueError as e:
                    print(f"日期转化失败: {e}")
                    break
                except KeyError as e:
                    print(f"有数据错误:{e}")
                    continue
                except Exception as e:
                    # NOTE(review): the request is retried without delay or
                    # retry limit, as before.
                    print(f"错误信息:{e}")