Example #1
def get_comic_info(comicId):
    '''
    Returns (None, None) when the comic does not exist; otherwise returns
    (comicInfo, chapters) where
        comicInfo: {id, name, author, description, coverUrl, status, updatedAt}
        chapters:  [{id, name, status, addedAt, pageCount, url}]
    '''
    db = DataSource()
    queryComicSql = "select id,name,author,description,status, updatedAt from comics where id = %s"
    rowCount, rows = db.execute(queryComicSql, [comicId])
    if rowCount < 1:
        return None, None
    
    comicRecord = rows[0]
    comic = {}
    comic['id'] = comicRecord[0]
    comic['name'] = comicRecord[1]
    comic['author'] = comicRecord[2]
    comic['description'] = comicRecord[3]
    comic['coverUrl'] = _toCoverUrl(comicRecord[0])
    comic['status'] = comicRecord[4]
    comic['updatedAt'] = time.mktime(comicRecord[5].timetuple())
    
    queryChaptersSql = "select chapterId,chapterName,addedAt,status,pageCount from chapters where comicId = %s order by chapterOrder asc"
    rowCount, rows = db.execute(queryChaptersSql, [comicId])
    chapters = []
    for row in rows:
        chapter = {}
        chapter['id'] = row[0]
        chapter['name'] = row[1]
        chapter['addedAt'] = time.mktime(row[2].timetuple())
        chapter['status'] = row[3]
        chapter['pageCount'] = row[4]
        chapter['url'] = _toChapterUrl(chapter['id'], comicId) 
        chapters.append(chapter)
        
    return comic, chapters
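All of these snippets lean on an implicit DataSource wrapper whose execute(sql, params) returns a (rowCount, rows) tuple. A minimal sketch of that assumed contract, with pymysql as a stand-in backend (the connection defaults are assumptions; only the return shape comes from the examples, and the constructor arguments mirror Example #7):

import pymysql

class DataSource:
    # Hypothetical reconstruction: only execute(sql, params) -> (rowCount, rows)
    # is implied by the examples; everything else here is an assumption.
    def __init__(self, host='localhost', user='root', password='', db='test', port=3306):
        self._conn = pymysql.connect(host=host, user=user, password=password,
                                     db=db, port=port, autocommit=True)

    def execute(self, sql, params=None):
        with self._conn.cursor() as cursor:
            rowCount = cursor.execute(sql, params or [])
            rows = cursor.fetchall()  # empty tuple for non-SELECT statements
        return rowCount, rows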
Example #2
    def __init__(self, srcTaskQueue):
        threading.Thread.__init__(self)

        self.srcTasks = srcTaskQueue  # queue of {movieId, format} tasks
        self.connSession = None
        self.continuousFailedTimes = 0
        self.datasource = DataSource()
Example #3
    def __init__(self, taskQueue):
        threading.Thread.__init__(self)

        self.taskMovieIds = taskQueue  # queue of movie IDs waiting to be crawled
        self.connSession = None
        self.continuousFailedTimes = 0
        self.datasource = DataSource()
Example #4
def _changeComicStatus(comicId, toStatus):
    sql = "update comics set status = %s, updatedAt = now() where id = %s"
    updatedCount, _ = DataSource().execute(sql, [toStatus, comicId])
    if updatedCount != 1:
        Logger.log("mark comic failed, comicId:" + str(comicId))
        return _buildFailResp('record not found')
        
    return _buildSuccessResp()
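The handler above assumes _buildSuccessResp and _buildFailResp helpers; a minimal sketch, with the payload shape being a guess rather than something taken from the source:

def _buildSuccessResp():
    # Hypothetical shape; Example #6 only shows that extra keys get added to it.
    return {'code': 0, 'message': 'ok'}

def _buildFailResp(message):
    return {'code': 1, 'message': message}

Note that with MySQL's default client flags, an UPDATE that leaves the row unchanged reports 0 affected rows, so re-marking a comic with its current status would be reported here as 'record not found'.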
Example #5
def get_chapter(chapterId, comicId = 0):
    '''
    Returns None when the chapter does not exist; otherwise returns
    {comicId, name, status, imgUrls: [], nextChapterUrl}
    nextChapterUrl may be None when this is the last chapter.
    '''
    queryChapterSql = "select chapterId,comicId,chapterName,status,pageCount,chapterOrder from chapters where chapterId = %s"
    db = DataSource()
    rowCount, rows = db.execute(queryChapterSql, [chapterId])
    if rowCount < 1:
        return None
    
    targetRow = None
    for row in rows:
        if row[1] == comicId:
            targetRow = row
            break
    
    if targetRow is None:
        targetRow = rows[0]
    
    comicId = targetRow[1]
    chapterName = targetRow[2]
    status = targetRow[3]
    pageCount = targetRow[4]
    order = targetRow[5]
    
    imgUrls = []
    for pageNo in range(1, pageCount + 1):
        imgUrls.append(_toChapterImgUrl(chapterId, pageNo))
    
    nextChapterSql = "select chapterId from chapters where comicId = %s and chapterOrder = %s"
    rowCount, rows = db.execute(nextChapterSql, [comicId, order + 1])
    nextChapterUrl = None
    if rowCount > 0:
        nextChapterUrl = _toChapterUrl(rows[0][0], comicId)
    
    chapter = {}
    chapter['comicId'] = comicId
    chapter['name'] = chapterName
    chapter['status'] = status
    chapter['imgUrls'] = imgUrls
    chapter['nextChapterUrl'] = nextChapterUrl 
    
    return chapter
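A usage sketch for get_chapter; the IDs are illustrative:

chapter = get_chapter(1001, comicId=42)
if chapter is not None:
    print('{} - {} pages'.format(chapter['name'], len(chapter['imgUrls'])))
    if chapter['nextChapterUrl'] is not None:
        print('next chapter at {}'.format(chapter['nextChapterUrl']))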
Example #6
def get_comics_info(pageNo = 1, pageSize = 10, onlyMarked = False):
    '''
    return: {totalCount, pageNo, pageSize, comics:[{id,name,author,description,coverUrl,status,updatedAt}]}
    '''
    offset = (pageNo - 1) * pageSize
    
    db = DataSource()
    countSql = "select count(1) from comics"
    if onlyMarked:
        countSql += " where status = 'marked'"
    _, [[totalCount]] = db.execute(countSql)
    
    querySql = "select id,name,author,description,status, updatedAt from comics"
    if onlyMarked:
        querySql += " where status = 'marked'"
    querySql += " limit %s,%s"
    _rowCount, rows = db.execute(querySql, [offset, pageSize])
    
    comics = []
    for row in rows:
        comic = {}
        comic['id'] = row[0]
        comic['name'] = row[1]
        comic['author'] = row[2]
        comic['description'] = row[3]
        comic['coverUrl'] = _toCoverUrl(row[0])
        comic['status'] = row[4]
        comic['updatedAt'] = time.mktime(row[5].timetuple())
        
        comics.append(comic)
    
    result = _buildSuccessResp()
    result['totalCount'] = totalCount
    result['pageNo'] = pageNo
    result['pageSize'] = pageSize
    result['comics'] = comics
    
    return result
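An illustrative pagination loop driven by the returned totalCount:

pageNo, pageSize = 1, 10
while True:
    page = get_comics_info(pageNo, pageSize, onlyMarked=True)
    for comic in page['comics']:
        print('{} {}'.format(comic['id'], comic['name']))
    if pageNo * pageSize >= page['totalCount']:
        break
    pageNo += 1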
Example #7
from db.datasource import DataSource

dataSource = DataSource(
    host="localhost",
    user='******',
    password='******',
    db="good",
    port=43122,
)
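With a module-level dataSource like the one above, a query could look like this; the table and parameters are illustrative:

rowCount, rows = dataSource.execute(
    "select id, name from comics where status = %s", ['marked'])
for comicId, name in rows:
    print('{} {}'.format(comicId, name))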
Example #8
class MovieSrcSpider(threading.Thread):
    '''
    Movie video-source collector.
    '''
    def __init__(self, srcTaskQueue):
        threading.Thread.__init__(self)

        self.srcTasks = srcTaskQueue  # queue of {movieId, format} tasks
        self.connSession = None
        self.continuousFailedTimes = 0
        self.datasource = DataSource()

    def run(self):
        noTaskCount = 0
        while True:
            try:
                srcTask = self.srcTasks.get(True, 1)
                noTaskCount = 0
            except Exception as _e:
                # queue was empty within the timeout; count idle rounds
                noTaskCount += 1
                if noTaskCount > 10:
                    Logger.log(
                        "[MovieSrcSpider] no task got, thread ready to stop.")
                    break

                time.sleep(3)
                continue

            try:
                movieId = srcTask['movieId']
                videoFormat = srcTask['format']
                if not self.tryCrawlMovieSrc(movieId, videoFormat):
                    self.continuousFailedTimes += 1
                    if self.continuousFailedTimes >= 100:
                        Logger.log(
                            "[MovieSrcSpider] stop crawling cause too much fail."
                        )
                        break
                else:
                    self.continuousFailedTimes = 0
            except Exception as e:
                Logger.log(
                    "[MovieSrcSpider] failed, movieId:{}, error:{}".format(
                        movieId, e))

            time.sleep(IntervalSecondPerCrawl)

    def ensureSession(self):
        # TODO: verify that an existing session is still usable before reuse
        if self.connSession is None:
            self.connSession = requests.Session()

        return self.connSession

    # http://www.80s.tw/movie/${movieId}/${format}-1
    def tryCrawlMovieSrc(self, movieId, videoFormat):
        '''
        Fetches the source-list page, parses it into [{name, size, src}]
        entries and writes them to the DB via sync2DB.
        Returns True on success, False on failure.
        '''

        movieSrcUrl = "http://www.80s.tw/movie/{}/{}-1".format(
            movieId, videoFormat)

        session = self.ensureSession()
        resp = session.get(movieSrcUrl, headers={'User-Agent': ConstUserAgent})
        if resp.status_code != 200:
            Logger.log(
                "[MovieSrcSpider] get movie src failed, url:{}, resp:{}".
                format(movieSrcUrl, resp))
            return False

        videoSrcs = self.parsePage(resp.text)
        self.sync2DB(movieId, videoFormat, videoSrcs)

        return True

    def parsePage(self, pageContent):
        '''
        return [{name, size, src}]
        '''
        soup = BeautifulSoup(pageContent, "lxml")

        videoSrcs = []

        spanViews = soup.find_all('span', class_="dlname")
        for spanView in spanViews:
            aLink = spanView.find('a')
            if aLink is None:
                continue

            name = aLink.text.strip().encode('utf-8')
            size = aLink.parent.contents[-1].strip().encode('utf-8')
            src = aLink.get('href').encode('utf-8')

            videoSrc = {}
            videoSrc['name'] = name
            videoSrc['size'] = size
            videoSrc['src'] = src
            videoSrcs.append(videoSrc)

        return videoSrcs

    def sync2DB(self, movieId, videoFormat, videoSrcs):
        insertVideoSrcSql = """
                replace into video_src_80s(
                        movieId, videoFormat, videoNo, title, size,
                        videoSrc, createdAt, updatedAt)
                values(%s,%s,%s,%s,%s, %s,now(), now())
        """
        rows = []
        videoNo = 0
        for videoSrc in videoSrcs:
            videoNo = videoNo + 1
            row = [
                movieId, videoFormat, videoNo, videoSrc['name'],
                videoSrc['size'], videoSrc['src']
            ]
            rows.append(row)
        try:
            self.datasource.insert_or_update_batch(insertVideoSrcSql, rows)
        except Exception as e:
            Logger.log(
                "[sync2DB] fail, movieId:{}, format:{}, error:{}".format(
                    movieId, videoFormat, e))
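An illustrative driver for MovieSrcSpider: fill a queue with {movieId, format} tasks and let run() drain it. The movie IDs and format are made up:

from Queue import Queue  # 'queue' on Python 3; this codebase is Python 2 style

srcTaskQueue = Queue()
for movieId in (1001, 1002):
    srcTaskQueue.put({'movieId': movieId, 'format': 'bd'})

spider = MovieSrcSpider(srcTaskQueue)
spider.start()
spider.join()  # run() returns once the queue has stayed empty long enough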
Example #9
class MovieInfoSpider(threading.Thread):
    '''
    Movie info collector.
    '''
    def __init__(self, taskQueue):
        threading.Thread.__init__(self)

        self.taskMovieIds = taskQueue  # queue of movie IDs waiting to be crawled
        self.connSession = None
        self.continuousFailedTimes = 0
        self.datasource = DataSource()

    def run(self):
        noTaskCount = 0
        while True:
            try:
                movieId = self.taskMovieIds.get(True, 1)
                self.taskMovieIds.task_done()
                noTaskCount = 0
            except Exception as _e:
                # queue was empty within the timeout; count idle rounds
                noTaskCount += 1
                if noTaskCount > 100:
                    Logger.log(
                        "[MovieInfoSpider] no task got, thread ready to stop.")
                    break

                time.sleep(30)
                continue

            try:
                if not self.tryCrawlMovieInfo(movieId):
                    self.continuousFailedTimes += 1
                    if self.continuousFailedTimes >= 100:
                        Logger.log(
                            "[MovieInfoSpider] stop crawling cause too much fail."
                        )
                        break
                else:
                    self.continuousFailedTimes = 0
            except Exception as e:
                Logger.log(
                    "[MovieInfoSpider] failed, movieId:{}, error:{}".format(
                        movieId, e))

            time.sleep(IntervalSecondPerCrawl)

    def ensureSession(self):
        # TODO: verify that an existing session is still usable before reuse
        if self.connSession is None:
            self.connSession = requests.Session()

        return self.connSession

    def addTasks(self, movieIds):
        '''
        movieIds - array of movieId
        '''
        for movieId in movieIds:
            self.taskMovieIds.put_nowait(movieId)

    # http://www.80s.tw/movie/${movieId}
    def tryCrawlMovieInfo(self, movieId):
        '''
        Returns True on success, False on failure.
        '''
        Logger.log("[MovieInfoSpider] try handle movie:{}".format(movieId))

        movieIndexUrl = "http://www.80s.tw/movie/" + str(movieId)

        session = self.ensureSession()
        resp = session.get(movieIndexUrl,
                           headers={'User-Agent': ConstUserAgent})
        if resp.status_code != 200:
            Logger.log(
                "[MovieInfoSpider] get movie info failed, url:{}, resp:{}".
                format(movieIndexUrl, resp))
            return False

        movieData = self.parsePage(movieId, resp.text)
        movieData['movieId'] = movieId

        self.sync2DB(movieData)

        if ShouldDownloadCover and movieData['coverUrl']:
            coverUrl = movieData['coverUrl']
            coverImgPath = ImgRepoDir + '/covers/80s/' + str(movieId) + '.jpg'
            downloadImage(self.ensureSession(), movieIndexUrl, coverUrl,
                          coverImgPath)

        return True

    def parsePage(self, movieId, pageContent):
        '''
        return {
            'coverUrl': "",
            'name': "",
            'aliases': "",
            'stars': [],
            'genres': [],
            'region': "",
            'languages': [],
            'director': "",
            'showTime': "",
            'duration': "",
            'platformUpdateAt': "",
            'doubanScore': "",
            'doubanCommentLink': "",
            'outline': "",
            'videoFormats': []
        }
        '''
        soup = BeautifulSoup(pageContent, "lxml")

        coverUrl = "http:" + soup.find("img").get("src")

        infoView = soup.find("div", class_="info")
        name = infoView.find("h1").string.encode('utf-8')

        spanViews = infoView.find_all("span", class_="")
        aliases = None
        stars = []
        for spanView in spanViews:
            subSpan = spanView.find('span')
            if subSpan is None:
                continue

            spanTitle = subSpan.string.encode('utf-8')
            if '又名' in spanTitle:
                aliases = spanView.contents[-1].encode('utf-8').strip()
            elif '演员' in spanTitle:
                starLinks = spanView.find_all('a')
                for starLink in starLinks:
                    stars.append(starLink.string.encode('utf-8'))

        divViews = infoView.find_all("div", class_="clearfix")

        # div[0]: genres + region + languages + director + release date + duration + platform updated-at
        genres = []
        region = ""
        languages = []
        director = ""
        showTime = None
        duration = None
        platformUpdateAt = None
        spanViews = divViews[0].find_all("span", class_="span_block")
        for spanView in spanViews:
            subSpan = spanView.find('span')
            if subSpan is None:
                continue

            spanTitle = subSpan.string.encode('utf-8')
            if '类型' in spanTitle:
                genreLinks = spanView.find_all("a")
                for genreLink in genreLinks:
                    genres.append(genreLink.string.encode('utf-8'))

            if '地区' in spanTitle:
                regionLink = spanView.find("a")
                if regionLink is not None:
                    region = regionLink.string.encode('utf-8')

            if '语言' in spanTitle:
                languageLinks = spanView.find_all("a")
                for languageLink in languageLinks:
                    languages.append(languageLink.string.encode('utf-8'))

            if '导演' in spanTitle:
                directorLink = spanView.find("a")
                if directorLink is not None:
                    director = directorLink.string.encode('utf-8')

            if '上映' in spanTitle:
                showTime = spanView.contents[-1].encode('utf-8').strip()

            if '片长' in spanTitle:
                duration = spanView.contents[-1].encode('utf-8').strip()

            if '更新' in spanTitle:
                platformUpdateAt = spanView.contents[-1].encode(
                    'utf-8').strip()

        # div[1]: Douban score and short-comment link
        doubanScore = 0
        doubanCommentLink = ""
        spanViews = divViews[1].find_all("span", class_="span_block")
        for spanView in spanViews:
            spanText = spanView.text.encode('utf-8')
            if '豆瓣评分' in spanText:
                doubanScore = spanView.contents[-1].encode('utf-8').strip()

            if '豆瓣短评' in spanText:
                doubanCommentLink = spanView.find_all("a")[1].get('href')

        # div[2]: movie synopsis
        outline = divViews[2].contents[2].encode('utf-8').strip()

        idFormatMapping = {'cpdl3': 'hd', 'cpdl4': 'bd', 'cpdl5': 'bt'}
        videoFormats = []
        formatViews = soup.find_all('li', id=re.compile('^cpdl'))
        for formatView in formatViews:
            viewId = formatView.get('id')
            if viewId not in idFormatMapping:
                Logger.log(
                    "[MovieInfoSpider] mapping video format failed, movieId:{}, viewId:{}"
                    .format(movieId, viewId))
                continue
            videoFormats.append(idFormatMapping[viewId])

        ret = {}
        ret['coverUrl'] = coverUrl
        ret['name'] = name
        ret['stars'] = stars
        ret['aliases'] = aliases
        ret['genres'] = genres
        ret['region'] = region
        ret['languages'] = languages
        ret['director'] = director
        ret['showTime'] = showTime
        ret['duration'] = duration
        ret['platformUpdateAt'] = platformUpdateAt
        ret['doubanScore'] = doubanScore
        ret['doubanCommentLink'] = doubanCommentLink
        ret['outline'] = outline
        ret['videoFormats'] = videoFormats

        return ret

    def sync2DB(self, movieData):
        movieId = movieData['movieId']
        name = movieData['name']
        coverUrl = movieData['coverUrl']
        aliases = movieData['aliases']
        stars = "`".join(movieData['stars'])
        genres = "`".join(movieData['genres'])

        region = movieData['region']
        languages = "`".join(movieData['languages'])
        director = movieData['director']
        showTime = movieData['showTime']
        duration = movieData['duration']
        platformUpdateAt = movieData['platformUpdateAt']
        doubanScore = movieData['doubanScore']
        doubanCommentLink = movieData['doubanCommentLink']
        outline = movieData['outline']

        insertMovieSql = """
                insert into movies_80s(
                        id, name, aliases, stars, genres,
                        region, languages, director, showTime, duration,
                        platformUpdatedAt, doubanScore, doubanCommentLink, outline, createdAt,
                        updatedAt)
                values(%s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,now(), now())  
                on duplicate key update updatedAt = now()
        """

        try:
            self.datasource.execute(insertMovieSql, [
                movieId, name, aliases, stars, genres, region, languages,
                director, showTime, duration, platformUpdateAt, doubanScore,
                doubanCommentLink, outline
            ])
        except Exception as e:
            Logger.log("[sync2DB] fail, movieId:{}, error:{}".format(
                movieId, e))
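The info spider can be driven the same way through addTasks(); the IDs are made up:

from Queue import Queue  # 'queue' on Python 3

spider = MovieInfoSpider(Queue())
spider.addTasks([1001, 1002, 1003])
spider.start()
spider.join()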
Example #10
    def isAlreadyCrawl(self, movieId):
        querySql = "select * from movies_80s where id = %s"
        rowCount, _rows = DataSource().execute(querySql, [movieId])
        return rowCount > 0
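A typical guard before queueing new work; the spider instance and ID are illustrative:

if not spider.isAlreadyCrawl(1001):
    spider.addTasks([1001])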