def get_comic_info(comicId):
    """Fetch one comic record plus its chapter list.

    Returns (None, None) when the comic id is unknown, otherwise
    (comicInfo, chapters) where
      comicInfo: {id, name, author, description, coverUrl, status, updatedAt}
      chapters:  [{id, name, status, addedAt, pageCount, url}]
    datetime columns are converted to epoch seconds via time.mktime.
    """
    db = DataSource()
    queryComicSql = "select id,name,author,description,status, updatedAt from comics where id = %s"
    rowCount, rows = db.execute(queryComicSql, [comicId])
    if rowCount < 1:
        return None, None
    rec = rows[0]
    comic = {
        'id': rec[0],
        'name': rec[1],
        'author': rec[2],
        'description': rec[3],
        'coverUrl': _toCoverUrl(rec[0]),
        'status': rec[4],
        'updatedAt': time.mktime(rec[5].timetuple()),
    }
    queryChaptersSql = "select chapterId,chapterName,addedAt,status,pageCount from chapters where comicId = %s order by chapterOrder asc"
    rowCount, rows = db.execute(queryChaptersSql, [comicId])
    chapters = [{
        'id': row[0],
        'name': row[1],
        'addedAt': time.mktime(row[2].timetuple()),
        'status': row[3],
        'pageCount': row[4],
        'url': _toChapterUrl(row[0], comicId),
    } for row in rows]
    return comic, chapters
def __init__(self, srcTaskQueue):
    # Spider worker setup; presumably MovieSrcSpider's __init__ — TODO confirm
    # which class this fragment belongs to.
    threading.Thread.__init__(self)
    self.srcTasks = srcTaskQueue  # {movieId, format}
    self.connSession = None  # HTTP session, created lazily on first use
    self.continuousFailedTimes = 0  # consecutive crawl-failure counter
    self.datasource = DataSource()
def __init__(self, taskQueue):
    # Spider worker setup; presumably MovieInfoSpider's __init__ — TODO confirm
    # which class this fragment belongs to.
    threading.Thread.__init__(self)
    self.taskMovieIds = taskQueue  # queue of movie ids still to be crawled
    self.connSession = None  # HTTP session, created lazily on first use
    self.continuousFailedTimes = 0  # consecutive crawl-failure counter
    self.datasource = DataSource()
def _changeComicStatus(comicId, toStatus):
    """Set one comic's status (bumping updatedAt) and report the outcome.

    Returns a success response when exactly one row was updated, otherwise
    logs the failure and returns a 'record not found' failure response.
    """
    updateSql = "update comics set status = %s, updatedAt = now() where id = %s"
    affected, _ = DataSource().execute(updateSql, [toStatus, comicId])
    if affected == 1:
        return _buildSuccessResp()
    Logger.log("mark comic failed, comicId:" + str(comicId))
    return _buildFailResp('record not found')
def get_chapter(chapterId, comicId = 0):
    """Fetch one chapter and build its page-image URLs.

    When the same chapterId exists under several comics, comicId selects
    which one; with the default 0 (or no match) the first row wins.

    Returns None when the chapter is unknown, otherwise:
      {comicId, name, status, imgUrls: [...], nextChapterUrl}
    nextChapterUrl is None when there is no following chapter.
    (Fix: the old docstring advertised a nextChapterId key that the code
    never produced — the actual key is nextChapterUrl.)
    """
    queryChapterSql = "select chapterId,comicId,chapterName,status,pageCount,chapterOrder from chapters where chapterId = %s"
    db = DataSource()
    rowCount, rows = db.execute(queryChapterSql, [chapterId])
    if rowCount < 1:
        return None
    # Prefer the row matching the requested comic; fall back to the first row.
    targetRow = next((row for row in rows if row[1] == comicId), rows[0])
    comicId = targetRow[1]
    chapterName = targetRow[2]
    status = targetRow[3]
    pageCount = targetRow[4]
    order = targetRow[5]
    # Pages are numbered 1..pageCount.
    imgUrls = [_toChapterImgUrl(chapterId, pageNo)
               for pageNo in range(1, pageCount + 1)]
    # Look up the chapter with the next chapterOrder, if any.
    nextChapterSql = "select chapterId from chapters where comicId = %s and chapterOrder = %s"
    rowCount, rows = db.execute(nextChapterSql, [comicId, order + 1])
    nextChapterUrl = _toChapterUrl(rows[0][0], comicId) if rowCount > 0 else None
    return {
        'comicId': comicId,
        'name': chapterName,
        'status': status,
        'imgUrls': imgUrls,
        'nextChapterUrl': nextChapterUrl,
    }
def get_comics_info(pageNo = 1, pageSize = 10, onlyMarked = False):
    """Return one page of comics.

    Returns a success response dict extended with:
      {totalCount, pageNo, pageSize,
       comics: [{id, name, author, description, coverUrl, status, updatedAt}]}
    updatedAt is converted to epoch seconds. When onlyMarked is True, both
    the count and the listing are restricted to status = 'marked'.
    """
    offset = (pageNo - 1) * pageSize
    db = DataSource()
    countSql = "select count(1) from comics"
    if onlyMarked:
        # Fix: the count filter previously said status = 'makred' (typo),
        # so totalCount disagreed with the filtered listing below.
        countSql += " where status = 'marked'"
    _, [[totalCount]] = db.execute(countSql)
    querySql = "select id,name,author,description,status, updatedAt from comics"
    if onlyMarked:
        querySql += " where status = 'marked'"
    querySql += " limit %s,%s"
    _rowCount, rows = db.execute(querySql, [offset, pageSize])
    comics = []
    for row in rows:
        comics.append({
            'id': row[0],
            'name': row[1],
            'author': row[2],
            'description': row[3],
            'coverUrl': _toCoverUrl(row[0]),
            'status': row[4],
            'updatedAt': time.mktime(row[5].timetuple()),
        })
    result = _buildSuccessResp()
    result['totalCount'] = totalCount
    result['pageNo'] = pageNo
    result['pageSize'] = pageSize
    result['comics'] = comics
    return result
from db.datasource import DataSource

# Module-level shared datasource for the "good" database.
# NOTE(review): credentials are masked in this copy; they appear to be
# hard-coded — prefer loading them from the environment or a config file.
dataSource = DataSource(
    host="localhost",
    user='******',
    password='******',
    db="good",
    port=43122,
)
class MovieSrcSpider(threading.Thread):
    '''
    Movie source collector: consumes {movieId, format} tasks from a queue,
    scrapes the download links for that movie/format from 80s.tw and
    upserts them into the video_src_80s table.
    '''

    def __init__(self, srcTaskQueue):
        threading.Thread.__init__(self)
        self.srcTasks = srcTaskQueue  # queue of {movieId, format} dicts
        self.connSession = None  # requests.Session, created lazily
        self.continuousFailedTimes = 0  # consecutive crawl-failure counter
        self.datasource = DataSource()

    def run(self):
        '''Worker loop: pull tasks until the queue stays empty or failures pile up.'''
        noTaskCount = 0
        while True:
            try:
                srcTask = self.srcTasks.get(True, 1)
                noTaskCount = 0
            except Exception as _e:
                # Queue empty: back off and retry; give up after 10 misses.
                # Fix: a stray `return` here previously made this retry/stop
                # logic unreachable, killing the thread on the first miss.
                noTaskCount += 1
                if noTaskCount > 10:
                    Logger.log(
                        "[MovieSrcSpider] no task got, thread ready to stop.")
                    break
                time.sleep(3)
                continue
            # Unpacked outside the try so the except-log below cannot hit an
            # unbound movieId.
            movieId = srcTask['movieId']
            videoFormat = srcTask['format']
            try:
                # Fix: previously called self.tryCrawlMovieInfo(...), a method
                # this class does not define (copy-paste from MovieInfoSpider).
                if self.tryCrawlMovieSrc(movieId, videoFormat) == False:
                    self.continuousFailedTimes += 1
                    if self.continuousFailedTimes >= 100:
                        Logger.log(
                            "[MovieSrcSpider] stop crawling cause too much fail."
                        )
                        break
                else:
                    self.continuousFailedTimes = 0
            except Exception as e:
                Logger.log(
                    "[MovieSrcSpider] failed, movieId:{}, error:{}".format(
                        movieId, e))
            time.sleep(IntervalSecondPerCrawl)

    def ensureSession(self):
        '''Return the shared requests.Session, creating it on first use.'''
        # NOTE(review): no liveness check — a stale session is reused as-is.
        if self.connSession is None:
            self.connSession = requests.Session()
        return self.connSession

    # Source page: http://www.80s.tw/movie/${movieId}/${format}-1
    def tryCrawlMovieSrc(self, movieId, videoFormat):
        '''
        Crawl the source page for one movie/format and persist the links.

        Returns True on success, False when the page fetch fails.
        Parsed entries have the shape {name, size, src} (see parsePage).
        '''
        movieSrcUrl = "http://www.80s.tw/movie/{}/{}-1".format(
            movieId, videoFormat)
        session = self.ensureSession()
        resp = session.get(movieSrcUrl, headers={'User-Agent': ConstUserAgent})
        if resp.status_code != 200:
            Logger.log(
                "[MovieSrcSpider] get movie src failed, url:{}, resp:{}".
                format(movieSrcUrl, resp))
            return False
        videoSrcs = self.parsePage(resp.text)
        self.sync2DB(movieId, videoFormat, videoSrcs)
        return True

    def parsePage(self, pageContent):
        '''Parse download entries from the page; returns [{name, size, src}].'''
        soup = BeautifulSoup(pageContent, "lxml")
        videoSrcs = []
        for spanView in soup.find_all('span', class_="dlname"):
            aLink = spanView.find('a')
            if aLink is None:
                continue
            videoSrcs.append({
                'name': aLink.text.strip().encode('utf-8'),
                'size': aLink.parent.contents[-1].strip().encode('utf-8'),
                'src': aLink.get('href').encode('utf-8'),
            })
        return videoSrcs

    def sync2DB(self, movieId, videoFormat, videoSrcs):
        '''Batch-upsert parsed sources; videoNo is the 1-based list order.'''
        insertVideoSrcSql = """
            replace into video_src_80s(
                movieId, videoFormat, videoNo, title, size,
                videoSrc, createdAt, updatedAt)
            values(%s,%s,%s,%s,%s, %s,now(), now())
        """
        rows = []
        for videoNo, videoSrc in enumerate(videoSrcs, 1):
            rows.append([
                movieId, videoFormat, videoNo, videoSrc['name'],
                videoSrc['size'], videoSrc['src']
            ])
        try:
            # NOTE(review): "inert_or_update_batch" looks like a typo for
            # insert_or_update_batch, but it must match DataSource's real
            # method name — confirm before renaming.
            self.datasource.inert_or_update_batch(insertVideoSrcSql, rows)
        except Exception as e:
            Logger.log(
                "[sync2DB] fail, movieId:{}, format:{}, error:{}".format(
                    movieId, videoFormat, e))
class MovieInfoSpider(threading.Thread):
    '''
    Movie info collector: consumes movie ids from a queue, scrapes each
    movie's detail page on 80s.tw, stores the metadata in movies_80s and
    optionally downloads the cover image.
    '''

    def __init__(self, taskQueue):
        threading.Thread.__init__(self)
        self.taskMovieIds = taskQueue  # queue of movie ids still to crawl
        self.connSession = None  # requests.Session, created lazily
        self.continuousFailedTimes = 0  # consecutive crawl-failure counter
        self.datasource = DataSource()

    def run(self):
        '''Worker loop: pull ids until the queue stays empty or failures pile up.'''
        noTaskCount = 0
        while True:
            try:
                movieId = self.taskMovieIds.get(True, 1)
                # NOTE(review): task_done() fires before the work is actually
                # finished, so queue joiners only wait for dequeue.
                self.taskMovieIds.task_done()
                noTaskCount = 0
            except Exception as _e:
                # Queue empty: back off and retry; give up after 100 misses.
                # Fix: a stray `return` here previously made this retry/stop
                # logic unreachable, killing the thread on the first miss.
                noTaskCount += 1
                if noTaskCount > 100:
                    Logger.log(
                        "[MovieInfoSpider] no task got, thread ready to stop.")
                    break
                time.sleep(30)
                continue
            try:
                if self.tryCrawlMovieInfo(movieId) == False:
                    self.continuousFailedTimes += 1
                    if self.continuousFailedTimes >= 100:
                        Logger.log(
                            "[MovieInfoSpider] stop crawling cause too much fail."
                        )
                        break
                else:
                    self.continuousFailedTimes = 0
            except Exception as e:
                Logger.log(
                    "[MovieInfoSpider] failed, movieId:{}, error:{}".format(
                        movieId, e))
            time.sleep(IntervalSecondPerCrawl)

    def ensureSession(self):
        '''Return the shared requests.Session, creating it on first use.'''
        # NOTE(review): no liveness check — a stale session is reused as-is.
        if self.connSession is None:
            self.connSession = requests.Session()
        return self.connSession

    def addTasks(self, movieIds):
        '''Enqueue an iterable of movie ids for crawling.'''
        for movieId in movieIds:
            self.taskMovieIds.put_nowait(movieId)

    # Detail page: http://www.80s.tw/movie/${movieId}
    def tryCrawlMovieInfo(self, movieId):
        '''
        Crawl one movie's detail page, persist it, optionally fetch the cover.

        Returns True on success, False when the page fetch fails.
        '''
        Logger.log("[MovieInfoSpider] try handle movie:{}".format(movieId))
        movieIndexUrl = "http://www.80s.tw/movie/" + str(movieId)
        session = self.ensureSession()
        resp = session.get(movieIndexUrl,
                           headers={'User-Agent': ConstUserAgent})
        if resp.status_code != 200:
            Logger.log(
                "[MovieInfoSpider] get movie info failed, url:{}, resp:{}".
                format(movieIndexUrl, resp))
            return False
        movieData = self.parsePage(movieId, resp.text)
        movieData['movieId'] = movieId
        self.sync2DB(movieData)
        if ShouldDownloadCover == True and bool(movieData['coverUrl']):
            coverUrl = movieData['coverUrl']
            coverImgPath = ImgRepoDir + '/covers/80s/' + str(movieId) + '.jpg'
            downloadImage(self.ensureSession(), movieIndexUrl, coverUrl,
                          coverImgPath)
        return True

    def parsePage(self, movieId, pageContent):
        '''
        Parse the detail page. Returns a dict:
          {coverUrl, name, aliases, stars, genres, region, languages,
           director, showTime, duration, platformUpdateAt, doubanScore,
           doubanCommentLink, outline, videoFormats}
        aliases/showTime/duration/platformUpdateAt may be None when the
        corresponding label is absent from the page.
        '''
        soup = BeautifulSoup(pageContent, "lxml")
        coverUrl = "http:" + soup.find("img").get("src")
        infoView = soup.find("div", class_="info")
        name = infoView.find("h1").string.encode('utf-8')
        # Aliases ("又名") and cast ("演员") live in unclassed spans.
        spanViews = infoView.find_all("span", class_="")
        aliases = None
        stars = []
        for spanView in spanViews:
            subSpan = spanView.find('span')
            if subSpan is None:
                continue
            spanTitle = subSpan.string.encode('utf-8')
            if '又名' in spanTitle:  # aliases
                aliases = spanView.contents[-1].encode('utf-8').strip()
            elif '演员' in spanTitle:  # cast
                for starLink in spanView.find_all('a'):
                    stars.append(starLink.string.encode('utf-8'))
        divViews = infoView.find_all("div", class_="clearfix")
        # divViews[0]: genres + region + language + director + show time +
        # duration + platform update time
        genres = []
        region = ""
        languages = []
        director = ""
        showTime = None
        duration = None
        platformUpdateAt = None
        spanViews = divViews[0].find_all("span", class_="span_block")
        for spanView in spanViews:
            subSpan = spanView.find('span')
            if subSpan is None:
                continue
            spanTitle = subSpan.string.encode('utf-8')
            if '类型' in spanTitle:  # genres
                for genreLink in spanView.find_all("a"):
                    genres.append(genreLink.string.encode('utf-8'))
            if '地区' in spanTitle:  # region
                regionLink = spanView.find("a")
                if regionLink is not None:
                    region = regionLink.string.encode('utf-8')
            if '语言' in spanTitle:  # languages
                # Fix: previously read spanViews[2] (a fixed position) instead
                # of the span that actually matched the "语言" label, unlike
                # every sibling branch in this loop.
                for languageLink in spanView.find_all("a"):
                    languages.append(languageLink.string.encode('utf-8'))
            if '导演' in spanTitle:  # director
                directorLink = spanView.find("a")
                if directorLink is not None:
                    director = directorLink.string.encode('utf-8')
            if '上映' in spanTitle:  # show time
                showTime = spanView.contents[-1].encode('utf-8').strip()
            if '片长' in spanTitle:  # duration
                duration = spanView.contents[-1].encode('utf-8').strip()
            if '更新' in spanTitle:  # platform update time
                platformUpdateAt = spanView.contents[-1].encode(
                    'utf-8').strip()
        # divViews[1]: douban score and short-comment link
        doubanScore = 0
        doubanCommentLink = ""
        spanViews = divViews[1].find_all("span", class_="span_block")
        for spanView in spanViews:
            spanText = spanView.text.encode('utf-8')
            if '豆瓣评分' in spanText:
                doubanScore = spanView.contents[-1].encode('utf-8').strip()
            if '豆瓣短评' in spanText:
                doubanCommentLink = spanView.find_all("a")[1].get('href')
        # divViews[2]: movie outline
        outline = divViews[2].contents[2].encode('utf-8').strip()
        # Available video formats, mapped from the download-tab element ids.
        idFormatMapping = {'cpdl3': 'hd', 'cpdl4': 'bd', 'cpdl5': 'bt'}
        videoFormats = []
        for formatView in soup.find_all('li', id=re.compile('^cpdl')):
            viewId = formatView.get('id')
            if viewId not in idFormatMapping:
                Logger.log(
                    "[MovieInfoSpider] mapping video format failed, movieId:{}, viewId:{}"
                    .format(movieId, viewId))
                continue
            videoFormats.append(idFormatMapping[viewId])
        ret = {}
        ret['coverUrl'] = coverUrl
        ret['name'] = name
        ret['stars'] = stars
        ret['aliases'] = aliases
        ret['genres'] = genres
        ret['region'] = region
        ret['languages'] = languages
        ret['director'] = director
        ret['showTime'] = showTime
        ret['duration'] = duration
        ret['platformUpdateAt'] = platformUpdateAt
        ret['doubanScore'] = doubanScore
        ret['doubanCommentLink'] = doubanCommentLink
        ret['outline'] = outline
        ret['videoFormats'] = videoFormats
        return ret

    def sync2DB(self, movieData):
        '''Upsert one parsed movie record into movies_80s.

        Multi-valued fields (stars/genres/languages) are joined with "`".
        '''
        movieId = movieData['movieId']
        name = movieData['name']
        aliases = movieData['aliases']
        stars = "`".join(movieData['stars'])
        genres = "`".join(movieData['genres'])
        region = movieData['region']
        languages = "`".join(movieData['languages'])
        director = movieData['director']
        showTime = movieData['showTime']
        duration = movieData['duration']
        platformUpdateAt = movieData['platformUpdateAt']
        doubanScore = movieData['doubanScore']
        doubanCommentLink = movieData['doubanCommentLink']
        outline = movieData['outline']
        insertMovieSql = """
            insert into movies_80s(
                id, name, aliases, stars, genres,
                region, languages, director, showTime, duration,
                platformUpdatedAt, doubanScore, doubanCommentLink, outline,
                createdAt, updatedAt)
            values(%s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,now(), now())
            on duplicate key update updatedAt = now()
        """
        try:
            self.datasource.execute(insertMovieSql, [
                movieId, name, aliases, stars, genres, region, languages,
                director, showTime, duration, platformUpdateAt, doubanScore,
                doubanCommentLink, outline
            ])
        except Exception as e:
            Logger.log("[sync2DB] fail, movieId:{}, error:{}".format(
                movieId, e))

    def isAlreadyCrawl(self, movieId):
        '''Return True when this movie id already exists in movies_80s.'''
        querySql = "select * from movies_80s where id = %s"
        rowCount, _rows = DataSource().execute(querySql, [movieId])
        return rowCount > 0