# -*- coding: utf-8 -*-
# Imports reconstructed from usage. Logger, RedisToo/RedisTool,
# DataToo/DataTool, MySqlToo/MySqlTool, TimeToo, ConfigParser,
# GetBookInfoTool, SaveBookInfoToMySqlToo and SaveBookToRedisTool are
# project-local helpers assumed to be importable.
import json
import math
import random
import threading
import time

import moment
import pymysql
import redis
import requests
from lxml import etree


class Publish():
    def __init__(self):
        self.b_title = 'Publish'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.rds = RedisToo()
        self.dataToo = DataToo(logName=self.b_title,
                               second=self.b_second,
                               timeStr=self.b_timeStr)
        self.logger = Logger(logname=self.dataToo.initLogName(),
                             loglevel=1,
                             logger=self.b_title).getlog()

    def saveBookToRedisAction(self):
        environmentalType = input("Enter 0, 1 or 2 (0: dev, 1: test, 2: online): >>")
        maxBookNex = 0
        self.logger.debug('\n\nConfirm parameters: environment: %s | max fetch count: %s \n\n'
                          % (environmentalType, maxBookNex))
        time.sleep(1)
        isStart = input("Start? (y/n): >>")
        if isStart == 'y':
            self.rds.p.publish(
                'bookChannel',
                json.dumps({
                    'type': 'SaveBookToRedis',
                    'environmentalType': environmentalType,
                    'maxBookNex': maxBookNex
                }))
        else:
            print('Fetch cancelled')

    def getBookTXTAction(self):
        getBookIdsListSize = input("How many groups of data to fetch (max 10): >>")
        maxCatalogNex = 1
        print('\n\nConfirm parameters: maxCatalogNex : %s | getBookIdsListSize : %s \n\n'
              % (maxCatalogNex, getBookIdsListSize))
        time.sleep(1)
        isStart = input("Start? (y/n): >>")
        if isStart == 'y':
            self.rds.p.publish(
                'bookChannel',
                json.dumps({
                    'type': 'GetBookTXT',
                    'maxCatalogNex': maxCatalogNex,
                    'getBookIdsListSize': getBookIdsListSize
                }))
        else:
            print('Fetch cancelled')
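# Usage sketch (not part of the original source): a minimal command loop that
# drives Publish. The command names 'book', 'txt' and 'stop' are illustrative
# assumptions, not confirmed by the original code.
if __name__ == '__main__':
    publish = Publish()
    while True:
        cmd = input("command (book/txt/stop): >>")
        if cmd == 'stop':
            break
        if cmd == 'book':
            publish.saveBookToRedisAction()
        elif cmd == 'txt':
            publish.getBookTXTAction()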
class Subscribe():
    def __init__(self):
        self.b_title = 'Subscribe'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.rds = RedisTool()
        self.dataToo = DataTool(logName=self.b_title,
                                second=self.b_second,
                                timeStr=self.b_timeStr)
        self.logger = Logger(logname=self.dataToo.initLogName(),
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.mySql = MySqlTool(logName=self.dataToo.initLogName())

    def saveBookToRedisAction(self, params):
        self.logger.debug(params)
        book = SaveBookToRedisTool(
            environmentalType=params['environmentalType'],
            rds=self.rds,
            dataToo=self.dataToo,
            mySql=self.mySql,
            logger=self.logger)
        book.saveAllBookListToRedis(rdsKeyName=params['rdsKeyName'])
        self.logger.debug('saveBookToRedisAction finished')

    def getBookTXTAction(self, params):
        self.logger.debug(params)
        book = GetBookTXT(getBookIdsListSize=params['getBookIdsListSize'],
                          rdsKeyName=params['rdsKeyName'])
        book.contentsLoad()
        self.logger.debug('getBookTXTAction finished')
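# Sketch of the receiving end (assumed, not in the original source): subscribe
# to 'bookChannel' and route each JSON message to the matching handler by its
# 'type' field, mirroring the message names Publish sends. Assumes RedisTool
# exposes the raw redis client as .r.
def listen(rds, subscribe):
    p = rds.r.pubsub()
    p.subscribe('bookChannel')
    for msg in p.listen():
        if msg['type'] != 'message':
            continue
        params = json.loads(msg['data'])
        if params['type'] == 'SaveBookToRedis':
            subscribe.saveBookToRedisAction(params)
        elif params['type'] == 'GetBookTXT':
            subscribe.getBookTXTAction(params)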
class GetBookTXT(object):
    def __init__(self, getBookIdsListSize, rdsKeyName):
        self.b_getBookIdsListSize = int(getBookIdsListSize)
        self.b_rdsKeyName = rdsKeyName
        self.b_title = 'getBookTXT'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.dataToo = DataTool(logName=self.b_title,
                                second=self.b_second,
                                timeStr=self.b_timeStr)
        self.mySql = MySqlTool(logName=self.dataToo.initLogName())
        self.logger = Logger(logname=self.dataToo.initLogName(),
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = RedisTool()
        self.getBookInfoToo = GetBookInfoTool(second=self.b_second,
                                              dataToo=self.dataToo,
                                              logger=self.logger)
        self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
            second=self.b_second,
            logger=self.logger,
            getBookInfoToo=self.getBookInfoToo,
            mySql=self.mySql,
            dataToo=self.dataToo)

    def target(self):
        # Pop up to b_getBookIdsListSize links off the Redis list.
        links = []
        for i in range(self.b_getBookIdsListSize):
            link = self.rds.r.lpop(self.b_rdsKeyName)
            if link is not None:
                links.append(link.decode(encoding='utf-8'))
        return links

    def contentsLoad(self):
        # Keep draining the Redis list until it is empty. The original code
        # recursed through isOk(); a loop avoids unbounded recursion depth.
        while True:
            links = self.target()
            if len(links) <= 0:
                self.logger.debug('bookTxtLoad: no data\n')
                return
            for item in links:
                self.logger.debug(item)
                self.saveBookInfoToMySqlToo.saveText(link=item)
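# Minimal usage sketch (assumed, not from the original source): seed the Redis
# list that GetBookTXT drains, then let contentsLoad() pop and persist each
# link. The key name and URLs are placeholders; assumes RedisTool exposes the
# raw client as .r.
def seedAndDrain():
    rds = RedisTool()
    for url in ['https://www.xs8.cn/chapter/100/1',
                'https://www.xs8.cn/chapter/100/2']:
        rds.r.rpush('bookIdsList', url)
    GetBookTXT(getBookIdsListSize=10, rdsKeyName='bookIdsList').contentsLoad()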
# Earlier iteration of Subscribe that drives SaveBookToRedis / GetBookTXT
# directly instead of going through the Tool helpers.
class Subscribe():
    def __init__(self):
        self.b_title = 'Subscribe'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.rds = RedisToo()
        self.dataToo = DataToo(logName=self.b_title,
                               second=self.b_second,
                               timeStr=self.b_timeStr)
        self.logger = Logger(logname=self.dataToo.initLogName(),
                             loglevel=1,
                             logger=self.b_title).getlog()

    def saveBookToRedisAction(self, params):
        self.logger.debug(params)
        book = SaveBookToRedis(environmentalType=params['environmentalType'],
                               maxBookNex=params['maxBookNex'])
        book.bookTxtLoad()
        self.logger.debug('saveBookToRedisAction finished')

    def getBookTXTAction(self, params):
        self.logger.debug(params)
        book = GetBookTXT(maxCatalogNex=params['maxCatalogNex'],
                          getBookIdsListSize=params['getBookIdsListSize'])
        book.contentsLoad()
        self.logger.debug('getBookTXT finished')
class MySqlToo():
    def __init__(self, logName):
        self.con = ConfigParser()
        self.logger = Logger(logname=logName, loglevel=1,
                             logger="MySQLToo").getlog()
        mysqlConfig = (self.con.getConfig('mysql', 'host'),
                       self.con.getConfig('mysql', 'user'),
                       self.con.getConfig('mysql', 'password'),
                       self.con.getConfig('mysql', 'database'))
        self.logger.info(
            "\n\t mySqlConfig:\n\t\t host: %s\n\t\t user : %s\n\t\t password : %s\n\t\t database : %s "
            % mysqlConfig)

    # Open a new database connection from config
    def openMySqlConfig(self):
        return pymysql.connect(host=self.con.getConfig('mysql', 'host'),
                               user=self.con.getConfig('mysql', 'user'),
                               password=self.con.getConfig('mysql', 'password'),
                               database=self.con.getConfig('mysql', 'database'))

    # Batch insert rows
    def batchAdd(self, sql, data_info):
        db = self.openMySqlConfig()
        cursor = db.cursor()
        try:
            cursor.executemany(sql, data_info)
            db.commit()
            db.close()
            self.logger.info('Insert succeeded')
            return True
        except Exception:
            # Roll back on any error
            db.rollback()
            db.close()
            self.logger.debug('Insert failed: [ sql ] %s ' % (str(sql)))
            self.logger.debug('Insert failed: [ data_info ] %s ' % (str(data_info)))
            return False

    # Fetch a list of rows
    def getListData(self, sql):
        db = self.openMySqlConfig()
        cursor = db.cursor()
        results = []
        try:
            cursor.execute(sql)
            results = cursor.fetchall()
            db.close()
            self.logger.debug("Query succeeded [ %s ]: sql==> %s" % (len(results), sql))
            return results
        except Exception:
            self.logger.debug("Query failed: sql==> %s" % (sql))
            db.close()
            return results
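# Example (table and values assumed for illustration): batchAdd drives
# cursor.executemany with %s placeholders, so values are passed as a list of
# tuples rather than interpolated into the SQL string.
def batchAddExample():
    db = MySqlToo(logName='demo.log')
    sql = "INSERT INTO `links` (`url`, article) VALUES (%s, %s)"
    db.batchAdd(sql, [('https://example.com/1', 'text one'),
                      ('https://example.com/2', 'text two')])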
class BookTXTLoad(object):
    def __init__(self, second, environmentalType, maxBookNex):
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_second = int(second)
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'getBookTXT'
        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0
        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.logName, second=self.b_second)
        self.logger = Logger(logname=self.logName, loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = self.initRds()
        self.timeToo = TimeToo()
        self.b_heads = self.initHeads()
        self.b_mysqlStr = self.initMysqlStr()

    def initMysqlStr(self):
        if self.b_environmentalType == 2:
            environmental = 'online'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            environmental = 'test'
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            environmental = 'dev'
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId
        return {
            'saveText': "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData': "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def initHeads(self):
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'newstatisticUUID=1547076169_1527614489; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
            'Host': 'www.xs8.cn',
            'Upgrade-Insecure-Requests': '1',
            'Referer': '',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
        }

    def intLogName(self):
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.txt' % (self.b_title, timeStr)

    def initRds(self):
        pool = redis.ConnectionPool(
            host=self.con.getConfig('redisConfig', 'host'),
            port=int(self.con.getConfig('redisConfig', 'port')),
            db=int(self.con.getConfig('redisConfig', 'db')))
        return redis.StrictRedis(connection_pool=pool)

    def second(self):
        time.sleep(self.b_second)

    # 2. Use MySqlToo.getListData to fetch the list of book ids
    def getBookData(self):
        bookList = []
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        for item in bookData:
            bookList.append(item[0])
        return bookList

    def getCatalogData(self, bookId, index):
        catalogList = []
        sql = '%s %s' % (self.b_mysqlStr['getCatalogData'],
                         self.dataToo.listToStr(bookId))
        self.logger.info('Querying book chapters [ %s ]...\n' % (sql))
        catalogData = self.mySql.getListData(sql=sql)
        for item in catalogData:
            catalogList.append(item[0])
        self.b_catalogList.append(catalogList)

    # 4. Arrange chapter catalog data into grouped arrays
    def setCatalogList(self, bookGroupingData):
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList: no data\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            self.dataToo.threads(listTaskList[i], self.getCatalogData)

    def getArticle(self, link, group, bookCatalogUrlGroupingData, ngroup, nindex):
        bkd = bookCatalogUrlGroupingData
        self.b_heads['Referer'] = link
        self.logger.info(
            'Collected [ %s ] book group [ %s / %s ] catalog group [ %s / %s ] '
            'article [ %s / %s ] link [ %s ], fetching in %s seconds' %
            (self.countNum, group + 1, len(self.b_catalogList), ngroup + 1,
             bkd['listGroupSize'], nindex + 1, bkd['listTaskSize'], link,
             self.b_second))
        self.second()
        text = self.dataToo.getText(link=link, heads=self.b_heads)
        if len(text['data']) <= 0:
            self.errorUrl.append(link)
            self.countNum += 1
            self.logger.debug('Link %s: fetch failed: %s\n' % (self.countNum, text))
            return
        html = etree.HTML(text['data'])
        content_list = html.xpath('//div[@class="read-content j_readContent"]')
        if len(content_list) <= 0:
            self.countNum += 1
            title = html.xpath('//title/text()')
            requestIntercept = html.xpath('//div[@class="empty-text"]//strong/text()')
            request404 = html.xpath('//h3[@class="lang"]/text()')
            self.logger.debug('Link %s: HTML parsing failed!' % (self.countNum))
            self.logger.debug('Link %s [title]: %s' % (self.countNum, title))
            if len(requestIntercept) > 0:
                self.errorUrl.append(link)
                second = self.b_second * 180
                self.logger.debug(
                    'Link %s [requestIntercept]: %s request intercepted; sleeping %s seconds before the next link'
                    % (self.countNum, requestIntercept, second))
                time.sleep(second)
            if len(request404) > 0:
                self.request404.append(link)
                self.logger.debug('Link %s [request404]: %s' %
                                  (self.countNum, request404))
            self.logger.debug('Link %s [text]: %s\n' % (self.countNum, text))
            return
        content_list = content_list[0]
        content = etree.tostring(content_list, method='xml').decode('utf-8')
        res = self.mySql.batchAdd(sql=self.b_mysqlStr['saveText'],
                                  data_info=[(link, content)])
        if not res:
            # Record the link as failed only when the insert did not succeed.
            self.errorUrl.append(link)
        self.countNum += 1
        self.logger.debug('Link %s: %s\n' % (self.countNum, res))

    # 6. Fetch page data for each chapter url, catalog group by catalog group
    def getBookTXT(self, catalogList, index):
        if len(catalogList) <= 0:
            self.logger.debug('Book group [ %s / %s ]: getBookTXT has no data\n' %
                              (index + 1, len(self.b_catalogList)))
            return
        bookCatalogUrlGroupingData = self.dataToo.groupingData(
            list=catalogList, pageSize=self.b_bookTXTGroupSize, fixed=True)
        listTaskList = bookCatalogUrlGroupingData['listTaskList']
        for i in range(bookCatalogUrlGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            start = time.time()
            for j in range(len(listTaskList[i])):
                self.second()
                self.getArticle(listTaskList[i][j], index,
                                bookCatalogUrlGroupingData, i, j)
            end = time.time()
            self.logger.debug(
                'Book group [ %s / %s ] catalog group [ %s / %s ]: start %s, end %s ==> elapsed %s seconds [ %s ]\n'
                % (index + 1, len(self.b_catalogList), i + 1,
                   bookCatalogUrlGroupingData['listGroupSize'], float(start),
                   float(end), int(float(end) - float(start)),
                   self.timeToo.changeTime(int(float(end) - float(start)))))

    def saveText(self):
        for i in range(len(self.b_catalogList)):
            if len(self.b_catalogList[i]) <= 0:
                self.logger.debug('Book group [ %s / %s ]: saveText has no data\n' %
                                  (i + 1, len(self.b_catalogList)))
                continue
            start = time.time()
            self.getBookTXT(self.b_catalogList[i], i)
            end = time.time()
            self.logger.debug(
                'Book group [ %s / %s ]: start %s, end %s ==> elapsed %s seconds [ %s ]\n'
                % (i + 1, len(self.b_catalogList), float(start), float(end),
                   int(float(end) - float(start)),
                   self.timeToo.changeTime(int(float(end) - float(start)))))
            self.logger.info('*-*-*-*-*-*-' * 15)

    # Article content collection entry point
    def bookTxtLoad(self):
        start = time.time()
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad: no data\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)
        self.logger.info('========' * 15)
        self.logger.info("\tTime: %s" % (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info("\tSite: %s" % (self.con.getConfig('webConfig', 'host')))
        self.logger.info("\t\t\tThis run will collect %s books.\n" %
                         (bookGroupingData['listSize']))
        self.logger.info(
            '\t\t\t%s books, split into %s groups of %s books each. \n' %
            (bookGroupingData['listSize'], bookGroupingData['listGroupSize'],
             bookGroupingData['listTaskSize']))
        self.logger.info(
            '\t\t\tEstimated time: %s groups, %s seconds between fetches, %s books per group, '
            'about %s seconds per book, about %s seconds per group, %s seconds in total [ %s ]\n' %
            (bookGroupingData['listGroupSize'], self.b_second,
             bookGroupingData['listTaskSize'], self.b_second + 10,
             self.b_second + (self.b_second + 10) * bookGroupingData['listTaskSize'],
             (self.b_second + (bookGroupingData['listTaskSize'] *
                               (self.b_second + 10))) * bookGroupingData['listGroupSize'],
             self.timeToo.changeTime(
                 (self.b_second + (bookGroupingData['listTaskSize'] *
                                   (self.b_second + 10))) * bookGroupingData['listGroupSize'])))
        self.logger.info('========' * 15)
        self.setCatalogList(bookGroupingData)
        self.saveText()
        end = time.time()
        self.logger.info('---' * 30)
        self.logger.info('\t\tTime: %s' % (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info('\t\tElapsed: %s seconds [ %s ]' %
                         (float(end) - float(start),
                          self.timeToo.changeTime(float(end) - float(start))))
        self.logger.info('\t\tLinks collected: %s' % (self.countNum))
        self.logger.info('\t\tFailed links: %s' % (len(self.errorUrl)))
        self.logger.info('\t\t404 links: %s' % (len(self.request404)))
        self.logger.info('\t\tFailed links:\n\t\t\t%s' % (self.errorUrl))
        self.logger.info('\t\t404 links:\n\t\t\t%s' % (self.request404))
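# Worked example of the time budget logged in bookTxtLoad(), with assumed
# numbers: b_second = 1, 100 books, bookPageSize = 10 -> 10 groups of 10
# books. Each book is estimated at b_second + 10 seconds, so each group costs
# b_second + 10 * (b_second + 10) = 111 seconds and the run about 1110 seconds.
def estimateExample():
    second, books, per_group = 1, 100, 10
    groups = books // per_group          # 10 groups
    per_book = second + 10               # 11 seconds per book
    per_group_cost = second + per_group * per_book   # 111 seconds per group
    total = per_group_cost * groups      # 1110 seconds overall
    return total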
class DataTool():
    def __init__(self, logName, second, timeStr):
        self.b_second = second
        self.b_timeStr = timeStr
        self.b_logName = logName
        self.logger = Logger(logname=self.initLogName(),
                             loglevel=1,
                             logger="DataTool").getlog()

    def groupingData(self, list, pageSize, fixed=False):
        listSize = len(list)
        if fixed:
            listGroupSize = pageSize
        else:
            listGroupSize = math.ceil(float(listSize) / pageSize)
        nloops = range(listGroupSize)
        listTaskList = []
        listTaskSize = math.ceil(float(listSize) / listGroupSize)
        for i in nloops:
            try:
                self.logger.info("Group %s: [ %s ] \n\t" %
                                 (i + 1, len(list[i * listTaskSize:(i + 1) * listTaskSize])))
                listTaskList.append(list[i * listTaskSize:(i + 1) * listTaskSize])
            except Exception:
                self.logger.info("Group %s: [ %s ] \n\t" %
                                 (i + 1, len(list[i * listTaskSize:])))
                listTaskList.append(list[i * listTaskSize:])
        res = {
            'listSize': listSize,
            'listGroupSize': listGroupSize,
            'listTaskSize': listTaskSize,
            'listTaskList': listTaskList
        }
        self.logger.info('groupingData : %s' % res)
        return res

    def threads(self, taskList, target):
        # Run target(subList, index) on one thread per non-empty sub-list,
        # then wait for all of them to finish. Threads are collected in a
        # separate list so empty sub-lists cannot misalign the indices.
        threads = []
        for i, task in enumerate(taskList):
            if len(task) <= 0:
                continue
            threads.append(threading.Thread(target=target, args=(task, i)))
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    # Fetch an HTML page over HTTP
    def getHTMLTxt(self, link, heads):
        result = {'status': '200', 'data': '', 'link': link}
        try:
            r = requests.get(link, headers=heads, timeout=100)
            r.encoding = "utf-8"
            result['data'] = r.text
        except Exception:
            second = random.randint(0, self.b_second * 60)
            self.logger.debug('[ %s ][ 403 ] possibly intercepted; sleeping %s seconds before the next link!\n'
                              % (link, second))
            time.sleep(second)
            result['status'] = '403'
        return result

    # Fetch a JSON payload over HTTP
    def getJsonTxt(self, link, heads):
        result = {'status': '200', 'data': '', 'link': link}
        try:
            r = requests.get(link, headers=heads)
            r.encoding = "utf-8"
            result['data'] = json.loads(r.text)
        except Exception:
            second = random.randint(0, self.b_second * 60)
            self.logger.debug('[ %s ][ 403 ] possibly intercepted; sleeping %s seconds before the next link!\n'
                              % (link, second))
            time.sleep(second)
            result['status'] = '403'
        return result

    def listToStr(self, data_info=()):
        return tuple(data_info)

    def getText(self, link):
        if len(link) <= 0:
            return
        heads = self.initHeads('html')
        return self.getHTMLTxt(link=link, heads=heads)

    def getJson(self, link):
        if len(link) <= 0:
            return
        heads = self.initHeads('json')
        return self.getJsonTxt(link=link, heads=heads)

    def initHeads(self, type):
        if type == 'html':
            return {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
                'Cookie': 'newstatisticUUID=1547894850_1903849637; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
                'Host': 'www.xs8.cn',
                'Upgrade-Insecure-Requests': '1',
                'Referer': '',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
            }
        elif type == 'json':
            return {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
                'Cookie': 'newstatisticUUID=1547123562_436906659; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
                'Host': 'www.xs8.cn',
                'Referer': '',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            }

    def initLogName(self):
        return '%s_%s.log' % (self.b_logName, self.b_timeStr)

    def getLinkArr(self, data):
        return [item for item in data]
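# groupingData in action (illustrative numbers): 23 items with pageSize=10
# yield ceil(23/10) = 3 groups of ceil(23/3) = 8 items each; the last group
# holds the 7 leftovers.
def groupingExample():
    tool = DataTool(logName='demo', second=1, timeStr='2019-01-01-00-00-00')
    groups = tool.groupingData(list=list(range(23)), pageSize=10)
    # groups['listGroupSize'] == 3, groups['listTaskSize'] == 8
    # lengths of groups['listTaskList'] -> [8, 8, 7]
    return groups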
# Earlier iteration of the data helper, without the timeStr-based log name.
class DataToo():
    def __init__(self, logName, second):
        self.b_second = second
        self.logger = Logger(logname=logName, loglevel=1,
                             logger="DataToo").getlog()

    def groupingData(self, list, pageSize, fixed=False):
        listSize = len(list)
        if fixed:
            listGroupSize = pageSize
        else:
            listGroupSize = math.ceil(float(listSize) / pageSize)
        nloops = range(listGroupSize)
        listTaskList = []
        listTaskSize = math.ceil(float(listSize) / listGroupSize)
        for i in nloops:
            try:
                self.logger.info("Group %s: [ %s ] \n\t" %
                                 (i + 1, len(list[i * listTaskSize:(i + 1) * listTaskSize])))
                listTaskList.append(list[i * listTaskSize:(i + 1) * listTaskSize])
            except Exception:
                self.logger.info("Group %s: [ %s ] \n\t" %
                                 (i + 1, len(list[i * listTaskSize:])))
                listTaskList.append(list[i * listTaskSize:])
        res = {
            'listSize': listSize,
            'listGroupSize': listGroupSize,
            'listTaskSize': listTaskSize,
            'listTaskList': listTaskList
        }
        self.logger.info('groupingData : %s' % res)
        return res

    def threads(self, taskList, target):
        # Run target(subList, index) on one thread per non-empty sub-list,
        # then wait for all of them to finish.
        threads = []
        for i, task in enumerate(taskList):
            if len(task) <= 0:
                continue
            threads.append(threading.Thread(target=target, args=(task, i)))
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    # Fetch an HTML page over HTTP
    def getHTMLTxt(self, link, heads):
        result = {'status': '200', 'data': '', 'link': link}
        try:
            r = requests.get(link, headers=heads, timeout=10)
            r.encoding = "utf-8"
            result['data'] = r.text
        except Exception:
            second = random.randint(0, self.b_second * 60)
            self.logger.debug('[ %s ][ 403 ] possibly intercepted; sleeping %s seconds before the next link!\n'
                              % (link, second))
            time.sleep(second)
            result['status'] = '403'
        return result

    def listToStr(self, data_info):
        return tuple(data_info)

    def getText(self, link, heads):
        if len(link) <= 0:
            return
        return self.getHTMLTxt(link=link, heads=heads)
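# Sketch of DataToo.threads in use (toy data, not from the original source):
# each sub-list gets its own thread running target(subList, index); threads()
# starts them all and joins before returning.
def threadsExample():
    def printGroup(items, index):
        print('group %s -> %s items' % (index, len(items)))

    dataToo = DataToo(logName='demo.log', second=1)
    dataToo.threads([[1, 2], [3], [4, 5, 6]], printGroup)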
# Standalone publisher script fragment. getBookIdsSql, logger, bookPageSize
# and bookIdSize are defined earlier in the original file; this variant of
# SaveBookToRedis takes mysqlStr and group sizes directly.
mysqlStr = {
    'saveText': "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
    'getBookIdsSql': getBookIdsSql,
    'getCatalogData': "SELECT url FROM links WHERE fs = 0 AND book_Id in "
}

end = time.time()
pool = redis.ConnectionPool(host='192.168.2.202', port=6379, db=8)
r2 = redis.StrictRedis(connection_pool=pool)
while True:
    msg = input("publish: >>")
    if msg == "stop":
        logger.debug("Stopping publisher")
        break
    if msg == "catalog":
        logger.debug("Publishing fetch notification")
        saveBookToRedis = SaveBookToRedis(mysqlStr=mysqlStr,
                                          bookPageSize=bookPageSize,
                                          bookIdSize=bookIdSize)
        saveBookToRedis.control()
        time.sleep(60)
        r2.publish('getBookCatalog', msg)
    if msg == "txt":
        logger.debug("Publishing fetch notification")
        saveBookToRedis = SaveBookToRedis(mysqlStr=mysqlStr,
                                          bookPageSize=bookPageSize,
                                          bookIdSize=bookIdSize)
        saveBookToRedis.control()
class SaveBookToRedis():
    def __init__(self, environmentalType, maxBookNex):
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'SaveBookToRedis'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0
        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.rds = RedisToo()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.b_title, second=self.b_second)
        self.logger = Logger(logname=self.intLogName(), loglevel=1,
                             logger=self.b_title).getlog()
        self.timeToo = TimeToo()
        self.b_mysqlStr = self.initMysqlStr()

    def intLogName(self):
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.log' % (self.b_title, timeStr)

    def initMysqlStr(self):
        if self.b_environmentalType == 2:
            environmental = 'online'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            environmental = 'test'
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            environmental = 'dev'
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId
            # dev runs use tiny group sizes
            self.b_bookPageSize = 2
            self.b_bookIdSize = 2
            self.b_bookTXTGroupSize = 1
        return {
            'saveText': "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData': "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def second(self):
        time.sleep(self.b_second)

    def getBookData(self):
        bookList = []
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        for item in bookData:
            bookList.append(item[0])
        return bookList

    def setCatalogList(self, bookGroupingData):
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList: no data\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            time.sleep(10)
            if len(listTaskList[i]) <= 0:
                continue
            data = []
            for item in listTaskList[i]:
                # each item is a list of book ids; store them comma-joined
                data.append(','.join(item))
            self.rds.setListData('bookIdsList', data)

    def bookTxtLoad(self):
        start = time.time()
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad: no data\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)
        self.setCatalogList(bookGroupingData)
        end = time.time()
        self.logger.info('========' * 15)
        self.logger.info("startTime: %s" % (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info("webHost: %s" % (self.con.getConfig('webConfig', 'host')))
        self.logger.info("author: %s" % (self.con.getConfig('webConfig', 'author')))
        self.logger.info("email: %s" % (self.con.getConfig('webConfig', 'email')))
        self.logger.info(
            'This run will collect [ %s ] books, split into %s groups of %s books each.'
            % (bookGroupingData['listSize'], bookGroupingData['listGroupSize'],
               bookGroupingData['listTaskSize']))
        self.logger.info(
            'saveBooksToRedis pushed [ %s ] groups of books, elapsed %s seconds [ %s ]'
            % (bookGroupingData['listGroupSize'], float(end) - float(start),
               self.timeToo.changeTime(float(end) - float(start))))
        self.logger.info('========' * 15)
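# Consumer-side sketch (assumed, not in the original source): setCatalogList
# pushes comma-joined groups of book ids into 'bookIdsList', so a reader pops
# one entry and splits it back into individual ids. Assumes the Redis helper
# exposes the raw client as .r, as RedisTool does elsewhere in this project.
def popBookIdGroup():
    entry = RedisTool().r.lpop('bookIdsList')
    if entry is None:
        return []
    return entry.decode('utf-8').split(',')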
class getFreeBookTXT(object):
    def __init__(self):
        self.b_title = 'getFreeBookTXT'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.dataToo = DataTool(logName=self.b_title,
                                second=self.b_second,
                                timeStr=self.b_timeStr)
        self.mySql = MySqlTool(logName=self.dataToo.initLogName())
        self.logger = Logger(logname=self.dataToo.initLogName(), loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = RedisTool()
        self.getBookInfoToo = GetBookInfoTool(second=self.b_second,
                                              dataToo=self.dataToo,
                                              logger=self.logger)
        self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
            second=self.b_second,
            logger=self.logger,
            getBookInfoToo=self.getBookInfoToo,
            mySql=self.mySql,
            dataToo=self.dataToo)
        self.con = ConfigParser()

    def getFreeBookLink(self):
        bookInfoList = self.getBookInfoToo.toFreeBookListPageGetBookList(
            freeBookListPage=self.con.getConfig('webConfig', 'freeBookListPage'))
        self.logger.debug(bookInfoList)
        return bookInfoList

    def formatCatalogInfo(self, data):
        catalogData = data['vs']
        links = []
        for i in catalogData:
            for j in i['cs']:
                url = self.con.getConfig(
                    'webConfig', 'host') + '/chapter/' + data['bookId'] + '/' + j['id']
                # saveText() is called for its side effect; links stays empty
                self.saveBookInfoToMySqlToo.saveText(link=str(url))
        return links

    def contentsLoad(self):
        links = self.getFreeBookLink()
        if len(links) <= 0:
            self.logger.debug('getFreeBookLink: no data\n')
            return
        for item in links:
            time.sleep(self.b_second)
            jsonData = self.getBookInfoToo.getCatalogInfo(bookId=item['book_Id'])
            self.logger.debug(jsonData)
            catalogData = self.formatCatalogInfo(data=jsonData['data'])
            self.logger.debug(catalogData)
            self.saveBookInfoToMySqlToo.saveCatalog(bookId=jsonData['data']['bookId'])
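# Assumed shape of the catalog JSON consumed by formatCatalogInfo: 'vs' is a
# list of volumes, each carrying a 'cs' list of chapters with an 'id'. This
# toy payload is illustrative only; the real API response is not shown in the
# original source.
sampleCatalog = {
    'bookId': '10000611804961003',
    'vs': [{'cs': [{'id': '1'}, {'id': '2'}]}],
}
# Each chapter becomes: host + '/chapter/' + bookId + '/' + chapter id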