def __init__(self, second, environmentalType, maxBookNex):
    """Set up crawl configuration, collaborators and logging for the
    book-text job.

    NOTE(review): orphaned method — its enclosing class is not visible in
    this chunk; it mirrors BookTXTLoad.__init__ defined later in the file.
    """
    # Grouping sizes for paging books / book-ids / text chunks.
    self.b_bookPageSize = 10
    self.b_bookIdSize = 5
    self.b_bookTXTGroupSize = 100
    self.b_second = int(second)
    self.b_environmentalType = int(environmentalType)
    self.b_maxBookNex = int(maxBookNex)
    self.b_title = 'getBookTXT'
    # Accumulators for crawl results and failures.
    self.b_catalogList = []
    self.b_bookTXTData = []
    self.errorUrl = []
    self.request404 = []
    self.countNum = 0
    # Project-local collaborators.
    self.con = ConfigParser()
    self.logName = self.intLogName()
    self.mySql = MySqlToo(logName=self.logName)
    self.dataToo = DataToo(logName=self.logName, second=self.b_second)
    self.logger = Logger(logname=self.logName, loglevel=1,
                         logger=self.b_title).getlog()
    self.rds = self.initRds()
    self.timeToo = TimeToo()
    self.b_heads = self.initHeads()
    self.b_mysqlStr = self.initMysqlStr()
def __init__(self, maxCatalogNex, getBookIdsListSize):
    """Set up crawl configuration for a catalog-driven book-text job.

    NOTE(review): orphaned method — its enclosing class is not visible in
    this chunk.
    """
    self.b_getBookIdsListSize = int(getBookIdsListSize)
    # Grouping sizes for paging books / book-ids / text chunks.
    self.b_bookPageSize = 10
    self.b_bookIdSize = 5
    self.b_bookTXTGroupSize = 10
    self.b_fs = 0
    self.b_maxCatalogNex = int(maxCatalogNex)
    self.b_title = 'getBookTXT'
    self.b_second = 1
    self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
    # Accumulators for crawl results and failures.
    self.b_catalogList = []
    self.b_bookTXTData = []
    self.errorUrl = []
    self.request404 = []
    self.countNum = 0
    # Project-local collaborators.
    self.con = ConfigParser()
    self.logName = self.intLogName()
    self.mySql = MySqlToo(logName=self.logName)
    self.dataToo = DataToo(logName=self.b_title, second=self.b_second,
                           timeStr=self.b_timeStr)
    self.logger = Logger(logname=self.dataToo.initLogName(), loglevel=1,
                         logger=self.b_title).getlog()
    self.rds = RedisToo()
    self.timeToo = TimeToo()
    self.b_heads = self.initHeads()
    self.b_mysqlStr = self.initMysqlStr()
def __init__(self, environmentalType, maxBookNex):
    """Set up configuration for pushing book ids to Redis.

    NOTE(review): orphaned method — duplicates SaveBookToRedis.__init__
    defined later in the file.
    """
    # Grouping sizes for paging books / book-ids / text chunks.
    self.b_bookPageSize = 10
    self.b_bookIdSize = 5
    self.b_bookTXTGroupSize = 100
    self.b_environmentalType = int(environmentalType)
    self.b_maxBookNex = int(maxBookNex)
    self.b_title = 'SaveBookToRedis'
    self.b_second = 1
    self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
    # Accumulators for crawl results and failures.
    self.b_catalogList = []
    self.b_bookTXTData = []
    self.errorUrl = []
    self.request404 = []
    self.countNum = 0
    # Project-local collaborators.
    self.con = ConfigParser()
    self.logName = self.intLogName()
    self.rds = RedisToo()
    self.mySql = MySqlToo(logName=self.logName)
    self.dataToo = DataToo(logName=self.b_title, second=self.b_second)
    self.logger = Logger(logname=self.intLogName(), loglevel=1,
                         logger=self.b_title).getlog()
    self.timeToo = TimeToo()
    self.b_mysqlStr = self.initMysqlStr()
def __init__(self):
    """Open a Redis connection pool using values from the project config.

    NOTE(review): orphaned method — duplicates RedisToo.__init__ defined
    later in the file.
    """
    self.con = ConfigParser()
    self.links = []
    self.pool = redis.ConnectionPool(
        host=self.con.getConfig('redisConfig', 'host'),
        port=self.con.getConfig('redisConfig', 'port'),
        db=self.con.getConfig('redisConfig', 'db'))
    self.r = redis.Redis(connection_pool=self.pool)
class RedisToo():
    """Small Redis wrapper for the list push/pop operations the crawler uses."""

    def __init__(self):
        self.con = ConfigParser()
        self.links = []
        self.pool = redis.ConnectionPool(
            host=self.con.getConfig('redisConfig', 'host'),
            port=self.con.getConfig('redisConfig', 'port'),
            db=self.con.getConfig('redisConfig', 'db'))
        self.r = redis.Redis(connection_pool=self.pool)

    # Pop (get and delete) up to `num` elements from the head of list `name`;
    # returns the decoded values (fewer than `num` when the list drains).
    def getListData(self, name="list_name1", num=1):
        dataList = []
        for i in range(int(num)):
            data = self.r.lpop(name)
            # Fixed: `is not None` instead of `!= None` — lpop returns the
            # None sentinel when the list is empty; identity is the correct
            # comparison.
            if data is not None:
                nData = bytes_to_str(data, 'utf-8')
                dataList.append(nData)
        return dataList

    # Append a batch of values to the tail of list `name`.
    # Returns False when there is nothing to push, True otherwise.
    def setListData(self, name='list_name1', lists=None):
        # Fixed: the original declared `lists=[]`, a mutable default argument
        # shared across all calls; None + guard is the safe idiom and keeps
        # the same observable behavior for every caller.
        if lists is None:
            lists = []
        if len(lists) <= 0:
            return False
        self.r.rpush(name, *lists)
        return True
def __init__(self, logName):
    """Create a logger and log the configured MySQL connection settings.

    NOTE(review): orphaned method — duplicates MySqlToo.__init__ defined
    later in the file.
    """
    self.con = ConfigParser()
    self.logger = Logger(logname=logName, loglevel=1,
                         logger="MySQLToo").getlog()
    mysqlConfig = (self.con.getConfig('mysql', 'host'),
                   self.con.getConfig('mysql', 'user'),
                   self.con.getConfig('mysql', 'password'),
                   self.con.getConfig('mysql', 'database'))
    self.logger.info(
        "\n\t mySqlConfig:\n\t\t host: %s\n\t\t user : %s\n\t\t password : %s\n\t\t database : %s "
        % (mysqlConfig))
def __init__(self, second, dataToo, logger):
    """Store collaborators and zero the per-kind crawl counters.

    NOTE(review): orphaned method — duplicates GetBookInfoTool.__init__
    defined later in the file.
    """
    self.b_second = second
    # Failure accumulators.
    self.errorUrl = []
    self.request404 = []
    self.bookInfoList = []
    # Progress counters per scraped entity kind.
    self.bookCountNum = 0
    self.freeBookCountNum = 0
    self.bookCatalogCountNum = 0
    self.bookTxtCountNum = 0
    self.con = ConfigParser()
    self.dataToo = dataToo
    self.logger = logger
def __init__(self):
    """Wire up data/mysql/redis helpers and the shared logger.

    NOTE(review): orphaned method — duplicates getFreeBookTXT.__init__
    defined later in the file.
    """
    # self.b_getBookIdsListSize = int(getBookIdsListSize)
    # self.b_rdsKeyName = rdsKeyName
    self.b_title = 'getFreeBookTXT'
    self.b_second = 1
    self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
    self.dataToo = DataTool(logName=self.b_title, second=self.b_second,
                            timeStr=self.b_timeStr)
    self.mySql = MySqlTool(logName=self.dataToo.initLogName())
    self.logger = Logger(logname=self.dataToo.initLogName(), loglevel=1,
                         logger=self.b_title).getlog()
    self.rds = RedisTool()
    self.getBookInfoToo = GetBookInfoTool(second=self.b_second,
                                          dataToo=self.dataToo,
                                          logger=self.logger)
    self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
        second=self.b_second,
        logger=self.logger,
        getBookInfoToo=self.getBookInfoToo,
        mySql=self.mySql,
        dataToo=self.dataToo)
    self.con = ConfigParser()
class MySqlToo():
    """MySQL helper: batch inserts and list queries, opening a fresh
    connection per call."""

    def __init__(self, logName):
        self.con = ConfigParser()
        self.logger = Logger(logname=logName, loglevel=1,
                             logger="MySQLToo").getlog()
        mysqlConfig = (self.con.getConfig('mysql', 'host'),
                       self.con.getConfig('mysql', 'user'),
                       self.con.getConfig('mysql', 'password'),
                       self.con.getConfig('mysql', 'database'))
        self.logger.info(
            "\n\t mySqlConfig:\n\t\t host: %s\n\t\t user : %s\n\t\t password : %s\n\t\t database : %s "
            % (mysqlConfig))

    # Open a new connection from the configured credentials.
    def openMySqlConfig(self):
        return pymysql.connect(self.con.getConfig('mysql', 'host'),
                               self.con.getConfig('mysql', 'user'),
                               self.con.getConfig('mysql', 'password'),
                               self.con.getConfig('mysql', 'database'))

    # Batch insert rows; returns True on success, False (after rollback)
    # on any database error — best-effort semantics are preserved.
    def batchAdd(self, sql, data_info):
        db = self.openMySqlConfig()
        cursor = db.cursor()
        try:
            cursor.executemany(sql, data_info)
            db.commit()
            self.logger.info('存储成功')
            return True
        # Fixed: the original used a bare `except:`, which also swallows
        # SystemExit / KeyboardInterrupt; narrow it to Exception.
        except Exception:
            db.rollback()
            self.logger.debug('存储失败:[ sql ] %s ' % (str(sql)))
            self.logger.debug('存储失败:[ data_info ] %s ' % (str(data_info)))
            return False
        finally:
            # Fixed: close the connection on every path (the original could
            # leak it if rollback itself raised).
            db.close()

    # Run a SELECT and return all rows; returns [] on error.
    def getListData(self, sql):
        db = self.openMySqlConfig()
        cursor = db.cursor()
        results = []
        try:
            cursor.execute(sql)
            results = cursor.fetchall()
            self.logger.debug("查询成功[ %s ]: sql==> %s" % (len(results), sql))
            return results
        # Fixed: bare `except:` narrowed to Exception, as above.
        except Exception:
            self.logger.debug("查询失败: sql==> %s" % (sql))
            return results
        finally:
            db.close()
class BookTXTLoad(object):
    """Crawl chapter pages for books listed in MySQL and store each article's
    HTML back into the `links` table.

    Pipeline: bookTxtLoad() -> getBookData() -> setCatalogList() ->
    saveText() -> getBookTXT() -> getArticle().
    """

    def __init__(self, second, environmentalType, maxBookNex):
        # Grouping sizes for paging books / book-ids / text chunks.
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_second = int(second)
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'getBookTXT'
        # Accumulators for crawl results and failures.
        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0
        # Project-local collaborators.
        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.logName, second=self.b_second)
        self.logger = Logger(logname=self.logName, loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = self.initRds()
        self.timeToo = TimeToo()
        self.b_heads = self.initHeads()
        self.b_mysqlStr = self.initMysqlStr()

    def initMysqlStr(self):
        """Build the SQL strings for the configured environment
        (2 = online, 1 = test, anything else = dev)."""
        if self.b_environmentalType == 2:
            environmental = 'online'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            environmental = 'test'
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            environmental = 'dev'
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId
        return {
            'saveText':
            "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData': "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def initHeads(self):
        """Static browser-like request headers; Referer is filled per link."""
        heads = {}
        heads['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        heads['Accept-Encoding'] = 'gzip, deflate, br'
        heads['Accept-Language'] = 'zh-CN,zh;q=0.9'
        heads['Connection'] = 'keep-alive'
        heads['Cookie'] = 'newstatisticUUID=1547076169_1527614489; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1'
        heads['Host'] = 'www.xs8.cn'
        heads['Upgrade-Insecure-Requests'] = '1'
        heads['Referer'] = ''
        heads['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
        return heads

    def intLogName(self):
        """Timestamped log file name for this run."""
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.txt' % (self.b_title, timeStr)

    def initRds(self):
        """Connect to Redis using the project config."""
        pool = redis.ConnectionPool(
            host=self.con.getConfig('redisConfig', 'host'),
            port=self.con.getConfig('redisConfig', 'port'),
            db=self.con.getConfig('redisConfig', 'db'))
        return redis.StrictRedis(connection_pool=pool)

    def second(self):
        # Politeness delay between requests.
        time.sleep(self.b_second)

    # 2. Fetch the list of book ids from MySQL.
    def getBookData(self):
        bookList = []
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        for item in bookData:
            bookList.append(item[0])
        return bookList

    def getCatalogData(self, bookId, index):
        """Query chapter urls for one group of book ids and append the
        resulting list to b_catalogList. (`index` is unused but kept for the
        threads() callback signature.)"""
        catalogList = []
        sql = '%s %s' % (self.b_mysqlStr['getCatalogData'],
                         self.dataToo.listToStr(bookId))
        self.logger.info('查询小说章节 [ %s ]...\n' % (sql))
        catalogData = self.mySql.getListData(sql=sql)
        for item in catalogData:
            catalogList.append(item[0])
        self.b_catalogList.append(catalogList)

    # 4. Split books into id groups and fetch their catalogs in threads.
    def setCatalogList(self, bookGroupingData):
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList 没有数据\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            self.dataToo.threads(listTaskList[i], self.getCatalogData)

    def getArticle(self, link, group, bookCatalogUrlGroupingData, ngroup,
                   nindex):
        """Download one chapter page, extract the article node and store it."""
        bkd = bookCatalogUrlGroupingData
        self.b_heads['Referer'] = link
        self.logger.info(
            '已采集 [ %s ] 书籍组 [ %s / %s ] 目录组 [ %s / %s ] 文章组 [ %s / %s ] 链接 [ %s ] %s 秒后开始抓取'
            % (self.countNum, group + 1, len(self.b_catalogList), ngroup + 1,
               bkd['listGroupSize'], nindex + 1, bkd['listTaskSize'], link,
               self.b_second))
        self.second()
        text = self.dataToo.getText(link=link, heads=self.b_heads)
        if len(text['data']) <= 0:
            self.errorUrl.append(link)
            self.countNum += 1
            self.logger.debug('第 %s 条链接:数据抓取异常 :%s\n' % (self.countNum, text))
            return
        html = etree.HTML(text['data'])
        content_list = html.xpath('//div[@class="read-content j_readContent"]')
        if len(content_list) <= 0:
            # Page had no article node: distinguish anti-crawler interception
            # from a plain 404 and back off accordingly.
            self.countNum += 1
            title = html.xpath('//title/text()')
            requestIntercept = html.xpath(
                '//div[@class="empty-text"]//strong/text()')
            request404 = html.xpath('//h3[@class="lang"]/text()')
            self.logger.debug('第 %s 条链接:HTML解析异常!' % (self.countNum))
            self.logger.debug('第 %s 条链接[title]:%s' % (self.countNum, title))
            if len(requestIntercept) > 0:
                self.errorUrl.append(link)
                second = self.b_second * 180
                self.logger.debug(
                    '第 %s 条链接[requestIntercept]:%s 被拦截了暂停 %s 秒后 抓取下一条链接 '
                    % (self.countNum, requestIntercept, second))
                time.sleep(second)
            if len(request404) > 0:
                self.request404.append(link)
                self.logger.debug('第 %s 条链接[request404]:%s' %
                                  (self.countNum, request404))
            self.logger.debug('第 %s 条链接[text]:%s\n' % (self.countNum, text))
            return
        content_list = content_list[0]
        content = etree.tostring(content_list, method='xml').decode('utf-8')
        res = self.mySql.batchAdd(sql=self.b_mysqlStr['saveText'],
                                  data_info=[(link, content)])
        # Fixed: the original appended to errorUrl when `res` was truthy,
        # i.e. on a SUCCESSFUL save; only failed saves belong there.
        if not res:
            self.errorUrl.append(link)
        self.countNum += 1
        self.logger.debug('第 %s 条链接: %s\n' % (self.countNum, res))
        # self.b_bookTXTData.append((link, content))

    # 6. Crawl all chapter urls for one book group, in fixed-size batches.
    def getBookTXT(self, catalogList, index):
        if len(catalogList) <= 0:
            self.logger.debug('书籍组 [ %s / %s ] :getBookTXT 没有数据\n' %
                              (index + 1, len(self.b_catalogList)))
            return
        bookCatalogUrlGroupingData = self.dataToo.groupingData(
            list=catalogList, pageSize=self.b_bookTXTGroupSize, fixed=True)
        listTaskList = bookCatalogUrlGroupingData['listTaskList']
        for i in range(bookCatalogUrlGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            start = time.time()
            for j in range(len(listTaskList[i])):
                self.second()
                self.getArticle(listTaskList[i][j], index,
                                bookCatalogUrlGroupingData, i, j)
            end = time.time()
            self.logger.debug(
                '书籍组 [ %s / %s ] 目录组 [ %s / %s ] : 开始时间:%s : 结束时间:%s ==> 共消耗时间 :%s 秒 [ %s ]\n'
                % (index + 1, len(self.b_catalogList), i + 1,
                   bookCatalogUrlGroupingData['listGroupSize'], float(start),
                   float(end), int(float(end) - float(start)),
                   self.timeToo.changeTime(int(float(end) - float(start)))))

    def saveText(self):
        """Walk every collected catalog group and crawl its chapters."""
        for i in range(len(self.b_catalogList)):
            if len(self.b_catalogList[i]) <= 0:
                self.logger.debug('书籍组 [ %s / %s ] saveText 没有数据\n' %
                                  (i + 1, len(self.b_catalogList)))
                continue
            start = time.time()
            self.getBookTXT(self.b_catalogList[i], i)
            end = time.time()
            self.logger.debug(
                '书籍组 [ %s / %s ] : 开始时间:%s : 结束时间:%s ==> 共消耗时间 :%s 秒 [ %s ]\n'
                % (i + 1, len(self.b_catalogList), float(start), float(end),
                   int(float(end) - float(start)),
                   self.timeToo.changeTime(int(float(end) - float(start)))))
            self.logger.info('*-*-*-*-*-*-' * 15)
            # res = mySql.batchAdd(sql=self.b_mysqlStr['saveText'], data_info=self.b_bookTXTData)
            # if res:
            self.b_bookTXTData = []

    # Entry point: fetch book ids, log the crawl plan, collect catalogs and
    # crawl every chapter; finish with a summary.
    def bookTxtLoad(self):
        start = time.time()
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad 没有数据\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)
        self.logger.info('========' * 15)
        self.logger.info("\t时间: %s" %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info("\t网站:%s" % (self.con.getConfig('webConfig', 'host')))
        self.logger.info("\t\t\t本次将采集 %s 本小说。\n" %
                         (bookGroupingData['listSize']))
        self.logger.info(
            '\t\t\t%s 本小说,共分为 %s 个组,每组 %s 本小说。 \n' %
            (bookGroupingData['listSize'], bookGroupingData['listGroupSize'],
             bookGroupingData['listTaskSize']))
        self.logger.info(
            '\t\t\t采集时间预算:共 %s 组,每组采集间隔 %s 秒,每组 %s 本小说,每本小说 预计 %s 秒,每组预计 %s 秒,总计 %s 秒 [ %s ]\n'
            % (bookGroupingData['listGroupSize'], self.b_second,
               bookGroupingData['listTaskSize'], self.b_second + 10,
               self.b_second +
               (self.b_second + 10) * bookGroupingData['listTaskSize'],
               (self.b_second + (bookGroupingData['listTaskSize'] *
                                 (self.b_second + 10))) *
               bookGroupingData['listGroupSize'],
               self.timeToo.changeTime(
                   (self.b_second + (bookGroupingData['listTaskSize'] *
                                     (self.b_second + 10))) *
                   bookGroupingData['listGroupSize'])))
        self.logger.info('========' * 15)
        self.setCatalogList(bookGroupingData)
        self.saveText()
        end = time.time()
        self.logger.info('---' * 30)
        self.logger.info('\t\t时间 :%s' %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info('\t\t消耗 时间 :%s 秒 [ %s ]' %
                         (float(end) - float(start),
                          self.timeToo.changeTime(float(end) - float(start))))
        self.logger.info('\t\t采集 链接 : %s 条' % (self.countNum))
        self.logger.info('\t\t采集 失败链接 : %s 条' % (len(self.errorUrl)))
        self.logger.info('\t\t请求 失败链接 : %s 条' % (len(self.request404)))
        # Fixed: the next two format strings had no %s placeholder, so
        # applying `%` to a non-empty list raised TypeError ("not all
        # arguments converted") at the end of every run.
        self.logger.info('\t\t采集 失败链接 :\n\t\t\t%s' % (self.errorUrl))
        self.logger.info('\t\t请求 失败链接 :\n\t\t\t%s' % (self.request404))
# NOTE(review): whitespace-mangled fragment — the tail of a commented-out log call, a stray `self.setCatalogList(...)` statement (its enclosing method is not visible here), and a `__main__` script that appears truncated in this chunk; left byte-identical pending recovery of the original layout.
# bookGroupingData['listGroupSize'], # self.b_second, bookGroupingData['listTaskSize'], # self.b_second + 10, # self.b_second + (self.b_second + 10) * bookGroupingData['listTaskSize'], # (self.b_second + (bookGroupingData['listTaskSize'] # * (self.b_second + 10))) * bookGroupingData['listGroupSize'], # timeToo.changeTime(((self.b_second + (bookGroupingData['listTaskSize'] * (self.b_second + 10)))) # * bookGroupingData['listGroupSize']))) # logger.info('========' * 15) self.setCatalogList(bookGroupingData) # self.saveText() if __name__ == '__main__': start = time.time() con = ConfigParser() r = RedisToo() maxBookNex = 0 maxCatalogNex = 1 bookPageSize = 10 environmentalType = 1 bookIdSize = 5 bookTXTGroupSize = 1 second = 1 timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss') logName = 'getBookTXT_%s.txt' % (timeStr) logger = Logger(logname=logName, loglevel=1, logger="getBookTXT").getlog() if environmentalType == 2: environmental = 'online'
class GetBookInfoTool():
    """Scrape book lists, catalogs and chapter texts from the target site."""

    def __init__(self, second, dataToo, logger):
        self.b_second = second
        # Failure accumulators.
        self.errorUrl = []
        self.request404 = []
        self.bookInfoList = []
        # Progress counters per scraped entity kind.
        self.bookCountNum = 0
        self.freeBookCountNum = 0
        self.bookCatalogCountNum = 0
        self.bookTxtCountNum = 0
        self.con = ConfigParser()
        self.dataToo = dataToo
        self.logger = logger

    def exceptions(self, html, link, text):
        """Log why a page had no usable content (interception / 404 / empty)
        and back off when the anti-crawler page was served."""
        title = html.xpath('//title/text()')
        requestIntercept = html.xpath(
            '//div[@class="empty-text"]//strong/text()')
        request404 = html.xpath('//h3[@class="lang"]/text()')
        requestNode = html.xpath('//div[@class="no-data"]/h3/text()')
        self.logger.debug('链接 [ %s ] :HTML解析异常!' % (link))
        self.logger.debug('[title]:%s' % (title))
        if len(requestIntercept) > 0:
            self.errorUrl.append(link)
            second = self.b_second * 180
            self.logger.debug('[requestIntercept]:%s 被拦截了暂停 %s 秒后 抓取下一条链接 '
                              % (requestIntercept, second))
            time.sleep(second)
        if len(requestNode) > 0:
            self.errorUrl.append(link)
            self.logger.debug('[requestNode]:%s' % (requestNode))
        if len(request404) > 0:
            self.request404.append(link)
            self.logger.debug('[request404]:%s' % (request404))
        self.logger.debug('[text]:%s\n' % (text))

    def noData(self, link, text):
        """Record a link whose fetch returned no data."""
        self.errorUrl.append(link)
        self.logger.debug('链接 [ %s ] :数据抓取异常 :%s\n' % (link, text))

    def getContentList(self, link, xpath):
        """Fetch `link` and return the nodes matching `xpath` ([] on any
        failure, with the reason logged)."""
        contentList = []
        text = self.dataToo.getText(link=link)
        if len(text['data']) <= 0:
            self.noData(link, text)
            return contentList
        html = etree.HTML(text['data'])
        contentList = html.xpath(xpath)
        if len(contentList) <= 0:
            self.exceptions(html, link, text)
        return contentList

    # Scrape one page of the "all books" search listing.
    def toAllBookListPageGetBookList(self, link):
        time.sleep(self.b_second)
        content_list = self.getContentList(
            link=link, xpath='//div[@class="right-book-list"]//li')
        if len(content_list) <= 0:
            self.bookCountNum += 1
            return
        bookInfoList = []
        for item in content_list:
            book_Id = item.xpath(
                './/div[@class="book-info"]/h3/a/@href')[0][6:]
            book_name = item.xpath('.//div[@class="book-info"]/h3/a/text()')[0]
            author = item.xpath('.//div[@class="book-info"]/h4/a/text()')[0]
            tag = item.xpath(
                './/div[@class="book-info"]/p[@class="tag"]/span/text()')
            synoptic = item.xpath(
                './/div[@class="book-info"]/p[@class="intro"]/text()')[0]
            img_url = item.xpath('.//div[@class="book-img"]/a/img/@src')[0]
            chan_name = tag[0]
            state = tag[1]
            bookInfoList.append({
                'book_Id': book_Id,
                'book_name': book_name,
                'state': state,
                'author': author,
                'chan_name': chan_name,
                'synoptic': str(synoptic),
                'img_url': img_url
            })
        self.logger.info('书籍列 [ %s ] 表信息采集完成:%s' % (link, bookInfoList))
        # NOTE(review): bumps freeBookCountNum rather than bookCountNum on
        # success — possibly a copy/paste slip; preserved as-is.
        self.freeBookCountNum += 1
        return bookInfoList

    # Scrape the free-book listing page.
    def toFreeBookListPageGetBookList(self, freeBookListPage):
        link = freeBookListPage
        content_list = self.getContentList(
            link=link, xpath='//*[@id="limit-list"]/div/ul/li')
        if len(content_list) <= 0:
            self.freeBookCountNum += 1
            return
        bookInfoList = []
        for item in content_list:
            book_Id = item.xpath(
                './/div[@class="book-mid-info"]/h4/a/@href')[0][6:]
            book_name = item.xpath(
                './/div[@class="book-mid-info"]/h4/a/text()')[0]
            author = item.xpath(
                './/p[@class="author"]/a[contains(concat("",@class,"name"),"")]/text()'
            )[0]
            chan_name = item.xpath('.//p[@class="author"]/a[2]/text()')[0]
            state = item.xpath('.//p[@class="author"]/span/text()')[0]
            img_url = item.xpath('.//div[@class="book-img-box"]/a/img/@src')[0]
            synopticHtml = item.xpath(
                './/div[@class="book-mid-info"]//p[@class="intro"]')[0]
            synoptic = etree.tostring(synopticHtml,
                                      method='xml').decode('utf-8')
            bookInfoList.append({
                'book_Id': book_Id,
                'book_name': book_name,
                'author': author,
                'chan_name': chan_name,
                'state': state,
                'img_url': img_url,
                'synoptic': synoptic
            })
        self.logger.info('免费书籍列表【 %s 】信息采集完成:%s' % (link, bookInfoList))
        self.freeBookCountNum += 1
        return bookInfoList

    def bookLinkLoad(self, bookId):
        """Build the ajax catalog url with a unix-timestamp cache-buster."""
        day = datetime.now()
        unix = datetime.timestamp(day)
        return self.con.getConfig(
            'webConfig', 'host'
        ) + '/ajax/chapter/userChapterList?_csrfToken=&bookId=' + str(
            bookId) + '&_=' + str(int(unix))

    def getCatalogInfo(self, bookId):
        """Fetch the catalog json for one book; returns [] when it is empty."""
        link = self.bookLinkLoad(bookId)
        time.sleep(self.b_second)
        jsonData = self.dataToo.getJson(link=link)
        if len(jsonData['data']['data']) <= 0:
            self.bookCatalogCountNum += 1
            self.noData(link, jsonData)
            return []
        self.logger.info('书籍【 %s 】目录信息采集完成:%s' % (link, jsonData['data']))
        self.bookCatalogCountNum += 1
        return jsonData['data']

    def getTxtInfo(self, link):
        """Fetch one chapter page and return its article html ('' on failure)."""
        content = ''
        time.sleep(self.b_second)
        content_list = self.getContentList(
            link=link, xpath='//div[@class="read-content j_readContent"]')
        if len(content_list) <= 0:
            self.freeBookCountNum += 1
            return content
        content_list = content_list[0]
        content = etree.tostring(content_list, method='xml').decode('utf-8')
        self.logger.info('书籍【 %s 】章节信息采集完成' % (link))
        self.bookTxtCountNum += 1
        return content
class SaveBookToRedis():
    """Read book ids from MySQL, group them, and push the groups into Redis."""

    def __init__(self, environmentalType, maxBookNex):
        # Grouping sizes for paging books / book-ids / text chunks.
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'SaveBookToRedis'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        # Accumulators for results and failures.
        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0
        # Project-local collaborators.
        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.rds = RedisToo()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.b_title, second=self.b_second)
        self.logger = Logger(logname=self.intLogName(), loglevel=1,
                             logger=self.b_title).getlog()
        self.timeToo = TimeToo()
        self.b_mysqlStr = self.initMysqlStr()

    def intLogName(self):
        """Timestamped log file name for this run."""
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.log' % (self.b_title, timeStr)

    def initMysqlStr(self):
        """Pick the SQL per environment (2 = online, 1 = test, other = dev)
        and shrink the grouping sizes for this job."""
        if self.b_environmentalType == 2:
            environmental = 'online'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            environmental = 'test'
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            environmental = 'dev'
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId
        # NOTE(review): in the whitespace-mangled original it is unclear
        # whether these three overrides belonged to the dev branch only;
        # they are kept at method level here — confirm against history.
        self.b_bookPageSize = 2
        self.b_bookIdSize = 2
        self.b_bookTXTGroupSize = 1
        return {
            'saveText':
            "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData': "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def second(self):
        # Politeness delay between requests.
        time.sleep(self.b_second)

    def getBookData(self):
        """Fetch the list of book ids from MySQL."""
        bookList = []
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        for row in bookData:
            bookList.append(row[0])
        return bookList

    def setCatalogList(self, bookGroupingData):
        """Join each id group into a CSV string and push the batch onto the
        'bookIdsList' Redis list (10s pause between groups)."""
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList 没有数据\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            time.sleep(10)
            if len(listTaskList[i]) <= 0:
                continue
            data = []
            for item in listTaskList[i]:
                data.append(','.join(item))
            self.rds.setListData('bookIdsList', data)

    def bookTxtLoad(self):
        """Entry point: load ids, push groups to Redis, log a summary."""
        start = time.time()
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad 没有数据\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)
        self.setCatalogList(bookGroupingData)
        end = time.time()
        self.logger.info('========' * 15)
        self.logger.info("startTime: %s" %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info("webHost:%s" %
                         (self.con.getConfig('webConfig', 'host')))
        self.logger.info("author:%s" %
                         (self.con.getConfig('webConfig', 'author')))
        self.logger.info("email:%s" %
                         (self.con.getConfig('webConfig', 'email')))
        self.logger.info(
            '本次将采集 [ %s ] 本小说,共分为 %s 个组,每组 %s 本小说。' %
            (bookGroupingData['listSize'], bookGroupingData['listGroupSize'],
             bookGroupingData['listTaskSize']))
        self.logger.info(
            'saveBooksToRedis [ %s ] 组 小说,消耗时间:%s 秒 [ %s ]' %
            (bookGroupingData['listGroupSize'], float(end) - float(start),
             self.timeToo.changeTime(float(end) - float(start))))
        self.logger.info('========' * 15)
class getFreeBookTXT(object):
    """Crawl the free-book listing, then save each book's chapter links and
    catalog through the MySQL helper."""

    def __init__(self):
        # self.b_getBookIdsListSize = int(getBookIdsListSize)
        # self.b_rdsKeyName = rdsKeyName
        self.b_title = 'getFreeBookTXT'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.dataToo = DataTool(logName=self.b_title, second=self.b_second,
                                timeStr=self.b_timeStr)
        self.mySql = MySqlTool(logName=self.dataToo.initLogName())
        self.logger = Logger(logname=self.dataToo.initLogName(), loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = RedisTool()
        self.getBookInfoToo = GetBookInfoTool(second=self.b_second,
                                              dataToo=self.dataToo,
                                              logger=self.logger)
        self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
            second=self.b_second,
            logger=self.logger,
            getBookInfoToo=self.getBookInfoToo,
            mySql=self.mySql,
            dataToo=self.dataToo)
        self.con = ConfigParser()

    def getFreeBookLink(self):
        """Return the scraped list of free books (dicts with 'book_Id' etc.)."""
        bookInfoList = self.getBookInfoToo.toFreeBookListPageGetBookList(
            freeBookListPage=self.con.getConfig('webConfig',
                                                'freeBookListPage'))
        self.logger.debug(bookInfoList)
        return bookInfoList

    def formatCatalogInfo(self, data):
        """Build the page url for every chapter in the catalog json and save
        it via the MySQL helper. Returns an (always empty) links list, as in
        the original implementation."""
        catalogData = data['vs']
        links = []
        for volume in catalogData:
            for chapter in volume['cs']:
                url = self.con.getConfig(
                    'webConfig',
                    'host') + '/chapter/' + data['bookId'] + '/' + chapter['id']
                self.saveBookInfoToMySqlToo.saveText(link=str(url))
        return links

    def contentsLoad(self):
        """Entry point: walk every free book and persist its chapter urls
        and catalog."""
        links = self.getFreeBookLink()
        if len(links) <= 0:
            self.logger.debug('getFreeBookLink 没有数据\n')
            return
        for item in links:
            time.sleep(self.b_second)
            jsonData = self.getBookInfoToo.getCatalogInfo(
                bookId=item['book_Id'])
            self.logger.debug(jsonData)
            catalogData = self.formatCatalogInfo(data=jsonData['data'])
            self.logger.debug(catalogData)
            self.saveBookInfoToMySqlToo.saveCatalog(
                bookId=jsonData['data']['bookId'])