コード例 #1
0
    def __init__(self, second, environmentalType, maxBookNex):
        """Store the crawl settings and build the helper objects.

        Args:
            second: Converted with ``int`` and stored as ``b_second``.
            environmentalType: Converted with ``int`` and stored as
                ``b_environmentalType``.
            maxBookNex: Converted with ``int`` and stored as
                ``b_maxBookNex``.
        """
        # Caller-supplied settings (converted in the same order as before).
        self.b_second = int(second)
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)

        # Fixed settings.
        self.b_title = 'getBookTXT'
        self.b_bookPageSize, self.b_bookIdSize = 10, 5
        self.b_bookTXTGroupSize = 100

        # Run-time accumulators; every list is a distinct object.
        self.countNum = 0
        self.b_catalogList, self.b_bookTXTData = [], []
        self.errorUrl, self.request404 = [], []

        # Helper objects. They are created after the plain attributes above
        # because the init* helpers are called on this instance.
        self.con = ConfigParser()
        log_name = self.intLogName()
        self.logName = log_name
        self.mySql = MySqlToo(logName=log_name)
        self.dataToo = DataToo(logName=log_name, second=self.b_second)
        log_factory = Logger(logname=log_name,
                             loglevel=1,
                             logger=self.b_title)
        self.logger = log_factory.getlog()
        self.rds = self.initRds()
        self.timeToo = TimeToo()
        self.b_heads = self.initHeads()
        self.b_mysqlStr = self.initMysqlStr()
コード例 #2
0
ファイル: getBookTXT.py プロジェクト: 520wsl/python-test
    def __init__(self, maxCatalogNex, getBookIdsListSize):
        """Store the crawl settings and build the helper objects.

        Args:
            maxCatalogNex: Converted with ``int`` and stored as
                ``b_maxCatalogNex``.
            getBookIdsListSize: Converted with ``int`` and stored as
                ``b_getBookIdsListSize``.
        """
        # Caller-supplied settings (converted in the same order as before).
        self.b_getBookIdsListSize = int(getBookIdsListSize)
        self.b_maxCatalogNex = int(maxCatalogNex)

        # Fixed settings.
        self.b_title = 'getBookTXT'
        self.b_second = 1
        self.b_fs = 0
        self.b_bookPageSize, self.b_bookIdSize = 10, 5
        self.b_bookTXTGroupSize = 10
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')

        # Run-time accumulators; every list is a distinct object.
        self.countNum = 0
        self.b_catalogList, self.b_bookTXTData = [], []
        self.errorUrl, self.request404 = [], []

        # Helper objects.
        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.mySql = MySqlToo(logName=self.logName)
        tool_options = {
            'logName': self.b_title,
            'second': self.b_second,
            'timeStr': self.b_timeStr,
        }
        self.dataToo = DataToo(**tool_options)
        # The logger file name comes from the data tool, not from logName.
        log_factory = Logger(logname=self.dataToo.initLogName(),
                             loglevel=1,
                             logger=self.b_title)
        self.logger = log_factory.getlog()
        self.rds = RedisToo()
        self.timeToo = TimeToo()
        self.b_heads = self.initHeads()
        self.b_mysqlStr = self.initMysqlStr()
コード例 #3
0
    def __init__(self, environmentalType, maxBookNex):
        """Store the settings and build the helper objects.

        Args:
            environmentalType: Converted with ``int`` and stored as
                ``b_environmentalType``.
            maxBookNex: Converted with ``int`` and stored as
                ``b_maxBookNex``.
        """
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'SaveBookToRedis'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')

        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0

        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.rds = RedisToo()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.b_title, second=self.b_second)
        # Reuse the name computed above instead of calling intLogName() a
        # second time, so this logger and the MySqlToo logger share one name.
        # NOTE(review): intLogName() is not visible in this snippet; it
        # presumably embeds the current time, so a second call could differ.
        self.logger = Logger(logname=self.logName,
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.timeToo = TimeToo()
        self.b_mysqlStr = self.initMysqlStr()
コード例 #4
0
 def __init__(self):
     """Read the ``redisConfig`` settings and create the Redis client."""
     self.con = ConfigParser()
     self.links = []
     # host, port and db are looked up in that order, as before.
     pool_options = {
         key: self.con.getConfig('redisConfig', key)
         for key in ('host', 'port', 'db')
     }
     self.pool = redis.ConnectionPool(**pool_options)
     self.r = redis.Redis(connection_pool=self.pool)
コード例 #5
0
class RedisToo():
    """Small helper around a Redis client configured from ``redisConfig``."""

    def __init__(self):
        """Read host, port and db from the config and create the client."""
        self.con = ConfigParser()
        self.links = []
        self.pool = redis.ConnectionPool(host=self.con.getConfig('redisConfig', 'host'),
                                         port=self.con.getConfig('redisConfig', 'port'),
                                         db=self.con.getConfig('redisConfig', 'db'))
        self.r = redis.Redis(connection_pool=self.pool)

    # Fetch and remove elements from the head of a list.
    def getListData(self, name="list_name1", num=1):
        """Pop up to ``num`` values from the left of list ``name``.

        Args:
            name: Redis key of the list.
            num: Number of ``LPOP`` attempts; converted with ``int``.

        Returns:
            The popped values decoded with ``bytes_to_str(..., 'utf-8')``.
            Attempts that return ``None`` are skipped, so the result may
            hold fewer than ``num`` items.
        """
        dataList = []
        for _ in range(int(num)):
            data = self.r.lpop(name)
            if data is not None:
                dataList.append(bytes_to_str(data, 'utf-8'))
        return dataList

    # Append several values to a list in one call.
    def setListData(self, name='list_name1', lists=None):
        """Push every item of ``lists`` onto the right of list ``name``.

        Args:
            name: Redis key of the list.
            lists: Values to push. ``None`` or an empty sequence pushes
                nothing.

        Returns:
            ``False`` when there was nothing to push, ``True`` otherwise.
        """
        # A None default avoids sharing one list object between calls.
        if not lists:
            return False
        self.r.rpush(name, *lists)
        return True
コード例 #6
0
ファイル: MySqlToo.py プロジェクト: 520wsl/python-test
 def __init__(self, logName):
     """Create the config reader and logger, then log the MySQL settings.

     Args:
         logName: Log name passed to ``Logger`` as ``logname``.
     """
     self.con = ConfigParser()
     self.logger = Logger(logname=logName, loglevel=1,
                          logger="MySQLToo").getlog()
     host = self.con.getConfig('mysql', 'host')
     user = self.con.getConfig('mysql', 'user')
     database = self.con.getConfig('mysql', 'database')
     # The password is masked: credentials must not end up in log files.
     self.logger.info(
         "\n\t mySqlConfig:\n\t\t host: %s\n\t\t user : %s\n\t\t password : %s\n\t\t database : %s "
         % (host, user, '******', database))
コード例 #7
0
    def __init__(self, second, dataToo, logger):
        """Keep the injected helpers and reset all result accumulators.

        Args:
            second: Stored unchanged as ``b_second``.
            dataToo: Stored unchanged as ``dataToo``.
            logger: Stored unchanged as ``logger``.
        """
        # Injected values.
        self.b_second = second
        self.dataToo = dataToo
        self.logger = logger

        # Result lists; each one is a distinct object.
        self.errorUrl, self.request404, self.bookInfoList = [], [], []

        # Counters all start at zero.
        self.bookCountNum = self.freeBookCountNum = 0
        self.bookCatalogCountNum = self.bookTxtCountNum = 0

        self.con = ConfigParser()
コード例 #8
0
 def __init__(self):
     """Build the data, MySQL, logging, Redis and scraping helpers."""
     title = 'getFreeBookTXT'
     delay = 1
     stamp = moment.now().format('YYYY-MM-DD-HH-mm-ss')
     self.b_title, self.b_second, self.b_timeStr = title, delay, stamp

     self.dataToo = DataTool(logName=title, second=delay, timeStr=stamp)
     # initLogName() is called once for each consumer, as before.
     self.mySql = MySqlTool(logName=self.dataToo.initLogName())
     log_factory = Logger(logname=self.dataToo.initLogName(),
                          loglevel=1,
                          logger=title)
     self.logger = log_factory.getlog()
     self.rds = RedisTool()

     # The scraper is created first because the saver receives it.
     scraper = GetBookInfoTool(second=delay,
                               dataToo=self.dataToo,
                               logger=self.logger)
     self.getBookInfoToo = scraper
     self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
         second=delay,
         logger=self.logger,
         getBookInfoToo=scraper,
         mySql=self.mySql,
         dataToo=self.dataToo)
     self.con = ConfigParser()
コード例 #9
0
ファイル: MySqlToo.py プロジェクト: 520wsl/python-test
class MySqlToo():
    """Helper that opens a new MySQL connection for every operation."""

    def __init__(self, logName):
        """Create the config reader and logger, then log the MySQL settings.

        Args:
            logName: Log name passed to ``Logger`` as ``logname``.
        """
        self.con = ConfigParser()
        self.logger = Logger(logname=logName, loglevel=1,
                             logger="MySQLToo").getlog()
        host = self.con.getConfig('mysql', 'host')
        user = self.con.getConfig('mysql', 'user')
        database = self.con.getConfig('mysql', 'database')
        # The password is masked: credentials must not end up in log files.
        self.logger.info(
            "\n\t mySqlConfig:\n\t\t host: %s\n\t\t user : %s\n\t\t password : %s\n\t\t database : %s "
            % (host, user, '******', database))

    # Database connection.
    def openMySqlConfig(self):
        """Open and return a new connection built from the ``mysql`` config.

        Keyword arguments are used because PyMySQL 1.0+ no longer accepts
        positional arguments in ``connect()``.
        """
        return pymysql.connect(host=self.con.getConfig('mysql', 'host'),
                               user=self.con.getConfig('mysql', 'user'),
                               password=self.con.getConfig('mysql', 'password'),
                               database=self.con.getConfig('mysql', 'database'))

    # Insert several rows at once.
    def batchAdd(self, sql, data_info):
        """Run ``sql`` once per row of ``data_info`` and commit.

        Args:
            sql: Statement with placeholders, passed to ``executemany``.
            data_info: Sequence of parameter tuples.

        Returns:
            ``True`` after a successful commit. ``False`` when execution or
            commit raised; the transaction is rolled back in that case.
        """
        db = self.openMySqlConfig()
        try:
            cursor = db.cursor()
            try:
                cursor.executemany(sql, data_info)
                db.commit()
            except Exception as error:
                # Roll back on any database error.
                db.rollback()
                self.logger.debug('存储失败:[ sql ] %s ' % (str(sql)))
                self.logger.debug('存储失败:[ data_info ] %s ' % (str(data_info)))
                self.logger.debug('存储失败:[ error ] %s ' % (str(error)))
                return False
            finally:
                cursor.close()
        finally:
            # Runs on every path, so the connection is never leaked.
            db.close()
        self.logger.info('存储成功')
        return True

    # Fetch a list of rows.
    def getListData(self, sql, params=None):
        """Run a query and return all rows.

        Args:
            sql: Query text.
            params: Optional parameters forwarded to ``cursor.execute`` so
                callers can use placeholders instead of building SQL by
                hand. ``None`` executes ``sql`` as given.

        Returns:
            The result of ``cursor.fetchall()``, or an empty list when the
            query raised.
        """
        db = self.openMySqlConfig()
        results = []
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                results = cursor.fetchall()
                self.logger.debug("查询成功[ %s ]: sql==> %s" % (len(results), sql))
            except Exception as error:
                self.logger.debug("查询失败: sql==> %s" % (sql))
                self.logger.debug("查询失败: error==> %s" % (str(error)))
            finally:
                cursor.close()
        finally:
            # Close the connection on every path.
            db.close()
        return results
コード例 #10
0
class BookTXTLoad(object):
    """Read book ids from MySQL, fetch each chapter page and store its text.

    ``bookTxtLoad`` is the entry point: it loads the book ids, collects the
    chapter URLs per group of books (``setCatalogList``) and then downloads
    and saves every chapter (``saveText``).
    """

    def __init__(self, second, environmentalType, maxBookNex):
        """Store the crawl settings and build the helper objects.

        Args:
            second: Delay in seconds used by ``second()``; converted with
                ``int``.
            environmentalType: Selects the book-id query in
                ``initMysqlStr`` (2, 1, or anything else); converted with
                ``int``.
            maxBookNex: Lower bound for ``nex`` in the book-id query;
                converted with ``int``.
        """
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_second = int(second)
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'getBookTXT'

        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0

        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.logName, second=self.b_second)
        self.logger = Logger(logname=self.logName,
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = self.initRds()
        self.timeToo = TimeToo()
        self.b_heads = self.initHeads()
        self.b_mysqlStr = self.initMysqlStr()

    def initMysqlStr(self):
        """Return the SQL statements used by this crawler.

        The book-id query depends on ``b_environmentalType``:
        2 selects every book above ``b_maxBookNex``, 1 adds a fixed
        ``limit`` clause, and any other value selects two fixed book ids.
        """
        if self.b_environmentalType == 2:
            # "online"
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            # "test"
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            # "dev"
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId

        return {
            'saveText':
            "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData':
            "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def initHeads(self):
        """Return the HTTP request headers sent with every chapter request."""
        return {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie':
            'newstatisticUUID=1547076169_1527614489; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
            'Host': 'www.xs8.cn',
            'Upgrade-Insecure-Requests': '1',
            'Referer': '',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        }

    def intLogName(self):
        """Return ``<title>_<current time>.txt`` as the log file name."""
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.txt' % (self.b_title, timeStr)

    def initRds(self):
        """Create a Redis client from the ``redisConfig`` settings."""
        pool = redis.ConnectionPool(
            host=self.con.getConfig('redisConfig', 'host'),
            port=self.con.getConfig('redisConfig', 'port'),
            db=self.con.getConfig('redisConfig', 'db'))
        return redis.StrictRedis(connection_pool=pool)

    def second(self):
        """Sleep for ``b_second`` seconds."""
        time.sleep(self.b_second)

    # Step 2: load the book ids through the MySQL helper.
    def getBookData(self):
        """Return the first column of every row of the book-id query."""
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        return [item[0] for item in bookData]

    def getCatalogData(self, bookId, index):
        """Load the chapter URLs for a group of book ids.

        The URLs are appended to ``b_catalogList`` as one list.

        Args:
            bookId: Group of book ids, rendered by ``dataToo.listToStr``
                into the ``in`` clause.
            index: Unused; kept for the signature that
                ``dataToo.threads`` is given.
        """
        sql = '%s %s' % (self.b_mysqlStr['getCatalogData'],
                         self.dataToo.listToStr(bookId))
        self.logger.info('查询小说章节 [ %s ]...\n' % (sql))
        catalogData = self.mySql.getListData(sql=sql)
        self.b_catalogList.append([item[0] for item in catalogData])

    # Step 4: collect the chapter catalog for every group of books.
    def setCatalogList(self, bookGroupingData):
        """Split the book groups again and load the chapters of each split.

        Args:
            bookGroupingData: Result of ``dataToo.groupingData``; only its
                ``'listTaskList'`` entry is read.
        """
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList 没有数据\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            self.dataToo.threads(listTaskList[i], self.getCatalogData)

    def getArticle(self, link, group, bookCatalogUrlGroupingData, ngroup,
                   nindex):
        """Download one chapter page and store its content in MySQL.

        Every call increments ``countNum``. Links that could not be
        fetched, were intercepted, or could not be saved are appended to
        ``errorUrl``; links answered with the 404 page are appended to
        ``request404``.

        Args:
            link: Chapter URL; also sent as the ``Referer`` header.
            group: Zero-based index of the book group (logging only).
            bookCatalogUrlGroupingData: Grouping result; its
                ``'listGroupSize'`` and ``'listTaskSize'`` are logged.
            ngroup: Zero-based index of the chapter group (logging only).
            nindex: Zero-based index inside the chapter group (logging
                only).
        """
        bkd = bookCatalogUrlGroupingData
        self.b_heads['Referer'] = link
        self.logger.info(
            '已采集 [ %s ] 书籍组 [ %s / %s ] 目录组 [ %s / %s ] 文章组 [ %s / %s ] 链接 [ %s ] %s 秒后开始抓取'
            % (self.countNum, group + 1, len(
                self.b_catalogList), ngroup + 1, bkd['listGroupSize'],
               nindex + 1, bkd['listTaskSize'], link, self.b_second))
        self.second()
        text = self.dataToo.getText(link=link, heads=self.b_heads)
        if len(text['data']) <= 0:
            self.errorUrl.append(link)
            self.countNum += 1
            self.logger.debug('第 %s 条链接:数据抓取异常 :%s\n' % (self.countNum, text))
            return
        html = etree.HTML(text['data'])
        content_list = html.xpath('//div[@class="read-content j_readContent"]')
        if len(content_list) <= 0:
            # No chapter body on the page: log what the page contains.
            self.countNum += 1
            title = html.xpath('//title/text()')
            requestIntercept = html.xpath(
                '//div[@class="empty-text"]//strong/text()')
            request404 = html.xpath('//h3[@class="lang"]/text()')
            self.logger.debug('第 %s 条链接:HTML解析异常!' % (self.countNum))
            self.logger.debug('第 %s 条链接[title]:%s' % (self.countNum, title))
            if len(requestIntercept) > 0:
                # The site intercepted the request: pause before continuing.
                self.errorUrl.append(link)
                second = self.b_second * 180
                self.logger.debug(
                    '第 %s 条链接[requestIntercept]:%s 被拦截了暂停 %s 秒后 抓取下一条链接 ' %
                    (self.countNum, requestIntercept, second))
                time.sleep(second)
            if len(request404) > 0:
                self.request404.append(link)
                self.logger.debug('第 %s 条链接[request404]:%s' %
                                  (self.countNum, request404))
            self.logger.debug('第 %s 条链接[text]:%s\n' % (self.countNum, text))
            return
        content = etree.tostring(content_list[0], method='xml').decode('utf-8')
        res = self.mySql.batchAdd(sql=self.b_mysqlStr['saveText'],
                                  data_info=[(link, content)])
        # batchAdd returns True on success, so only a falsy result is an
        # error. The previous check was inverted and recorded every
        # successfully stored link as failed.
        if not res:
            self.errorUrl.append(link)
        self.countNum += 1
        self.logger.debug('第 %s 条链接: %s\n' % (self.countNum, res))

    # Step 6: fetch the page of every chapter URL.
    def getBookTXT(self, catalogList, index):
        """Download every chapter URL of one book group, in sub-groups.

        Args:
            catalogList: Chapter URLs of the book group.
            index: Zero-based index of the book group (logging only).
        """
        if len(catalogList) <= 0:
            self.logger.debug('书籍组 [ %s / %s ] :getBookTXT 没有数据\n' %
                              (index + 1, len(self.b_catalogList)))
            return
        bookCatalogUrlGroupingData = self.dataToo.groupingData(
            list=catalogList, pageSize=self.b_bookTXTGroupSize, fixed=True)
        listTaskList = bookCatalogUrlGroupingData['listTaskList']
        for i in range(bookCatalogUrlGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            start = time.time()
            for j, link in enumerate(listTaskList[i]):
                self.second()
                self.getArticle(link, index, bookCatalogUrlGroupingData, i, j)
            end = time.time()
            elapsed = int(end - start)
            self.logger.debug(
                '书籍组 [ %s / %s ] 目录组 [ %s / %s ] : 开始时间:%s : 结束时间:%s ==> 共消耗时间 :%s 秒 [ %s ]\n'
                % (index + 1, len(self.b_catalogList), i + 1,
                   bookCatalogUrlGroupingData['listGroupSize'], start, end,
                   elapsed, self.timeToo.changeTime(elapsed)))

    def saveText(self):
        """Download and store the chapters of every collected book group."""
        total = len(self.b_catalogList)
        for i, catalog in enumerate(self.b_catalogList):
            if len(catalog) <= 0:
                self.logger.debug('书籍组 [ %s / %s ] saveText 没有数据\n' %
                                  (i + 1, total))
                continue
            start = time.time()
            self.getBookTXT(catalog, i)
            end = time.time()
            elapsed = int(end - start)
            self.logger.debug(
                '书籍组 [ %s / %s ] : 开始时间:%s : 结束时间:%s ==> 共消耗时间 :%s 秒 [ %s ]\n'
                % (i + 1, total, start, end, elapsed,
                   self.timeToo.changeTime(elapsed)))
            self.logger.info('*-*-*-*-*-*-' * 15)

    # Store the chapter contents.
    def bookTxtLoad(self):
        """Run the whole crawl and log a summary at the end."""
        start = time.time()
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad 没有数据\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)
        groupCount = bookGroupingData['listGroupSize']
        taskSize = bookGroupingData['listTaskSize']

        # Time estimate that is logged before the crawl starts.
        perBook = self.b_second + 10
        perGroup = self.b_second + perBook * taskSize
        estimate = perGroup * groupCount

        self.logger.info('========' * 15)
        self.logger.info("\t时间: %s" %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info("\t网站:%s" % (self.con.getConfig('webConfig', 'host')))
        self.logger.info("\t\t\t本次将采集 %s 本小说。\n" %
                         (bookGroupingData['listSize']))
        self.logger.info(
            '\t\t\t%s 本小说,共分为 %s 个组,每组 %s 本小说。 \n' %
            (bookGroupingData['listSize'], groupCount, taskSize))
        self.logger.info(
            '\t\t\t采集时间预算:共 %s 组,每组采集间隔 %s 秒,每组 %s 本小说,每本小说 预计 %s 秒,每组预计 %s 秒,总计 %s 秒 [ %s ]\n'
            % (groupCount, self.b_second, taskSize, perBook, perGroup,
               estimate, self.timeToo.changeTime(estimate)))
        self.logger.info('========' * 15)
        self.setCatalogList(bookGroupingData)
        self.saveText()

        end = time.time()
        elapsed = end - start

        self.logger.info('---' * 30)
        self.logger.info('\t\t时间              :%s' %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info('\t\t消耗 时间         :%s 秒 [ %s ]' %
                         (elapsed, self.timeToo.changeTime(elapsed)))
        self.logger.info('\t\t采集 链接         : %s 条' % (self.countNum))
        self.logger.info('\t\t采集 失败链接     : %s 条' % (len(self.errorUrl)))
        self.logger.info('\t\t请求 失败链接     : %s 条' % (len(self.request404)))
        # These two templates had no %s, so the lists were never written to
        # the log (a list on the right of % is accepted silently).
        self.logger.info('\t\t采集 失败链接     :\n\t\t\t%s' % (self.errorUrl, ))
        self.logger.info('\t\t请求 失败链接     :\n\t\t\t%s' % (self.request404, ))
コード例 #11
0
ファイル: getTest14.py プロジェクト: 520wsl/python-test
        #     bookGroupingData['listGroupSize'],
        #     self.b_second, bookGroupingData['listTaskSize'],
        #     self.b_second + 10,
        #     self.b_second + (self.b_second + 10) * bookGroupingData['listTaskSize'],
        #     (self.b_second + (bookGroupingData['listTaskSize']
        #                       * (self.b_second + 10))) * bookGroupingData['listGroupSize'],
        #     timeToo.changeTime(((self.b_second + (bookGroupingData['listTaskSize'] * (self.b_second + 10))))
        #                        * bookGroupingData['listGroupSize'])))
        # logger.info('========' * 15)
        self.setCatalogList(bookGroupingData)
        # self.saveText()


if __name__ == '__main__':
    start = time.time()
    con = ConfigParser()
    r = RedisToo()
    maxBookNex = 0
    maxCatalogNex = 1
    bookPageSize = 10
    environmentalType = 1
    bookIdSize = 5
    bookTXTGroupSize = 1
    second = 1

    timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
    logName = 'getBookTXT_%s.txt' % (timeStr)
    logger = Logger(logname=logName, loglevel=1, logger="getBookTXT").getlog()

    if environmentalType == 2:
        environmental = 'online'
コード例 #12
0
class GetBookInfoTool():
    """Scrape book lists, chapter catalogs and chapter text from the site.

    Failed links are collected in ``errorUrl`` / ``request404``; each
    ``*CountNum`` attribute counts the attempts of one kind of request.
    """

    def __init__(self, second, dataToo, logger):
        """Keep the injected helpers and reset all result accumulators.

        Args:
            second: Delay in seconds slept before requests.
            dataToo: Helper that provides ``getText`` and ``getJson``.
            logger: Logger with ``info`` and ``debug`` methods.
        """
        self.b_second = second

        self.errorUrl = []
        self.request404 = []
        self.bookInfoList = []
        self.bookCountNum = 0
        self.freeBookCountNum = 0
        self.bookCatalogCountNum = 0
        self.bookTxtCountNum = 0

        self.con = ConfigParser()
        self.dataToo = dataToo
        self.logger = logger

    def exceptions(self, html, link, text):
        """Log why a page had no expected content and record the link.

        Args:
            html: Parsed page (an ``etree`` element).
            link: URL of the page.
            text: Raw response, logged as-is.
        """
        title = html.xpath('//title/text()')
        requestIntercept = html.xpath(
            '//div[@class="empty-text"]//strong/text()')
        request404 = html.xpath('//h3[@class="lang"]/text()')
        requestNode = html.xpath('//div[@class="no-data"]/h3/text()')
        self.logger.debug('链接 [ %s ] :HTML解析异常!' % (link))
        self.logger.debug('[title]:%s' % (title))

        if requestIntercept:
            # The site intercepted the request: pause before continuing.
            self.errorUrl.append(link)
            second = self.b_second * 180
            self.logger.debug('[requestIntercept]:%s 被拦截了暂停 %s 秒后 抓取下一条链接 ' %
                              (requestIntercept, second))
            time.sleep(second)

        if requestNode:
            self.errorUrl.append(link)
            self.logger.debug('[requestNode]:%s' % (requestNode))

        if request404:
            self.request404.append(link)
            self.logger.debug('[request404]:%s' % (request404))

        self.logger.debug('[text]:%s\n' % (text))

    def noData(self, link, text):
        """Record ``link`` as failed and log the empty response."""
        self.errorUrl.append(link)
        self.logger.debug('链接 [ %s ] :数据抓取异常 :%s\n' % (link, text))

    def getContentList(self, link, xpath):
        """Fetch ``link`` and return the nodes matching ``xpath``.

        Returns:
            The matched nodes, or an empty list when the response was empty
            or nothing matched (the failure is logged and recorded).
        """
        text = self.dataToo.getText(link=link)
        if len(text['data']) <= 0:
            self.noData(link, text)
            return []
        html = etree.HTML(text['data'])
        contentList = html.xpath(xpath)
        if len(contentList) <= 0:
            self.exceptions(html, link, text)
        return contentList

    # Read the book list from a book search/list page.
    def toAllBookListPageGetBookList(self, link):
        """Return the books listed on the page at ``link``.

        Returns:
            A list of dicts with the keys ``book_Id``, ``book_name``,
            ``state``, ``author``, ``chan_name``, ``synoptic`` and
            ``img_url``. An empty list when the page yielded nothing
            (previously ``None``, which broke callers that use ``len``).
        """
        time.sleep(self.b_second)
        content_list = self.getContentList(
            link=link, xpath='//div[@class="right-book-list"]//li')
        # Every attempt is counted on this method's own counter. Successful
        # attempts used to be added to freeBookCountNum by mistake.
        self.bookCountNum += 1
        if len(content_list) <= 0:
            return []
        bookInfoList = []
        for item in content_list:
            # [6:] strips the first six characters of the href.
            book_Id = item.xpath(
                './/div[@class="book-info"]/h3/a/@href')[0][6:]
            book_name = item.xpath('.//div[@class="book-info"]/h3/a/text()')[0]
            author = item.xpath('.//div[@class="book-info"]/h4/a/text()')[0]
            tag = item.xpath(
                './/div[@class="book-info"]/p[@class="tag"]/span/text()')
            synoptic = item.xpath(
                './/div[@class="book-info"]/p[@class="intro"]/text()')[0]
            img_url = item.xpath('.//div[@class="book-img"]/a/img/@src')[0]
            chan_name = tag[0]
            state = tag[1]
            bookInfoList.append({
                'book_Id': book_Id,
                'book_name': book_name,
                'state': state,
                'author': author,
                'chan_name': chan_name,
                'synoptic': str(synoptic),
                'img_url': img_url
            })
        self.logger.info('书籍列 [ %s ] 表信息采集完成:%s' % (link, bookInfoList))
        return bookInfoList

    # Read the book list from the free-books page.
    def toFreeBookListPageGetBookList(self, freeBookListPage):
        """Return the books listed on the free-books page.

        Args:
            freeBookListPage: URL of the page.

        Returns:
            A list of dicts with the keys ``book_Id``, ``book_name``,
            ``author``, ``chan_name``, ``state``, ``img_url`` and
            ``synoptic`` (the intro paragraph serialized as XML). An empty
            list when the page yielded nothing (previously ``None``).
        """
        link = freeBookListPage
        content_list = self.getContentList(
            link=link, xpath='//*[@id="limit-list"]/div/ul/li')
        self.freeBookCountNum += 1
        if len(content_list) <= 0:
            return []
        bookInfoList = []
        for item in content_list:
            # [6:] strips the first six characters of the href.
            book_Id = item.xpath(
                './/div[@class="book-mid-info"]/h4/a/@href')[0][6:]
            book_name = item.xpath(
                './/div[@class="book-mid-info"]/h4/a/text()')[0]
            author = item.xpath(
                './/p[@class="author"]/a[contains(concat("",@class,"name"),"")]/text()'
            )[0]
            chan_name = item.xpath('.//p[@class="author"]/a[2]/text()')[0]
            state = item.xpath('.//p[@class="author"]/span/text()')[0]
            img_url = item.xpath('.//div[@class="book-img-box"]/a/img/@src')[0]
            synopticHtml = item.xpath(
                './/div[@class="book-mid-info"]//p[@class="intro"]')[0]
            synoptic = etree.tostring(synopticHtml,
                                      method='xml').decode('utf-8')
            bookInfoList.append({
                'book_Id': book_Id,
                'book_name': book_name,
                'author': author,
                'chan_name': chan_name,
                'state': state,
                'img_url': img_url,
                'synoptic': synoptic
            })
        self.logger.info('免费书籍列表【 %s 】信息采集完成:%s' % (link, bookInfoList))
        return bookInfoList

    # Build the catalog URL.
    def bookLinkLoad(self, bookId):
        """Return the chapter-list URL for ``bookId``.

        The current Unix time (whole seconds) is appended as the ``_``
        query parameter.
        """
        unix = datetime.timestamp(datetime.now())
        return '%s/ajax/chapter/userChapterList?_csrfToken=&bookId=%s&_=%s' % (
            self.con.getConfig('webConfig', 'host'), bookId, int(unix))

    def getCatalogInfo(self, bookId):
        """Fetch the chapter catalog of ``bookId``.

        Returns:
            The ``'data'`` entry of the JSON response, or an empty list
            when its nested ``'data'`` entry is empty.
        """
        link = self.bookLinkLoad(bookId)
        time.sleep(self.b_second)
        jsonData = self.dataToo.getJson(link=link)
        self.bookCatalogCountNum += 1
        if len(jsonData['data']['data']) <= 0:
            self.noData(link, jsonData)
            return []

        self.logger.info('书籍【 %s 】目录信息采集完成:%s' % (link, jsonData['data']))
        return jsonData['data']

    def getTxtInfo(self, link):
        """Fetch one chapter page and return its content block as XML text.

        Returns:
            The serialized ``read-content`` element, or an empty string
            when the page did not contain one.
        """
        time.sleep(self.b_second)
        content_list = self.getContentList(
            link=link, xpath='//div[@class="read-content j_readContent"]')
        # Chapter requests are counted on bookTxtCountNum on both paths.
        # The failure path used to increment freeBookCountNum by mistake.
        self.bookTxtCountNum += 1
        if len(content_list) <= 0:
            return ''
        content = etree.tostring(content_list[0], method='xml').decode('utf-8')
        self.logger.info('书籍【 %s 】章节信息采集完成' % (link))
        return content
コード例 #13
0
class SaveBookToRedis():
    """Read book ids from MySQL and push them to Redis in groups.

    Each pushed value is one group of book ids joined with commas; the
    values are appended to the Redis list ``bookIdsList``.
    """

    def __init__(self, environmentalType, maxBookNex):
        """Store the settings and build the helper objects.

        Args:
            environmentalType: Selects the book-id query in
                ``initMysqlStr`` (2, 1, or anything else); converted with
                ``int``.
            maxBookNex: Lower bound for ``nex`` in the book-id query;
                converted with ``int``.
        """
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'SaveBookToRedis'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')

        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0

        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.rds = RedisToo()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.b_title, second=self.b_second)
        # Reuse logName: a second intLogName() call could land in the next
        # second and produce a different file name than the MySQL logger.
        self.logger = Logger(logname=self.logName,
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.timeToo = TimeToo()
        self.b_mysqlStr = self.initMysqlStr()

    def intLogName(self):
        """Return ``<title>_<current time>.log`` as the log file name."""
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.log' % (self.b_title, timeStr)

    def initMysqlStr(self):
        """Return the SQL statements used by this class.

        The book-id query depends on ``b_environmentalType``:
        2 selects every book above ``b_maxBookNex``, 1 adds a fixed
        ``limit`` clause, and any other value selects two fixed book ids
        and also shrinks the three group sizes.
        """
        if self.b_environmentalType == 2:
            # "online"
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            # "test"
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            # "dev"
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId
            self.b_bookPageSize = 2
            self.b_bookIdSize = 2
            self.b_bookTXTGroupSize = 1

        return {
            'saveText':
            "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData':
            "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def second(self):
        """Sleep for ``b_second`` seconds."""
        time.sleep(self.b_second)

    def getBookData(self):
        """Return the first column of every row of the book-id query."""
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        return [item[0] for item in bookData]

    def setCatalogList(self, bookGroupingData, interval=10):
        """Push the grouped book ids to the Redis list ``bookIdsList``.

        Args:
            bookGroupingData: Result of ``dataToo.groupingData``; only its
                ``'listTaskList'`` entry is read.
            interval: Seconds to sleep before each group is handled.
                Defaults to 10, the value that used to be hard-coded.
        """
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList 没有数据\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            time.sleep(interval)
            if len(listTaskList[i]) <= 0:
                continue
            # str() lets ids that are not strings (e.g. integer columns)
            # be joined as well.
            data = [','.join(str(bookId) for bookId in item)
                    for item in listTaskList[i]]
            self.rds.setListData('bookIdsList', data)

    def bookTxtLoad(self):
        """Load the book ids, push them to Redis and log a summary."""
        start = time.time()
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad 没有数据\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)
        self.setCatalogList(bookGroupingData)

        end = time.time()
        elapsed = end - start
        self.logger.info('========' * 15)
        self.logger.info("startTime: %s" %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info("webHost:%s" %
                         (self.con.getConfig('webConfig', 'host')))
        self.logger.info("author:%s" %
                         (self.con.getConfig('webConfig', 'author')))
        self.logger.info("email:%s" %
                         (self.con.getConfig('webConfig', 'email')))
        self.logger.info(
            '本次将采集 [ %s ] 本小说,共分为 %s 个组,每组 %s 本小说。' %
            (bookGroupingData['listSize'], bookGroupingData['listGroupSize'],
             bookGroupingData['listTaskSize']))
        self.logger.info(
            'saveBooksToRedis [ %s ] 组 小说,消耗时间:%s 秒 [ %s ]' %
            (bookGroupingData['listGroupSize'], elapsed,
             self.timeToo.changeTime(elapsed)))
        self.logger.info('========' * 15)
コード例 #14
0
class getFreeBookTXT(object):
    """Crawl the free-books page and store every chapter of each book."""

    def __init__(self):
        """Build the data, MySQL, logging, Redis and scraping helpers."""
        self.b_title = 'getFreeBookTXT'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.dataToo = DataTool(logName=self.b_title,
                                second=self.b_second,
                                timeStr=self.b_timeStr)
        self.mySql = MySqlTool(logName=self.dataToo.initLogName())
        self.logger = Logger(logname=self.dataToo.initLogName(),
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = RedisTool()
        self.getBookInfoToo = GetBookInfoTool(second=self.b_second,
                                              dataToo=self.dataToo,
                                              logger=self.logger)
        self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
            second=self.b_second,
            logger=self.logger,
            getBookInfoToo=self.getBookInfoToo,
            mySql=self.mySql,
            dataToo=self.dataToo)
        self.con = ConfigParser()

    def getFreeBookLink(self):
        """Return the book list scraped from the configured free-books page."""
        bookInfoList = self.getBookInfoToo.toFreeBookListPageGetBookList(
            freeBookListPage=self.con.getConfig('webConfig',
                                                'freeBookListPage'))
        self.logger.debug(bookInfoList)
        return bookInfoList

    def formatCatalogInfo(self, data):
        """Build every chapter URL of a catalog and store each chapter.

        Args:
            data: Catalog dict with ``'bookId'`` and ``'vs'``; every entry
                of ``'vs'`` has a ``'cs'`` list whose items carry ``'id'``.

        Returns:
            Always an empty list: the URLs are handed to
            ``saveBookInfoToMySqlToo.saveText`` instead of being collected.
        """
        links = []
        host = self.con.getConfig('webConfig', 'host')
        for volume in data['vs']:
            for chapter in volume['cs']:
                # %s formatting also accepts ids that are not strings.
                url = '%s/chapter/%s/%s' % (host, data['bookId'],
                                            chapter['id'])
                self.saveBookInfoToMySqlToo.saveText(link=str(url))
        return links

    def contentsLoad(self):
        """Store chapters and catalog for every book on the free-books page.

        Books whose catalog could not be fetched are skipped.
        """
        links = self.getFreeBookLink()
        # The scraper may return None or an empty list when nothing was found.
        if not links:
            self.logger.debug('getFreeBookLink 没有数据\n')
            return
        for item in links:
            time.sleep(self.b_second)
            jsonData = self.getBookInfoToo.getCatalogInfo(
                bookId=item['book_Id'])
            self.logger.debug(jsonData)
            if not jsonData:
                # getCatalogInfo returns [] on failure; indexing it with
                # 'data' would raise, so move on to the next book.
                continue
            catalogData = self.formatCatalogInfo(data=jsonData['data'])
            self.logger.debug(catalogData)
            self.saveBookInfoToMySqlToo.saveCatalog(
                bookId=jsonData['data']['bookId'])