Ejemplo n.º 1
0
class Publish():
    """Console publisher that sends crawl commands to the ``bookChannel`` channel."""

    def __init__(self):
        """Create the RedisToo/DataToo helpers and a logger named after this class."""
        self.b_title = 'Publish'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')

        self.rds = RedisToo()
        self.dataToo = DataToo(
            logName=self.b_title,
            second=self.b_second,
            timeStr=self.b_timeStr,
        )
        logFile = self.dataToo.initLogName()
        self.logger = Logger(
            logname=logFile, loglevel=1, logger=self.b_title).getlog()

    def saveBookToRedisAction(self):
        """Ask for the environment, confirm, then publish a ``SaveBookToRedis`` message."""
        environmentalType = input("请输入0、1、2(0:dev,1:test,2:online): >>")
        maxBookNex = 0
        summary = '\n\n参数确认: 环境 : %s | 最大抓取数 : %s \n\n' % (
            environmentalType, maxBookNex)
        self.logger.debug(summary)
        time.sleep(1)

        answer = input("是否开始?(y/n): >>")
        if answer != 'y':
            print('取消抓取')
            return

        message = {
            'type': 'SaveBookToRedis',
            'environmentalType': environmentalType,
            'maxBookNex': maxBookNex,
        }
        # json.dumps already yields a str, so it is published as-is.
        self.rds.p.publish('bookChannel', json.dumps(message))

    def getBookTXTAction(self):
        """Ask for the batch count, confirm, then publish a ``GetBookTXT`` message."""
        getBookIdsListSize = input("获取多少组数据(最大10): >>")
        maxCatalogNex = 1
        summary = '\n\n参数确认: maxCatalogNex : %s | getBookIdsListSize : %s \n\n' % (
            maxCatalogNex, getBookIdsListSize)
        print(summary)
        time.sleep(1)

        answer = input("是否开始?(y/n): >>")
        if answer != 'y':
            print('取消抓取')
            return

        message = {
            'type': 'GetBookTXT',
            'maxCatalogNex': maxCatalogNex,
            'getBookIdsListSize': getBookIdsListSize,
        }
        self.rds.p.publish('bookChannel', json.dumps(message))
Ejemplo n.º 2
0
class Subscribe():
    """Runs the SaveBookToRedis and GetBookTXT jobs described by a params dict."""

    def __init__(self):
        """Build the Redis, data, logging and MySQL helpers shared by the actions."""
        self.b_title = 'Subscribe'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.rds = RedisTool()
        self.dataToo = DataTool(
            logName=self.b_title,
            second=self.b_second,
            timeStr=self.b_timeStr,
        )
        self.logger = Logger(
            logname=self.dataToo.initLogName(),
            loglevel=1,
            logger=self.b_title,
        ).getlog()
        self.mySql = MySqlTool(logName=self.dataToo.initLogName())

    def saveBookToRedisAction(self, params):
        """Create a SaveBookToRedisTool and store the book list under ``params['rdsKeyName']``."""
        self.logger.debug(params)
        toolArgs = dict(
            environmentalType=params['environmentalType'],
            rds=self.rds,
            dataToo=self.dataToo,
            mySql=self.mySql,
            logger=self.logger,
        )
        saver = SaveBookToRedisTool(**toolArgs)
        saver.saveAllBookListToRedis(rdsKeyName=params['rdsKeyName'])
        self.logger.debug('saveBookToRedisAction处理结束')

    def getBookTXTAction(self, params):
        """Create a GetBookTXT job from ``params`` and run its ``contentsLoad``."""
        self.logger.debug(params)
        listSize = params['getBookIdsListSize']
        keyName = params['rdsKeyName']
        loader = GetBookTXT(getBookIdsListSize=listSize, rdsKeyName=keyName)
        loader.contentsLoad()
        self.logger.debug('getBookTXTAction处理结束')
Ejemplo n.º 3
0
class GetBookTXT(object):
    """Pops links from a Redis list in batches and saves the text of each link."""

    def __init__(self, getBookIdsListSize, rdsKeyName):
        """Store the batch settings and build the helper objects.

        Args:
            getBookIdsListSize: number of links popped per batch (converted
                with ``int``).
            rdsKeyName: name of the Redis list the links are popped from.
        """
        self.b_getBookIdsListSize = int(getBookIdsListSize)
        self.b_rdsKeyName = rdsKeyName
        self.b_title = 'getBookTXT'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        self.dataToo = DataTool(logName=self.b_title,
                                second=self.b_second,
                                timeStr=self.b_timeStr)
        self.mySql = MySqlTool(logName=self.dataToo.initLogName())
        self.logger = Logger(logname=self.dataToo.initLogName(),
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = RedisTool()
        self.getBookInfoToo = GetBookInfoTool(second=self.b_second,
                                              dataToo=self.dataToo,
                                              logger=self.logger)
        self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
            second=self.b_second,
            logger=self.logger,
            getBookInfoToo=self.getBookInfoToo,
            mySql=self.mySql,
            dataToo=self.dataToo)

    def target(self):
        """Pop up to ``b_getBookIdsListSize`` links from the Redis list.

        Returns:
            A list of the popped values decoded as UTF-8; ``None`` results
            from ``lpop`` are skipped.
        """
        links = []
        for _ in range(self.b_getBookIdsListSize):
            link = self.rds.r.lpop(self.b_rdsKeyName)
            if link is not None:
                links.append(link.decode(encoding='utf-8'))
        return links

    def contentsLoad(self):
        """Process batches of links until ``target`` returns an empty batch.

        The previous implementation recursed through ``isOk`` once per batch,
        which raised ``RecursionError`` on long queues; a loop performs the
        same work with constant stack depth.
        """
        while True:
            links = self.target()
            if not links:
                self.logger.debug('bookTxtLoad 没有数据\n')
                return
            for item in links:
                self.logger.debug(item)
                self.saveBookInfoToMySqlToo.saveText(link=item)

    def isOk(self):
        """Run ``contentsLoad``; kept so existing callers of ``isOk`` still work."""
        self.contentsLoad()
Ejemplo n.º 4
0
class Subscribe():
    """Runs the SaveBookToRedis and GetBookTXT jobs described by a params dict."""

    def __init__(self):
        """Build the RedisToo/DataToo helpers and a logger named after this class."""
        self.b_title = 'Subscribe'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')

        self.rds = RedisToo()
        self.dataToo = DataToo(
            logName=self.b_title,
            second=self.b_second,
            timeStr=self.b_timeStr,
        )
        logFile = self.dataToo.initLogName()
        self.logger = Logger(
            logname=logFile, loglevel=1, logger=self.b_title).getlog()

    def saveBookToRedisAction(self, params):
        """Create a SaveBookToRedis job from ``params`` and run ``bookTxtLoad``."""
        self.logger.debug(params)
        envType = params['environmentalType']
        bookNex = params['maxBookNex']
        job = SaveBookToRedis(environmentalType=envType, maxBookNex=bookNex)
        job.bookTxtLoad()
        self.logger.debug('saveBookToRedisAction处理结束')

    def getBookTXTAction(self, params):
        """Create a GetBookTXT job from ``params`` and run ``contentsLoad``."""
        self.logger.debug(params)
        catalogNex = params['maxCatalogNex']
        listSize = params['getBookIdsListSize']
        job = GetBookTXT(maxCatalogNex=catalogNex, getBookIdsListSize=listSize)
        job.contentsLoad()
        self.logger.debug('getBookTXT处理结束')
Ejemplo n.º 5
0
class MySqlToo():
    """Small MySQL helper that opens one connection per call."""

    def __init__(self, logName):
        """Load the configuration reader and logger and log the connection target.

        Args:
            logName: log file name passed to ``Logger``.
        """
        self.con = ConfigParser()
        self.logger = Logger(logname=logName, loglevel=1,
                             logger="MySQLToo").getlog()
        # The password is masked on purpose: credentials must not be written
        # to log files.
        mysqlConfig = (self.con.getConfig('mysql', 'host'),
                       self.con.getConfig('mysql', 'user'),
                       '******',
                       self.con.getConfig('mysql', 'database'))
        self.logger.info(
            "\n\t mySqlConfig:\n\t\t host: %s\n\t\t user : %s\n\t\t password : %s\n\t\t database : %s "
            % mysqlConfig)

    def openMySqlConfig(self):
        """Open and return a new connection using the ``mysql`` config section."""
        # Keyword arguments: newer PyMySQL releases no longer accept these
        # values positionally.
        return pymysql.connect(
            host=self.con.getConfig('mysql', 'host'),
            user=self.con.getConfig('mysql', 'user'),
            password=self.con.getConfig('mysql', 'password'),
            database=self.con.getConfig('mysql', 'database'))

    def _rollbackQuietly(self, db):
        """Roll back ``db``; a failing rollback is logged instead of raised."""
        try:
            db.rollback()
        except Exception as err:
            self.logger.debug('存储失败:[ rollback ] %s ' % (err,))

    def batchAdd(self, sql, data_info):
        """Execute ``sql`` once per row of ``data_info`` and commit.

        Args:
            sql: statement passed to ``cursor.executemany``.
            data_info: sequence of parameter rows.

        Returns:
            True when the batch was committed, False when it failed and was
            rolled back. The connection is always closed.
        """
        db = self.openMySqlConfig()
        try:
            cursor = db.cursor()
            try:
                cursor.executemany(sql, data_info)
                db.commit()
            finally:
                cursor.close()
        except Exception as err:
            self._rollbackQuietly(db)
            self.logger.debug('存储失败:[ sql ] %s ' % (str(sql)))
            self.logger.debug('存储失败:[ data_info ] %s ' % (str(data_info)))
            self.logger.debug('存储失败:[ error ] %s ' % (err,))
            return False
        finally:
            db.close()
        self.logger.info('存储成功')
        return True

    def getListData(self, sql):
        """Run a query and return every row.

        Args:
            sql: the SELECT statement to execute.

        Returns:
            The value of ``cursor.fetchall()``, or an empty list when the
            query fails. The connection is always closed.
        """
        db = self.openMySqlConfig()
        results = []
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql)
                results = cursor.fetchall()
            finally:
                cursor.close()
        except Exception as err:
            self.logger.debug("查询失败: sql==> %s" % (sql,))
            self.logger.debug("查询失败: error==> %s" % (err,))
            return []
        finally:
            db.close()
        self.logger.debug("查询成功[ %s ]: sql==> %s" % (len(results), sql))
        return results
Ejemplo n.º 6
0
class BookTXTLoad(object):
    """Loads book ids from MySQL, collects their chapter links and stores the chapter text."""

    def __init__(self, second, environmentalType, maxBookNex):
        """Initialise counters, helpers and the SQL statements.

        Args:
            second: pause in seconds between requests (converted with ``int``).
            environmentalType: 2 selects the online query, 1 the test query,
                anything else the dev query (see ``initMysqlStr``).
            maxBookNex: ``nex`` threshold used by the book-id queries.
        """
        # Group sizes for books, book ids and chapter links respectively.
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_second = int(second)
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'getBookTXT'

        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0

        self.con = ConfigParser()
        self.logName = self.intLogName()
        self.mySql = MySqlToo(logName=self.logName)
        self.dataToo = DataToo(logName=self.logName, second=self.b_second)
        self.logger = Logger(logname=self.logName,
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.rds = self.initRds()
        self.timeToo = TimeToo()
        self.b_heads = self.initHeads()
        self.b_mysqlStr = self.initMysqlStr()

    def initMysqlStr(self):
        """Return the SQL statements used by this loader, keyed by purpose."""
        if self.b_environmentalType == 2:
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId

        return {
            'saveText':
            "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData':
            "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def initHeads(self):
        """Return the HTTP request headers used for chapter pages."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'newstatisticUUID=1547076169_1527614489; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
            'Host': 'www.xs8.cn',
            'Upgrade-Insecure-Requests': '1',
            'Referer': '',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        }

    def intLogName(self):
        """Return ``<title>_<timestamp>.txt`` using the current time."""
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.txt' % (self.b_title, timeStr)

    def initRds(self):
        """Create a StrictRedis client from the ``redisConfig`` config section."""
        pool = redis.ConnectionPool(
            host=self.con.getConfig('redisConfig', 'host'),
            port=self.con.getConfig('redisConfig', 'port'),
            db=self.con.getConfig('redisConfig', 'db'))
        return redis.StrictRedis(connection_pool=pool)

    def second(self):
        """Sleep for the configured number of seconds."""
        time.sleep(self.b_second)

    def getBookData(self):
        """Run the book-id query and return the first column of every row."""
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        return [item[0] for item in bookData]

    def getCatalogData(self, bookId, index):
        """Query the chapter urls for a group of book ids and append them to ``b_catalogList``.

        Args:
            bookId: iterable of book ids for the ``IN`` clause.
            index: position of the group; unused here but passed by
                ``DataToo.threads``.
        """
        # Build the IN list by hand: formatting a tuple renders a single id as
        # "('x',)", whose trailing comma is not valid SQL.
        idList = '(%s)' % ', '.join(repr(item) for item in bookId)
        sql = '%s %s' % (self.b_mysqlStr['getCatalogData'], idList)
        self.logger.info('查询小说章节 [ %s ]...\n' % (sql))
        catalogData = self.mySql.getListData(sql=sql)
        self.b_catalogList.append([item[0] for item in catalogData])

    def setCatalogList(self, bookGroupingData):
        """Regroup the book-id groups and fetch the chapter urls of each group in threads."""
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList 没有数据\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            self.dataToo.threads(listTaskList[i], self.getCatalogData)

    def getArticle(self, link, group, bookCatalogUrlGroupingData, ngroup,
                   nindex):
        """Fetch one chapter page, extract its content and store it.

        Links that cannot be fetched, are intercepted, or fail to save are
        recorded in ``errorUrl``; pages matching the 404 marker are recorded
        in ``request404``.

        Args:
            link: chapter url to fetch.
            group: index of the book group (for logging).
            bookCatalogUrlGroupingData: grouping result of the chapter urls
                (for logging).
            ngroup: index of the chapter group (for logging).
            nindex: index of the link inside its group (for logging).
        """
        bkd = bookCatalogUrlGroupingData
        self.b_heads['Referer'] = link
        self.logger.info(
            '已采集 [ %s ] 书籍组 [ %s / %s ] 目录组 [ %s / %s ] 文章组 [ %s / %s ] 链接 [ %s ] %s 秒后开始抓取'
            % (self.countNum, group + 1, len(
                self.b_catalogList), ngroup + 1, bkd['listGroupSize'],
               nindex + 1, bkd['listTaskSize'], link, self.b_second))
        self.second()
        text = self.dataToo.getText(link=link, heads=self.b_heads)
        # getText returns None for an empty link; treat that like an empty body.
        if not text or len(text['data']) <= 0:
            self.errorUrl.append(link)
            self.countNum += 1
            self.logger.debug('第 %s 条链接:数据抓取异常 :%s\n' % (self.countNum, text))
            return
        html = etree.HTML(text['data'])
        content_list = html.xpath('//div[@class="read-content j_readContent"]')
        if len(content_list) <= 0:
            self.countNum += 1
            title = html.xpath('//title/text()')
            requestIntercept = html.xpath(
                '//div[@class="empty-text"]//strong/text()')
            request404 = html.xpath('//h3[@class="lang"]/text()')
            self.logger.debug('第 %s 条链接:HTML解析异常!' % (self.countNum))
            self.logger.debug('第 %s 条链接[title]:%s' % (self.countNum, title))
            if len(requestIntercept) > 0:
                self.errorUrl.append(link)
                second = self.b_second * 180
                self.logger.debug(
                    '第 %s 条链接[requestIntercept]:%s 被拦截了暂停 %s 秒后 抓取下一条链接 ' %
                    (self.countNum, requestIntercept, second))
                time.sleep(second)
            if len(request404) > 0:
                self.request404.append(link)
                self.logger.debug('第 %s 条链接[request404]:%s' %
                                  (self.countNum, request404))
            self.logger.debug('第 %s 条链接[text]:%s\n' % (self.countNum, text))
            return
        content = etree.tostring(content_list[0], method='xml').decode('utf-8')
        res = self.mySql.batchAdd(sql=self.b_mysqlStr['saveText'],
                                  data_info=[(link, content)])
        # Only a failed save belongs in errorUrl (the check used to be inverted).
        if not res:
            self.errorUrl.append(link)
        self.countNum += 1
        self.logger.debug('第 %s 条链接: %s\n' % (self.countNum, res))

    def getBookTXT(self, catalogList, index):
        """Fetch every chapter url of one book group, split into fixed groups.

        Args:
            catalogList: chapter urls of the book group.
            index: position of the book group inside ``b_catalogList``.
        """
        if len(catalogList) <= 0:
            self.logger.debug('书籍组 [ %s / %s ] :getBookTXT 没有数据\n' %
                              (index + 1, len(self.b_catalogList)))
            return
        bookCatalogUrlGroupingData = self.dataToo.groupingData(
            list=catalogList, pageSize=self.b_bookTXTGroupSize, fixed=True)
        listTaskList = bookCatalogUrlGroupingData['listTaskList']
        for i in range(bookCatalogUrlGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            start = time.time()
            for j, link in enumerate(listTaskList[i]):
                self.second()
                self.getArticle(link, index, bookCatalogUrlGroupingData, i, j)
            end = time.time()
            elapsed = int(float(end) - float(start))
            self.logger.debug(
                '书籍组 [ %s / %s ] 目录组 [ %s / %s ] : 开始时间:%s : 结束时间:%s ==> 共消耗时间 :%s 秒 [ %s ]\n'
                % (index + 1, len(self.b_catalogList), i + 1,
                   bookCatalogUrlGroupingData['listGroupSize'], float(start),
                   float(end), elapsed, self.timeToo.changeTime(elapsed)))

    def saveText(self):
        """Process every non-empty book group in ``b_catalogList`` and log its duration."""
        for i, catalogList in enumerate(self.b_catalogList):
            if len(catalogList) <= 0:
                self.logger.debug('书籍组 [ %s / %s ] saveText 没有数据\n' %
                                  (i + 1, len(self.b_catalogList)))
                continue
            start = time.time()
            self.getBookTXT(catalogList, i)
            end = time.time()
            elapsed = int(float(end) - float(start))
            self.logger.debug(
                '书籍组 [ %s / %s ] : 开始时间:%s : 结束时间:%s ==> 共消耗时间 :%s 秒 [ %s ]\n'
                % (i + 1, len(self.b_catalogList), float(start), float(end),
                   elapsed, self.timeToo.changeTime(elapsed)))
            self.logger.info('*-*-*-*-*-*-' * 15)

    def bookTxtLoad(self):
        """Entry point: load book ids, collect chapter urls, fetch the text and log a summary."""
        start = time.time()
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad 没有数据\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)

        # Time budget figures shown in the log below.
        perBook = self.b_second + 10
        perGroup = self.b_second + perBook * bookGroupingData['listTaskSize']
        total = perGroup * bookGroupingData['listGroupSize']

        self.logger.info('========' * 15)
        self.logger.info("\t时间: %s" %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info("\t网站:%s" % (self.con.getConfig('webConfig', 'host')))
        self.logger.info("\t\t\t本次将采集 %s 本小说。\n" %
                         (bookGroupingData['listSize']))
        self.logger.info(
            '\t\t\t%s 本小说,共分为 %s 个组,每组 %s 本小说。 \n' %
            (bookGroupingData['listSize'], bookGroupingData['listGroupSize'],
             bookGroupingData['listTaskSize']))
        self.logger.info(
            '\t\t\t采集时间预算:共 %s 组,每组采集间隔 %s 秒,每组 %s 本小说,每本小说 预计 %s 秒,每组预计 %s 秒,总计 %s 秒 [ %s ]\n'
            % (bookGroupingData['listGroupSize'], self.b_second,
               bookGroupingData['listTaskSize'], perBook, perGroup, total,
               self.timeToo.changeTime(total)))
        self.logger.info('========' * 15)
        self.setCatalogList(bookGroupingData)
        self.saveText()

        end = time.time()
        elapsed = float(end) - float(start)

        self.logger.info('---' * 30)
        self.logger.info('\t\t时间              :%s' %
                         (moment.now().format('YYYY-MM-DD HH:mm:ss')))
        self.logger.info('\t\t消耗 时间         :%s 秒 [ %s ]' %
                         (elapsed, self.timeToo.changeTime(elapsed)))
        self.logger.info('\t\t采集 链接         : %s 条' % (self.countNum))
        self.logger.info('\t\t采集 失败链接     : %s 条' % (len(self.errorUrl)))
        self.logger.info('\t\t请求 失败链接     : %s 条' % (len(self.request404)))
        # These two lines had no %s placeholder, so the url lists were never printed.
        self.logger.info('\t\t采集 失败链接     :\n\t\t\t%s' % (self.errorUrl,))
        self.logger.info('\t\t请求 失败链接     :\n\t\t\t%s' % (self.request404,))
Ejemplo n.º 7
0
class DataTool():
    """Helpers for grouping work, running it in threads and fetching pages."""

    def __init__(self, logName, second, timeStr):
        """Store the settings and create the logger.

        Args:
            logName: prefix of the log file name.
            second: base number of seconds used for the back-off pause.
            timeStr: timestamp suffix of the log file name.
        """
        self.b_second = second
        self.b_timeStr = timeStr
        self.b_logName = logName
        self.logger = Logger(logname=self.initLogName(),
                             loglevel=1,
                             logger="DataTool").getlog()

    def groupingData(self, list, pageSize, fixed=False):
        """Split ``list`` into consecutive groups.

        Args:
            list: the sequence to split (the name is kept because callers
                pass it by keyword).
            pageSize: items per group, or the number of groups when ``fixed``
                is true.
            fixed: when true, always produce ``pageSize`` groups; trailing
                groups may be empty.

        Returns:
            A dict with ``listSize``, ``listGroupSize``, ``listTaskSize`` and
            ``listTaskList``. When no group can be formed (for example an
            empty input), the group count and size are 0 and the task list is
            empty instead of raising ``ZeroDivisionError``.
        """
        items = list
        listSize = len(items)
        if fixed:
            listGroupSize = pageSize
        else:
            listGroupSize = math.ceil(float(listSize) / pageSize)

        listTaskList = []
        if listGroupSize <= 0:
            listGroupSize = 0
            listTaskSize = 0
        else:
            listTaskSize = math.ceil(float(listSize) / listGroupSize)
            for i in range(listGroupSize):
                # Slicing never raises, so no fallback branch is needed.
                group = items[i * listTaskSize:(i + 1) * listTaskSize]
                self.logger.info("第 %s 组 :[ %s ] \n\t" % (i + 1, len(group)))
                listTaskList.append(group)
        res = {
            'listSize': listSize,
            'listGroupSize': listGroupSize,
            'listTaskSize': listTaskSize,
            'listTaskList': listTaskList
        }
        self.logger.info('groupingData : %s' % res)
        return res

    def threads(self, taskList, target):
        """Run ``target(task, index)`` in its own thread for every non-empty task.

        All threads are started first and then joined. ``index`` is the
        position of the task in ``taskList``.
        """
        # Iterate the thread list itself: indexing it with the task index
        # breaks as soon as an empty task has been skipped.
        workers = [
            threading.Thread(target=target, args=(task, i))
            for i, task in enumerate(taskList) if len(task) > 0
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    def _pauseAfterFailure(self, link):
        """Log a failed request for ``link`` and sleep a random back-off time."""
        second = random.randint(0, self.b_second * 60)
        self.logger.debug('[ %s ][ 403 ] 可能被拦截了暂停 %s 秒后 抓取下一条链接 !\n' %
                          (link, second))
        time.sleep(second)

    def getHTMLTxt(self, link, heads):
        """GET ``link`` and return its body as text.

        Returns:
            A dict with ``status`` ('200', or '403' when the request raised),
            ``data`` (the body, '' on failure) and ``link``.
        """
        result = {'status': '200', 'data': '', 'link': link}
        try:
            r = requests.get(link, headers=heads, timeout=100)
            r.encoding = "utf-8"
            result['data'] = r.text
        except Exception:
            self._pauseAfterFailure(link)
            result['status'] = '403'
        return result

    def getJsonTxt(self, link, heads):
        """GET ``link`` and return its body parsed as JSON.

        Returns:
            A dict with ``status`` ('200', or '403' when the request or the
            parsing raised), ``data`` (the parsed JSON, '' on failure) and
            ``link``.
        """
        result = {'status': '200', 'data': '', 'link': link}
        try:
            # Same timeout as getHTMLTxt so a stalled server cannot hang the crawl.
            r = requests.get(link, headers=heads, timeout=100)
            r.encoding = "utf-8"
            result['data'] = json.loads(r.text)
        except Exception:
            self._pauseAfterFailure(link)
            result['status'] = '403'
        return result

    def listToStr(self, data_info=None):
        """Return ``data_info`` as a tuple (an empty tuple when omitted)."""
        return () if data_info is None else tuple(data_info)

    def getText(self, link):
        """Fetch ``link`` with the HTML headers; returns None for an empty link."""
        if len(link) <= 0:
            return
        heads = self.initHeads('html')
        return self.getHTMLTxt(link=link, heads=heads)

    def getJson(self, link):
        """Fetch ``link`` with the JSON headers; returns None for an empty link."""
        if len(link) <= 0:
            return
        heads = self.initHeads('json')
        return self.getJsonTxt(link=link, heads=heads)

    def initHeads(self, type):
        """Return the request headers for ``type`` ('html' or 'json').

        Any other value yields None.
        """
        if type == 'html':
            return {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
                'Cookie': 'newstatisticUUID=1547894850_1903849637; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
                'Host': 'www.xs8.cn',
                'Upgrade-Insecure-Requests': '1',
                'Referer': '',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
            }
        if type == 'json':
            # The dict used to be built but never returned.
            return {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
                'Cookie': 'newstatisticUUID=1547123562_436906659; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
                'Host': 'www.xs8.cn',
                'Referer': '',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
            }
        return None

    def initLogName(self):
        """Return ``<logName>_<timeStr>.log``."""
        return '%s_%s.log' % (self.b_logName, self.b_timeStr)

    def getLinkArr(self, data):
        """Return the items of ``data`` as a new list."""
        return [item for item in data]
Ejemplo n.º 8
0
class DataToo():
    """Helpers for grouping work, running it in threads and fetching pages."""

    def __init__(self, logName, second):
        """Store the back-off base and create the logger.

        Args:
            logName: log file name passed to ``Logger``.
            second: base number of seconds used for the back-off pause.
        """
        self.b_second = second
        self.logger = Logger(logname=logName, loglevel=1, logger="DataToo").getlog()

    def groupingData(self, list, pageSize, fixed=False):
        """Split ``list`` into consecutive groups.

        Args:
            list: the sequence to split (the name is kept because callers
                pass it by keyword).
            pageSize: items per group, or the number of groups when ``fixed``
                is true.
            fixed: when true, always produce ``pageSize`` groups; trailing
                groups may be empty.

        Returns:
            A dict with ``listSize``, ``listGroupSize``, ``listTaskSize`` and
            ``listTaskList``. When no group can be formed (for example an
            empty input), the group count and size are 0 and the task list is
            empty instead of raising ``ZeroDivisionError``.
        """
        items = list
        listSize = len(items)
        if fixed:
            listGroupSize = pageSize
        else:
            listGroupSize = math.ceil(float(listSize) / pageSize)

        listTaskList = []
        if listGroupSize <= 0:
            listGroupSize = 0
            listTaskSize = 0
        else:
            listTaskSize = math.ceil(float(listSize) / listGroupSize)
            for i in range(listGroupSize):
                # Slicing never raises, so no fallback branch is needed.
                group = items[i * listTaskSize:(i + 1) * listTaskSize]
                self.logger.info("第 %s 组 :[ %s ] \n\t" % (i + 1, len(group)))
                listTaskList.append(group)
        res = {
            'listSize': listSize,
            'listGroupSize': listGroupSize,
            'listTaskSize': listTaskSize,
            'listTaskList': listTaskList
        }
        self.logger.info('groupingData : %s' % res)
        return res

    def threads(self, taskList, target):
        """Run ``target(task, index)`` in its own thread for every non-empty task.

        All threads are started first and then joined. ``index`` is the
        position of the task in ``taskList``.
        """
        # Iterate the thread list itself: indexing it with the task index
        # breaks as soon as an empty task has been skipped.
        workers = [
            threading.Thread(target=target, args=(task, i))
            for i, task in enumerate(taskList) if len(task) > 0
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    def getHTMLTxt(self, link, heads):
        """GET ``link`` and return its body as text.

        Returns:
            A dict with ``status`` ('200', or '403' when the request raised),
            ``data`` (the body, '' on failure) and ``link``.
        """
        result = {'status': '200', 'data': '', 'link': link}
        try:
            r = requests.get(link, headers=heads, timeout=10)
            r.encoding = "utf-8"
            result['data'] = r.text
        except Exception:
            second = random.randint(0, self.b_second * 60)
            self.logger.debug('[ %s ][ 403 ] 可能被拦截了暂停 %s 秒后 抓取下一条链接 !\n' % (link, second))
            time.sleep(second)
            result['status'] = '403'
        return result

    def listToStr(self, data_info):
        """Return ``data_info`` as a tuple."""
        return tuple(data_info)

    def getText(self, link, heads):
        """Fetch ``link`` with ``heads``; returns None for an empty link."""
        if len(link) <= 0:
            return
        return self.getHTMLTxt(link=link, heads=heads)
Ejemplo n.º 9
0
    # SQL statements handed to SaveBookToRedis: the chapter-text upsert, the
    # book-id query (``getBookIdsSql`` is defined outside this excerpt) and the
    # prefix of the chapter-url query.
    mysqlStr = {
        'saveText':
        "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
        'getBookIdsSql': getBookIdsSql,
        'getCatalogData': "SELECT url FROM links WHERE fs = 0 AND book_Id in "
    }
    # saveBookToRedis.control()
    end = time.time()

    # Redis client used to publish notifications.
    # NOTE(review): host, port and db are hard-coded here — confirm whether
    # they should come from configuration instead.
    pool = redis.ConnectionPool(host='192.168.2.202', port=6379, db=8)
    r2 = redis.StrictRedis(connection_pool=pool)
    # Interactive command loop: "stop" exits, "catalog" and "txt" run a
    # SaveBookToRedis pass.
    while True:
        msg = input("publish: >>")
        if msg == "stop":
            logger.debug("停止发布")
            break
        if msg == "catalog":
            logger.debug("发布抓取通知")
            saveBookToRedis = SaveBookToRedis(mysqlStr=mysqlStr,
                                              bookPageSize=bookPageSize,
                                              bookIdSize=bookIdSize)
            saveBookToRedis.control()
            # Wait a minute before notifying the 'getBookCatalog' channel.
            time.sleep(60)
            r2.publish('getBookCatalog', msg)
        if msg == "txt":
            # NOTE(review): unlike "catalog", this branch publishes nothing
            # after control() — confirm that is intended.
            logger.debug("发布抓取通知")
            saveBookToRedis = SaveBookToRedis(mysqlStr=mysqlStr,
                                              bookPageSize=bookPageSize,
                                              bookIdSize=bookIdSize)
            saveBookToRedis.control()
Ejemplo n.º 10
0
class SaveBookToRedis():
    """Loads book ids from MySQL and pushes them to Redis in comma-joined groups."""

    def __init__(self, environmentalType, maxBookNex):
        """Initialise settings, helpers and the SQL statements.

        Args:
            environmentalType: 2 selects the online query, 1 the test query,
                anything else the dev query (see ``initMysqlStr``).
            maxBookNex: ``nex`` threshold used by the book-id queries.
        """
        self.b_bookPageSize = 10
        self.b_bookIdSize = 5
        self.b_bookTXTGroupSize = 100
        self.b_environmentalType = int(environmentalType)
        self.b_maxBookNex = int(maxBookNex)
        self.b_title = 'SaveBookToRedis'
        self.b_second = 1
        self.b_timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')

        self.b_catalogList = []
        self.b_bookTXTData = []
        self.errorUrl = []
        self.request404 = []
        self.countNum = 0

        self.con = ConfigParser()
        # Compute the log file name once: calling intLogName() again could
        # yield a different timestamp and split the output over two files.
        self.logName = self.intLogName()
        self.rds = RedisToo()
        self.mySql = MySqlToo(logName=self.logName)
        # Pass the log file name, as BookTXTLoad does, rather than the bare title.
        self.dataToo = DataToo(logName=self.logName, second=self.b_second)
        self.logger = Logger(logname=self.logName,
                             loglevel=1,
                             logger=self.b_title).getlog()
        self.timeToo = TimeToo()
        self.b_mysqlStr = self.initMysqlStr()

    def intLogName(self):
        """Return ``<title>_<timestamp>.log`` using the current time."""
        timeStr = moment.now().format('YYYY-MM-DD-HH-mm-ss')
        return '%s_%s.log' % (self.b_title, timeStr)

    def initMysqlStr(self):
        """Return the SQL statements keyed by purpose.

        In dev mode (any type other than 1 or 2) the group sizes are also
        reduced to 2, 2 and 1.
        """
        if self.b_environmentalType == 2:
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s" % self.b_maxBookNex
        elif self.b_environmentalType == 1:
            testBookId = '10000,20000'
            getBookIdsSql = "SELECT book_Id FROM books WHERE nex > %s limit %s" % (
                self.b_maxBookNex, testBookId)
        else:
            testBookId = "'10000611804961003','10000828104982003'"
            getBookIdsSql = "SELECT book_Id FROM books WHERE book_Id in (%s)" % testBookId
            self.b_bookPageSize = 2
            self.b_bookIdSize = 2
            self.b_bookTXTGroupSize = 1

        return {
            'saveText':
            "INSERT INTO `links` (`url`,article) VALUES (%s, %s) ON DUPLICATE KEY UPDATE article = VALUES (article), nex = nex+1",
            'getBookIdsSql': getBookIdsSql,
            'getCatalogData':
            "SELECT url FROM links WHERE fs = 0 AND book_Id in "
        }

    def second(self):
        """Sleep for the configured number of seconds."""
        time.sleep(self.b_second)

    def getBookData(self):
        """Run the book-id query and return the first column of every row."""
        bookData = self.mySql.getListData(sql=self.b_mysqlStr['getBookIdsSql'])
        return [item[0] for item in bookData]

    def setCatalogList(self, bookGroupingData, interval=10):
        """Push the grouped book ids to the ``bookIdsList`` Redis list.

        Args:
            bookGroupingData: result of ``groupingData``; its ``listTaskList``
                holds the groups of book ids.
            interval: seconds to wait before each push (default 10, the
                previously hard-coded value). Empty groups are skipped
                without waiting.
        """
        bookData = bookGroupingData['listTaskList']
        if len(bookData) <= 0:
            self.logger.debug('setCatalogList 没有数据\n')
            return
        bookIdGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookIdSize)
        listTaskList = bookIdGroupingData['listTaskList']
        for i in range(bookIdGroupingData['listGroupSize']):
            if len(listTaskList[i]) <= 0:
                continue
            time.sleep(interval)
            # str() keeps the join working when the ids are not strings.
            data = [
                ','.join(str(bookId) for bookId in item)
                for item in listTaskList[i]
            ]
            self.rds.setListData('bookIdsList', data)

    def bookTxtLoad(self):
        """Entry point: load the book ids, push them to Redis and log a summary."""
        start = time.time()
        # Captured up front so the "startTime" log line shows the real start.
        startedAt = moment.now().format('YYYY-MM-DD HH:mm:ss')
        bookData = self.getBookData()
        if len(bookData) <= 0:
            self.logger.debug('bookTxtLoad 没有数据\n')
            return
        bookGroupingData = self.dataToo.groupingData(
            list=bookData, pageSize=self.b_bookPageSize)
        self.setCatalogList(bookGroupingData)

        end = time.time()
        elapsed = float(end) - float(start)
        self.logger.info('========' * 15)
        self.logger.info("startTime: %s" % (startedAt))
        self.logger.info("webHost:%s" %
                         (self.con.getConfig('webConfig', 'host')))
        self.logger.info("author:%s" %
                         (self.con.getConfig('webConfig', 'author')))
        self.logger.info("email:%s" %
                         (self.con.getConfig('webConfig', 'email')))
        self.logger.info(
            '本次将采集 [ %s ] 本小说,共分为 %s 个组,每组 %s 本小说。' %
            (bookGroupingData['listSize'], bookGroupingData['listGroupSize'],
             bookGroupingData['listTaskSize']))
        self.logger.info(
            'saveBooksToRedis [ %s ] 组 小说,消耗时间:%s 秒 [ %s ]' %
            (bookGroupingData['listGroupSize'], elapsed,
             self.timeToo.changeTime(elapsed)))
        self.logger.info('========' * 15)
Ejemplo n.º 11
0
class getFreeBookTXT(object):
    def __init__(self):
        """Wire up the helpers this crawler works with.

        Sets the instance label (``b_title``), the delay value in seconds
        (``b_second``) and a creation timestamp (``b_timeStr``), then builds
        the data tool, MySQL helper, logger, Redis helper, book-info fetcher,
        MySQL persistence helper and configuration reader.
        """
        self.b_title = title = 'getFreeBookTXT'
        self.b_second = delay = 1
        self.b_timeStr = stamp = moment.now().format('YYYY-MM-DD-HH-mm-ss')

        self.dataToo = tool = DataTool(logName=title,
                                       second=delay,
                                       timeStr=stamp)
        # initLogName() is invoked separately for the MySQL helper and for
        # the logger.
        self.mySql = database = MySqlTool(logName=tool.initLogName())
        self.logger = log = Logger(logname=tool.initLogName(),
                                   loglevel=1,
                                   logger=title).getlog()
        self.rds = RedisTool()

        self.getBookInfoToo = fetcher = GetBookInfoTool(second=delay,
                                                        dataToo=tool,
                                                        logger=log)
        self.saveBookInfoToMySqlToo = SaveBookInfoToMySqlToo(
            second=delay,
            logger=log,
            getBookInfoToo=fetcher,
            mySql=database,
            dataToo=tool)
        self.con = ConfigParser()

    def getFreeBookLink(self):
        bookInfoList = self.getBookInfoToo.toFreeBookListPageGetBookList(
            freeBookListPage=self.con.getConfig('webConfig',
                                                'freeBookListPage'))
        self.logger.debug(bookInfoList)
        return bookInfoList

    # def target(self):
    #     # links = []
    # for i in range(self.b_getBookIdsListSize):
    #     link = self.rds.r.lpop(self.b_rdsKeyName)
    #     if link != None:
    #         link = link.decode(encoding='utf-8')
    #         links.append(link)
    # return links
    def formatCatalogInfo(self, data):
        catalogData = data['vs']
        links = []
        for i in catalogData:
            for j in i['cs']:
                url = self.con.getConfig(
                    'webConfig',
                    'host') + '/chapter/' + data['bookId'] + '/' + j['id']
                # links.append({j['id'], j['cN'], bookId, bookName, j['cnt'], url, j['uuid'], j['fS']})
                # links.append(str(url))
                self.saveBookInfoToMySqlToo.saveText(link=str(url))
        return links

    def contentsLoad(self):
        """Process every book returned by ``getFreeBookLink``.

        When the list is empty, a debug message is logged and the method
        returns. Otherwise, for each listed book, the catalog is requested
        with ``getBookInfoToo.getCatalogInfo`` using the item's ``'book_Id'``,
        its ``'data'`` part is passed to ``formatCatalogInfo``, and
        ``saveBookInfoToMySqlToo.saveCatalog`` is called with the book id
        taken from the response.
        """
        links = self.getFreeBookLink()
        if len(links) <= 0:
            self.logger.debug('getFreeBookLink 没有数据\n')
            return
        for item in links:
            # Wait self.b_second seconds before each catalog request
            # (presumably to throttle requests to the site -- confirm).
            time.sleep(self.b_second)
            jsonData = self.getBookInfoToo.getCatalogInfo(
                bookId=item['book_Id'])
            self.logger.debug(jsonData)
            # The value returned by formatCatalogInfo is only logged here.
            catalogData = self.formatCatalogInfo(data=jsonData['data'])
            self.logger.debug(catalogData)
            # The id is read from the response payload, not from `item`.
            self.saveBookInfoToMySqlToo.saveCatalog(
                bookId=jsonData['data']['bookId'])