Exemple #1
0
    def run(self):
        while True:
            logger.info("Start to run spider")
            sqlHelper = SqliteHelper()
            logger.info('Start to run validator')
            validator = Validator(sqlHelper)
            count = validator.run_db()
            logger.info('Finished to run validator, count=%s' % count)
            if count[0] < MINNUM:
                proxys = self.crawl_pool.map(self.crawl, parserList)
                #这个时候proxys的格式是[[{},{},{}],[{},{},{}]]
                # print proxys
                #这个时候应该去重:

                proxys_tmp = []
                for proxy in proxys:
                    proxys_tmp.extend(proxy)

                proxys = proxys_tmp
                logger.info('first_proxys: %s' % len(proxys))
                #这个时候proxys的格式是[{},{},{},{},{},{}]
                proxys_tmp = None
                #这个时候开始去重:
                proxys = [
                    dict(t)
                    for t in set([tuple(proxy.items()) for proxy in proxys])
                ]
                logger.info('end_proxy: %s' % len(proxys))
                logger.info('spider proxys: %s' % type(proxys))
                proxys = validator.run_list(proxys)  #这个是检测后的ip地址

                sqlHelper.batch_insert(sqlHelper.tableName, proxys)

                logger.info('success ip: %s' % sqlHelper.selectCount())
                sqlHelper.close()
            logger.info('Finished to run spider')
            time.sleep(UPDATE_TIME)
    def run(self):
        while True:
            print 'spider beginning -------'
            sqlHelper = SqliteHelper()
            print 'validator beginning -------'
            validator = Validator(sqlHelper)
            count = validator.run_db()
            print 'validator end ----count=%s'%count
            if count[0]< MINNUM:
                proxys = self.crawl_pool.map(self.crawl,parserList)
                #这个时候proxys的格式是[[{},{},{}],[{},{},{}]]
                # print proxys
                #这个时候应该去重:

                proxys_tmp = []
                for proxy in proxys:
                    proxys_tmp.extend(proxy)

                proxys = proxys_tmp
                print 'first_proxys--%s',len(proxys)
                #这个时候proxys的格式是[{},{},{},{},{},{}]
                proxys_tmp=None
                #这个时候开始去重:
                proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])]
                print 'end_proxys--%s',len(proxys)
                print 'spider proxys -------%s'%type(proxys)
                proxys = validator.run_list(proxys)#这个是检测后的ip地址


                sqlHelper.batch_insert(sqlHelper.tableName,proxys)


                print 'success ip =%s'%sqlHelper.selectCount()
                sqlHelper.close()
            print 'spider end -------'
            time.sleep(UPDATE_TIME)