def run(self):
    """Main spider loop: periodically re-validate stored proxies and, when
    the live pool drops below MINNUM, crawl fresh ones, dedupe, validate
    and persist them.

    Runs forever; sleeps UPDATE_TIME seconds between passes.
    Side effects: reads/writes the proxy table via SqliteHelper, logs progress.
    """
    while True:
        logger.info("Start to run spider")
        sqlHelper = SqliteHelper()
        logger.info('Start to run validator')
        validator = Validator(sqlHelper)
        count = validator.run_db()
        # lazy %-args: formatting is deferred until the record is emitted
        logger.info('Finished to run validator, count=%s', count)
        if count[0] < MINNUM:
            # crawl returns one list of proxy dicts per parser config:
            # [[{},{},{}], [{},{},{}]] -> flatten into a single list.
            batches = self.crawl_pool.map(self.crawl, parserList)
            proxys = [proxy for batch in batches for proxy in batch]
            logger.info('first_proxys: %s', len(proxys))
            # Dedupe: sort each dict's items so equal dicts collapse to the
            # same key regardless of key insertion order (the original
            # tuple(proxy.items()) form was insertion-order sensitive).
            # Keys are unique, so sorted() never compares values.
            proxys = [
                dict(t)
                for t in {tuple(sorted(proxy.items())) for proxy in proxys}
            ]
            logger.info('end_proxy: %s', len(proxys))
            logger.info('spider proxys: %s', type(proxys))
            # Keep only proxies that pass the liveness check.
            proxys = validator.run_list(proxys)
            sqlHelper.batch_insert(sqlHelper.tableName, proxys)
            logger.info('success ip: %s', sqlHelper.selectCount())
        # NOTE(review): reconstructed from a collapsed one-line source —
        # close/sleep assumed to sit at loop level, outside the if; confirm.
        sqlHelper.close()
        logger.info('Finished to run spider')
        time.sleep(UPDATE_TIME)
def run(self): while True: print 'spider beginning -------' sqlHelper = SqliteHelper() print 'validator beginning -------' validator = Validator(sqlHelper) count = validator.run_db() print 'validator end ----count=%s'%count if count[0]< MINNUM: proxys = self.crawl_pool.map(self.crawl,parserList) #这个时候proxys的格式是[[{},{},{}],[{},{},{}]] # print proxys #这个时候应该去重: proxys_tmp = [] for proxy in proxys: proxys_tmp.extend(proxy) proxys = proxys_tmp print 'first_proxys--%s',len(proxys) #这个时候proxys的格式是[{},{},{},{},{},{}] proxys_tmp=None #这个时候开始去重: proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])] print 'end_proxys--%s',len(proxys) print 'spider proxys -------%s'%type(proxys) proxys = validator.run_list(proxys)#这个是检测后的ip地址 sqlHelper.batch_insert(sqlHelper.tableName,proxys) print 'success ip =%s'%sqlHelper.selectCount() sqlHelper.close() print 'spider end -------' time.sleep(UPDATE_TIME)