resp = requests.get(url)
if resp.status_code == 200:
    logging.info('Start Crawl kuai: %s', url)
    bs = BeautifulSoup(resp.text, 'html.parser')
    for t_body in bs.find_all('tbody'):
        for tr in t_body.find_all('tr'):
            td_list = tr.find_all('td')
            ip, port, proxy_type = td_list[0].text, td_list[1].text, td_list[3].text
            judgeProxy(ip, port, proxy_type.lower())


def cron_crawl_proxy():
    """Periodically crawl proxy data from Kuaidaili."""
    logging.info("Cron crawl kuai proxy")
    # High-anonymity and regular proxy lists (covering both HTTPS and HTTP proxies)
    crawl_list = ['inha', 'intr']
    multi_pool = multiprocessing.Pool(len(crawl_list))
    multi_pool.map(crawlProxy, crawl_list)
    multi_pool.close()


if __name__ == '__main__':
    # Scheduled crawling
    days, hours = '0-6', '0-23'
    cron.cron_blocking(job=cron_crawl_proxy, day_of_week=days, hour=hours, minute='45')
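# judgeProxy and checkProxy are project helpers that are not shown above. The
# sketch below only illustrates the assumed contract: checkProxy receives a
# {scheme: 'ip:port'} dict and returns the address when a request through that
# proxy succeeds, and judgeProxy stores working proxies in the Redis set named
# after their scheme (which is what the validation script's redis.keys('http*')
# lookup implies). The test URL, key layout and Redis connection parameters are
# assumptions, not the original implementation.
import logging
import redis as redis_lib
import requests

redis = redis_lib.StrictRedis()  # connection parameters are assumed


def checkProxy(proxy, timeout=5):
    """Return the proxy address if a request through it succeeds, else None."""
    scheme, address = next(iter(proxy.items()))
    try:
        resp = requests.get('%s://httpbin.org/ip' % scheme,
                            proxies={scheme: address}, timeout=timeout)
        if resp.status_code == 200:
            return address
    except requests.RequestException:
        logging.debug('Proxy %s failed the check', address)
    return None


def judgeProxy(ip, port, proxy_type):
    """Keep a crawled ip:port in the Redis set for its scheme if it works."""
    address = '%s:%s' % (ip, port)
    if checkProxy({proxy_type: address}) == address:
        redis.sadd(proxy_type, address)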
def handleProxy(key):
    """Check whether the proxies stored under one Redis key are still valid."""
    # Pool.map hands each worker a single Redis key (e.g. 'http' or 'https'),
    # so the function takes one key rather than iterating over a list of keys.
    for member in redis.smembers(key):
        proxy = {key: member}
        if member != checkProxy(proxy, timeout=5):
            logging.warning("Handle Proxy %s is invalid", member)
            redis.srem(key, member)


def cron_handle_proxy():
    """Periodically validate the proxy data stored in Redis."""
    logging.info("Cron handle proxy")
    keys = redis.keys('http*')
    multi_pool = multiprocessing.Pool(len(keys))
    multi_pool.map(handleProxy, keys)
    multi_pool.close()


if __name__ == '__main__':
    # Scheduled validation
    days, hours = '0-6', '0-23'
    cron.cron_blocking(job=cron_handle_proxy, day_of_week=days, hour=hours, minute='30')
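# cron.cron_blocking comes from the project's utils package and is not shown
# here. A plausible minimal implementation, assuming it wraps APScheduler's
# BlockingScheduler with a cron trigger (APScheduler is an assumption; only the
# keyword arguments match the calls above):
from apscheduler.schedulers.blocking import BlockingScheduler


def cron_blocking(job, **cron_fields):
    """Block the process and run `job` on a cron schedule,
    e.g. day_of_week='0-6', hour='0-23', minute='30'."""
    scheduler = BlockingScheduler()
    scheduler.add_job(job, trigger='cron', **cron_fields)
    scheduler.start()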
import sys
from utils import cron
from scrapy.cmdline import execute

# Crawl data
# execute(['scrapy', 'crawl', 'job', '-s', 'CLOSESPIDER_PAGECOUNT=10'])
# execute(['scrapy', 'crawl', 'job', '-s', 'CLOSESPIDER_ITEMCOUNT=5'])
# Check for expired items
# execute(['scrapy', 'crawl', 'expire'])


def run():
    execute(['scrapy', 'crawl', 'job'])


if __name__ == '__main__':
    if len(sys.argv) == 1:
        cron.cron_blocking(job=run, day_of_week='4', hour='21')
    else:
        run()
import sys
from utils import cron
from scrapy.cmdline import execute


def run():
    execute('scrapy crawl job'.split(' '))


if __name__ == '__main__':
    if len(sys.argv) == 1:
        cron.cron_blocking(job=run, day_of_week='5')
    else:
        run()
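# Usage note: with no extra command-line arguments both run scripts fall into
# the blocking cron schedule; passing any argument makes run() execute the
# spider immediately instead.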