async def timed_crawler(self, url, freq, ws_id):
    redis_conn = redis.Redis(connection_pool=self.redis)
    while True:
        # A missing key or a negative value under ws_id is the shutdown signal.
        ws_freq_check = redis_conn.get(ws_id)
        if not ws_freq_check or int(ws_freq_check) < 0:
            logging.info('got shutdown signal {}'.format(str(ws_id)))
            break
        crawler = Crawler([url], loop=self.loop, db_rpc=DbRpcClient())
        await crawler.crawl()
        crawler.close()
        # Wait `freq` minutes before the next crawl.
        await asyncio.sleep(int(freq) * 60)
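# The loop above treats the Redis value stored under ws_id as its kill switch:
# a missing key or a negative value stops the periodic crawl. A minimal sketch of
# the controlling side, assuming the same connection pool and key convention
# (the key name and values here are illustrative, not from the original code):
import redis

pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
conn = redis.Redis(connection_pool=pool)

ws_id = 'ws:example-session'   # hypothetical websocket/session id
conn.set(ws_id, 1)             # any non-negative value keeps timed_crawler running
# ... later, to make timed_crawler exit on its next iteration:
conn.set(ws_id, -1)            # or conn.delete(ws_id)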
import unittest

from spider.crawler import Crawler  # assumed import path, matching the usage script below


class CrawlerTest(unittest.TestCase):

    def test_output_count(self):
        self.crawler = Crawler("https://stackoverflow.com/questions", 5, timeout=10)
        self.crawler.engage()
        self.assertEqual(len(self.crawler.directory.keys()), 5)

    def test_visited_count(self):
        self.crawler = Crawler("https://stackoverflow.com/questions", 2)
        self.crawler.engage()
        self.assertEqual(len(self.crawler.visits), 2)

    def test_zero_url_count(self):
        # A page limit of 0 is rejected at construction time.
        with self.assertRaises(RuntimeError):
            self.crawler = Crawler("http://www.google.com", 0)
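# A minimal way to execute the suite above; nothing beyond the standard unittest
# runner is assumed here (this hook is not part of the original snippet):
if __name__ == '__main__':
    unittest.main()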
class Getter:
    '''Getter: fetches proxy IPs.'''

    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisOperate()

    def run_getter(self):
        '''Store newly crawled proxies in the pool with the highest score.'''
        print('Getter started, crawling proxies...')
        for callback in self.crawler.__CrawlFun__:
            # print('crawl functions:', callback)
            new_proxy_list = self.crawler.get_proxy(callback)
            for proxy in new_proxy_list:
                self.redis.add(proxy)
class Getter:

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def limit(self, limit_num=500):
        """
        Check whether the number of proxies already exceeds the pool's limit.
        :param limit_num: maximum number of proxies to keep in the pool
        :return: True if the pool is full, False otherwise
        """
        return self.redis.count() >= limit_num

    def run(self):
        print("Getter is running...")
        if not self.limit():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.put(proxy)
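# The proxy-pool projects this Getter mirrors usually invoke run() from a scheduler
# so the pool is topped up periodically. A minimal sketch assuming only the Getter
# above; the function name and the 300-second interval are illustrative:
import time

def schedule_getter(interval=300):
    getter = Getter()
    while True:
        getter.run()           # run() itself skips crawling once the pool is full
        time.sleep(interval)

if __name__ == '__main__':
    schedule_getter()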
def get_pages(self, url, no_verbose=False):
    """Return list of all webpages"""
    crawler = Crawler(url, no_verbose, limit=self.pagelimit)
    return crawler.start()
async def one_time_crawler(self, url, freq, ws_id):
    # Single crawl of `url`; freq and ws_id are accepted for interface parity
    # with timed_crawler but are not used here.
    crawler = Crawler([url], loop=self.loop, db_rpc=DbRpcClient())
    await crawler.crawl()
    crawler.close()
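# The original code does not show how a request handler picks between
# one_time_crawler and the timed_crawler shown earlier; a hedged sketch, assuming
# a service class that owns `loop` and both methods, and that a non-positive freq
# means "run once":
async def handle_crawl_request(self, url, freq, ws_id):
    if not freq or int(freq) <= 0:
        await self.one_time_crawler(url, freq, ws_id)
    else:
        # Fire and forget; timed_crawler stops itself when the Redis value
        # under ws_id goes missing or negative.
        self.loop.create_task(self.timed_crawler(url, freq, ws_id))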
# Reference links:
#   pig6:   https://github.com/pig6/login_taobao
#   Germey: https://github.com/Python3WebSpider/ProxyPool
#           https://github.com/Python3WebSpider/Weixin
# Thanks!
# ********************************************************************************************************************
from spider.crawler import Crawler
from spider.login import Login

if __name__ == '__main__':
    # These three form parameters are copied from the browser and can be reused;
    # ua and password2 are the encrypted account name and password, respectively.
    # How to obtain them:
    # 1. Clear Taobao's cookies and visit https://www.taobao.com
    # 2. Search for any product in the search bar; you are redirected to the login page
    # 3. Click log in and capture the URL https://login.taobao.com/newlogin/login.do?appName=taobao&fromSite=0
    #    with a packet-capture tool or the browser's developer tools
    # 4. Copy the form fields ua, loginId and password2 from that request
    ua = ''
    loginId = ''
    password2 = ''
    login = Login(ua, loginId, password2)
    crawler = Crawler()
    # Crawl either way; the account title is printed only when the login succeeded.
    if login.logged():
        login.print_title()
        crawler.run()
    else:
        crawler.run()
""" Developer : Kavitha Madhavaraj To interrupt the program: * Use ctrl+c once """ from spider.crawler import Crawler #Case 1: Specify the number of URL's to crawl cr1 = Crawler('http://www.windowscentral.com', strict=True, timeout=5, multi=10) directory = cr1.engage() print "\n\nPages visited (Case1) : ", len(cr1.visits) #Save the crawled directory in a JSON file cr1.save_directory('Result1') """ #Case 2: Crawl untill interrupted (ctrl+c) cr2 = Crawler('http://www.bitsathy.ac.in/sitemap.xml', timeout=5, strict=True) directory = cr2.engage() print "\n\nPages visited (Case2) : ", len(cr2.visits) #Save the crawled directory in a JSON file cr2.save_directory('Result2') """