def gtaskManager(self, urls, extractSearchResults, proxy_flag=0, ua_flag=0):
    task_log = None
    gtaskpool.setlogging(logging.INFO, task_log)
    purl1 = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"]
    uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
    # (regex, flag) patterns passed to ProxyManager; here: match any http(s) URL
    limited_urls = [(r'^https?://', 0)]
    global proxymgr
    if proxy_flag == 1:
        proxymgr = ProxyManager(get_http_proxies, limited_urls,
                                {'refresh': True, 'interval': 30 * 60,
                                 'delay': 8 * 60},
                                *purl1)
    else:
        proxymgr = None
    print proxymgr  # debug: show which proxy manager is in effect
    global useragents
    if ua_flag == 1:
        useragents = get_useragents(uurl1)
    else:
        useragents = [None]
    if useragents == []:
        useragents = [None]
    gtaskpool.runtasks(self.taskGenerator(urls, extractSearchResults))
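# Usage sketch (hypothetical caller): assuming gtaskManager is a method of the
# AccessUrls class referenced above, enabling both the proxy pool and random
# user agents might look like:
#
#     fetcher = AccessUrls()
#     fetcher.gtaskManager(urls, extractSearchResults, proxy_flag=1, ua_flag=1)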
def gtaskmanager(self, engine_type):
    # task_log = 'task_log.log'
    task_log = None
    gtaskpool.setlogging(logging.INFO, task_log)
    purl1 = ["http://192.168.120.185:5500/get_google_http_proxy_list"]
    uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
    limited_urls = [(r'^https://search\.disconnect\.me', 1)]
    global proxymgr
    proxymgr = ProxyManager(get_http_proxies, limited_urls,
                            {'refresh': True, 'interval': 30 * 60,
                             'delay': 8 * 60},
                            *purl1)
    global useragents
    useragents = get_useragents(uurl1)
    if useragents == []:
        useragents = [None]
    gtaskpool.runtasks(self.task_generator(engine_type))
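# gtaskpool.runtasks consumes a generator of gtaskpool.Task objects (see the
# standalone examples below). A minimal sketch of the task_generator method
# gtaskmanager expects; self.keywords, self.build_query_url, and self.task are
# hypothetical helpers, not part of the original code:
#
#     def task_generator(self, engine_type):
#         for keyword in self.keywords:
#             url = self.build_query_url(engine_type, keyword)
#             yield gtaskpool.Task(self.task, [url])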
"task(%s) - response(len=%s): %s\n" % (url, len(r.text), rtext[: min(400, len(rtext))].replace("\r", "").replace("\n", "")) ) fresult.flush() return True def task_generator(): base_url = "http://www.baidu.com/s?wd=apple&pn=" task_num = 300 for i in xrange(task_num): yield gtaskpool.Task(task_retry, [base_url + str(i * 10)]) if __name__ == "__main__": gtaskpool.setlogging(logging.INFO) purl1 = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"] purl2 = ["http://192.168.1.14:5500/get_http_proxy_list"] uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list" uurl2 = "http://192.168.1.14:5500/get_useragent_list" # Create a ProxyManager if you need limited_urls = [("^http://www\.baidu\.com/s\?wd=apple&pn=\d+$", 1)] proxymgr = ProxyManager( get_http_proxies, limited_urls, {"refresh": True, "interval": 30 * 60, "delay": 8 * 60}, *purl2 ) # Or if you don't want refresh proxies periodcally # proxymgr = ProxyManager(get_http_proxies, *purl2, limited_urls, \ # {'refresh': False}, *purl2)
#!/usr/bin/env python
# encoding: utf-8
import gtaskpool
import requests
import logging


def task(n1, n2):
    logging.info("task(%s, %s): called", n1, n2)
    r = requests.get("http://www.baidu.com")
    print "task(%s, %s): response (len=%s): %s..." % \
        (n1, n2, len(r.text), r.text[:min(100, len(r.text))])
    logging.info("task(%s, %s): finished", n1, n2)


def task_generator():
    task_num = 10
    for i in xrange(1, task_num + 1):
        yield gtaskpool.Task(task, [i, i])


if __name__ == "__main__":
    gtaskpool.setlogging(logging.INFO)
    gtaskpool.runtasks(task_generator())
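# Variation: setlogging also accepts a log file path as its second argument,
# as the manager methods above do with task_log, e.g.:
#
#     gtaskpool.setlogging(logging.INFO, 'task_log.log')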