def save_ip(self, response):
    website_name = response.get('meta').get('website_name')
    response_time = response.get('content')
    target_url = response.get('meta').get('target_url')
    _ip = response.get('url')
    msg = '[{ip}] can visit the target url [{target_url}], source is [{source}]'.format(
        ip=_ip, target_url=target_url, source=website_name)
    logger.info(msg)
    # Build the document for the MongoDB collection
    insert_data = {}
    insert_data['_id'] = _ip + '_' + target_url
    insert_data['ip'] = _ip
    insert_data['source'] = website_name
    insert_data['response_time'] = response_time
    insert_data['target_url'] = target_url
    insert_data['insert_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
    # Save the record to the database
    self.pipeline.process_item(insert_data, self.collection_name)
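`self.pipeline.process_item` is defined elsewhere in the project. A minimal sketch of a compatible MongoDB pipeline, assuming pymongo and an upsert-on-`_id` policy (the class name, host, and database name here are illustrative, not the project's actual values):

from pymongo import MongoClient


class MongoPipeline(object):
    """Minimal stand-in for the project's pipeline (assumed interface)."""

    def __init__(self, host='localhost', port=27017, db='proxy_pool'):
        self.db = MongoClient(host, port)[db]

    def process_item(self, item, collection_name):
        # Upsert on _id so re-checking the same ip/target pair refreshes
        # response_time and insert_time instead of inserting a duplicate
        self.db[collection_name].replace_one(
            {'_id': item['_id']}, item, upsert=True)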
def show_size():
    # Periodically report queue sizes and thread liveness
    while True:
        if not work_queue.empty() or not save_queue.empty():
            msg = 'AT %s, work queue size is [%d]' % (
                time.strftime('%Y-%m-%d %H:%M:%S'), work_queue.qsize())
            logger.info(msg)
            msg = 'AT %s, save queue size is [%d]' % (
                time.strftime('%Y-%m-%d %H:%M:%S'), save_queue.qsize())
            logger.info(msg)
            msg = 'work threading total count is [%d], active count is [%d]' % (
                len(work_threading_list), tools.isThreadAlive(work_threading_list))
            logger.info(msg)
            msg = 'save threading total count is [%d], active count is [%d]' % (
                len(save_threading_list), tools.isThreadAlive(save_threading_list))
            logger.info(msg)
        time.sleep(2)
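`tools.isThreadAlive` is not shown in this section; from the log message it evidently returns how many threads in a list are still running. A minimal sketch (the name matches the call above, the body is an assumption):

def isThreadAlive(thread_list):
    # Count the threads in the list that are still running
    return sum(1 for t in thread_list if t.is_alive())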
def request(self, _args, dont_filter):
    # Per-call overrides fall back to the instance defaults
    url = _args.get('url')
    sleep_time = _args.get('sleep_time') if _args.get('sleep_time') else self.sleep_time
    time_out = _args.get('time_out') if _args.get('time_out') else self.time_out
    retry_times = _args.get('retry_times') if _args.get('retry_times') else self.retry_times
    use_proxy = _args.get('use_proxy') if _args.get('use_proxy') else self.use_proxy
    _ip = _args.get('ip') if _args.get('ip') else self.ip
    ua_type = _args.get('ua_type') if _args.get('ua_type') else self.ua_type
    diy_header = _args.get('diy_header') if _args.get('diy_header') else self.diy_header

    if not dont_filter:
        check_result = self.check(url)
        if not check_result:
            return 'HAS CRAWLED', url
        else:
            msg = 'new url'
            logger.info(msg)

    if not url.startswith('http'):
        raise ValueError('url has to start with http or https')

    if diy_header:
        header = diy_header
    else:
        host = urlparse.urlparse(url).netloc
        header = {
            'User-Agent': random.choice(PC_USER_AGENTS),
            'Host': host,
        }
        if ua_type == 'mobile':
            header = {
                'User-Agent': random.choice(MOBILE_USER_AGENTS),
                'Host': host
            }

    times = 0
    con = None
    while retry_times > 0:
        times += 1
        self.log.info('request %s, times: %d' % (url, times))
        try:
            if use_proxy:
                ip = _ip
                if ip:
                    proxy = {
                        'http': 'http://%s' % ip,
                        'https': 'http://%s' % ip
                    }
                    con = requests.get(url, headers=header, proxies=proxy,
                                       timeout=time_out)
                    if con.status_code not in self.status_code:
                        self.log.error('status code is %s' % con.status_code)
                        raise ValueError(
                            'status code not in the codes in config.py, check your log')
                    time.sleep(sleep_time)
                else:
                    msg = 'ip can not be None while use_proxy is True'
                    self.log.error(msg)
                    os._exit(0)
            else:
                con = requests.get(url, headers=header, timeout=time_out)
                if con.status_code not in self.status_code:
                    self.log.error('status code is %s' % con.status_code)
                    raise ValueError(
                        'status code not in the codes in config.py, check your log')
                time.sleep(sleep_time)
        except Exception as e:
            self.log.error(e)
            retry_times -= 1
            self.log.warning('retrying request: [%s], times: %s' % (url, times))
            if times == 10:
                self.log.error(
                    'give up retrying request: [%s], times: %s is bigger than setting'
                    % (url, times))
                return None, None
        else:
            self.log.info('[%s] has requested successfully' % url)
            if con:
                if not dont_filter:
                    # Remember the fingerprint so this url is not fetched again
                    url = self.md5_url(url)
                    sbf.add(url)
                return con.content, con.url
            else:
                self.log.error('content is None, url is %s' % url)
                return None, None
    # All retries exhausted without a successful response
    return None, None
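The dedup path above leans on `self.check`, `self.md5_url`, and a module-level bloom filter `sbf`, none of which appear in this section. A minimal sketch of how they might fit together, assuming `pybloom_live` provides the scalable bloom filter:

import hashlib

from pybloom_live import ScalableBloomFilter

sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)


def md5_url(url):
    # Reduce the url to a fixed-length fingerprint before storing it
    return hashlib.md5(url.encode('utf-8')).hexdigest()


def check(url):
    # True -> not seen yet, worth crawling; False -> already crawled
    return md5_url(url) not in sbf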
def request(self, _args, dont_filter):
    # If a value was passed in _args, use it; otherwise fall back to
    # the instance default
    url = _args.get('url')
    sleep_time = _args.get('sleep_time') if _args.get('sleep_time') else self.sleep_time
    time_out = _args.get('time_out') if _args.get('time_out') else self.time_out
    retry_times = _args.get('retry_times') if _args.get('retry_times') else self.retry_times
    use_proxy = _args.get('use_proxy') if _args.get('use_proxy') else self.use_proxy
    _ip = _args.get('ip') if _args.get('ip') else self.ip
    ua_type = _args.get('ua_type') if _args.get('ua_type') else self.ua_type
    diy_header = _args.get('diy_header') if _args.get('diy_header') else self.diy_header
    method = _args.get('method') if _args.get('method') else self.method
    post_data = _args.get('submit_data') if _args.get('submit_data') else self.submit_data

    if not dont_filter:
        check_result = self.check(url)
        if not check_result:
            return 'HAS CRAWLED', url
        else:
            msg = 'new url'
            logger.info(msg)

    # Reject anything that does not start with http or https
    if not url.startswith('http'):
        raise ValueError('url has to start with http or https')

    if diy_header:
        header = diy_header
    else:
        # netloc is the host part of the url, e.g.
        # urlparse.urlparse('http://www.baidu.com').netloc == 'www.baidu.com'
        host = urlparse.urlparse(url).netloc
        # Fake a browser by picking a random User-Agent
        header = {
            'User-Agent': random.choice(PC_USER_AGENTS),
            'Host': host
        }
        if ua_type == 'mobile':
            header = {
                'User-Agent': random.choice(MOBILE_USER_AGENTS),
                'Host': host
            }

    times = 0
    con = None
    # Retry loop
    while retry_times > 0:
        times += 1
        self.log.info('request %s, times: %d' % (url, times))
        try:
            if use_proxy:
                ip = _ip
                if ip:
                    proxy = {
                        'http': 'http://%s' % ip,
                        'https': 'http://%s' % ip
                    }
                    if method == 'get':
                        con = request_session.get(url, headers=header,
                                                  proxies=proxy,
                                                  timeout=time_out,
                                                  params=post_data,
                                                  verify=False)
                    elif method == 'post':
                        if post_data and isinstance(post_data, dict):
                            con = request_session.post(url, headers=header,
                                                       proxies=proxy,
                                                       timeout=time_out,
                                                       data=post_data,
                                                       verify=False)
                        else:
                            self.log.error(
                                'while method is post, post_data must be defined and defined as a dict')
                    # If the status code is not expected, raise to trigger a retry
                    if con.status_code not in self.status_code:
                        self.log.error('status code is %s' % con.status_code)
                        raise ValueError(
                            'status code not in the codes in config.py, check your log')
                    time.sleep(sleep_time)
                else:
                    msg = 'ip can not be None while use_proxy is True'
                    self.log.error(msg)
                    os._exit(0)
            else:
                # No proxy configured
                if method == 'get':
                    con = request_session.get(url, headers=header,
                                              timeout=time_out,
                                              params=post_data,
                                              verify=False)
                elif method == 'post':
                    if post_data and isinstance(post_data, dict):
                        con = request_session.post(url, headers=header,
                                                   timeout=time_out,
                                                   data=post_data,
                                                   verify=False)
                    else:
                        self.log.error(
                            'while method is post, post_data must be defined and defined as a dict')
                        os._exit(0)
                if con.status_code not in self.status_code:
                    self.log.error('status code is %s' % con.status_code)
                    raise ValueError(
                        'status code not in the codes in config.py, check your log')
                time.sleep(sleep_time)
        except Exception as e:
            self.log.error(e)
            retry_times -= 1
            self.log.warning('retrying request: [%s], times: %s' % (url, times))
            if times == 10:
                self.log.error(
                    'give up retrying request: [%s], times: %s is bigger than setting'
                    % (url, times))
                return None, None
        else:
            self.log.info('[%s] has requested successfully' % url)
            if con:
                if not dont_filter:
                    url = self.md5_url(url)
                    sbf.add(url)
                return con.content, con.url
            else:
                self.log.error('content is None, url is %s' % url)
                return None, None
    # All retries exhausted without a successful response
    return None, None
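`request_session` is a module-level object defined elsewhere. Since every call above passes `verify=False`, a minimal sketch of its setup, assuming a shared `requests.Session` (the warning suppression is an assumption added to keep logs readable):

import requests

# verify=False disables certificate checks, which makes urllib3 emit an
# InsecureRequestWarning per request; silence it once at import time
requests.packages.urllib3.disable_warnings()

# A shared Session reuses TCP connections across retries and urls
request_session = requests.Session()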
from threads import start, work_queue, save_queue
from log_format import logger
from proxy_basic_config import url_parse_dict
from _request import valid
from get_proxies_base_spider import SpiderMain


class WorkSpider(SpiderMain):
    def __init__(self):
        super(WorkSpider, self).__init__()

    # Override run(). If the request function is customized, set
    # request=your_request_function in the craw call; the framework's
    # request is used by default.
    def run(self):
        start()
        self.craw()


if __name__ == '__main__':
    work_spider = WorkSpider()
    work_spider.run()
    # Block until both queues are drained
    work_queue.join()
    save_queue.join()
    # Done
    logger.info('All jobs finished, please check!')
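`start()` comes from the project's `threads` module and is not shown here. The `work_queue.join()` / `save_queue.join()` calls only return once every queued item has been matched by a `task_done()` call, which implies daemon consumer threads roughly like this sketch (the thread count and handler name are assumptions):

import threading


def work_loop():
    # Each worker pulls one job, processes it, and always marks it done,
    # so work_queue.join() can unblock once the queue drains
    while True:
        job = work_queue.get()
        try:
            handle_job(job)  # hypothetical per-job handler
        finally:
            work_queue.task_done()


def start(thread_num=8):  # thread count is an assumed default
    for _ in range(thread_num):
        t = threading.Thread(target=work_loop)
        t.daemon = True  # daemon threads die with the main process
        t.start()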
if __name__ == '__main__':
    # Test run
    spidermain = SpiderMain()
    spidermain.run()
    # Block until both queues are drained
    work_queue.join()
    save_queue.join()
    # Finished validating the crawled ips
    logger.info(
        'available proxy has been saved in your database, please check!')
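To verify the results, the saved documents can be inspected directly; a minimal sketch, assuming pymongo and illustrative database/collection names (swap in whatever the pipeline and `collection_name` actually use):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)          # assumed connection
collection = client['proxy_pool']['checked_ip']   # assumed db/collection names

# List the validated proxies, fastest first
for doc in collection.find().sort('response_time', 1):
    print('%s %s %s' % (doc['ip'], doc['target_url'], doc['response_time']))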