def save_ip(self, response):
    website_name = response.get('meta').get('website_name')
    response_time = response.get('content')
    target_url = response.get('meta').get('target_url')
    _ip = response.get('url')
    msg = '[{ip}] can visit the target url [{target_url}], source is [{source}]'.format(
        ip=_ip, target_url=target_url, source=website_name)
    logger.info(msg)
    # Build the document for the MongoDB collection
    insert_data = {}
    insert_data['_id'] = _ip + '_' + target_url
    insert_data['ip'] = _ip
    insert_data['source'] = website_name
    insert_data['response_time'] = response_time
    insert_data['target_url'] = target_url
    insert_data['insert_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
    # Save the record to the database
    self.pipeline.process_item(insert_data, self.collection_name)
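The pipeline used on the last line is not shown in this section. Below is a minimal sketch of what its process_item could look like with pymongo, assuming a MongoClient-backed class; the class name, connection URI, and database name are assumptions, not the project's actual code.

import pymongo

class MongoPipeline(object):
    # Hypothetical pipeline sketch; the URI and database name are assumptions.
    def __init__(self, mongo_uri='mongodb://localhost:27017', db_name='proxy_pool'):
        self.client = pymongo.MongoClient(mongo_uri)
        self.db = self.client[db_name]

    def process_item(self, item, collection_name):
        # Upsert keyed on '_id' (ip + '_' + target_url), so re-validating the
        # same proxy refreshes its record instead of raising DuplicateKeyError.
        self.db[collection_name].replace_one({'_id': item['_id']}, item, upsert=True)

Keying the upsert on the composite _id means each (proxy, target) pair is stored at most once, which matches how save_ip builds the _id field.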
if __name__ == '__main__':
    # Test code
    spidermain = SpiderMain()
    spidermain.run()
    # Block until every queued task has been processed
    work_queue.join()
    save_queue.join()
    # Finished crawling the source sites
    logger.info('Available proxies have been saved in your database, please check!')
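Note that save_ip consumes a plain dict rather than a requests/scrapy response object. Here is a hypothetical validation result showing the shape it expects, derived from the .get() calls above; all concrete values are made up for illustration.

response = {
    'url': '182.34.56.78:8888',                 # the proxy that passed validation
    'content': 0.42,                            # measured response time
    'meta': {
        'website_name': 'example-proxy-site',   # source the proxy was scraped from
        'target_url': 'https://httpbin.org/ip',
    },
}
spidermain.save_ip(response)  # logs the hit and writes one document to MongoDB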
from spider.threads import start, work_queue, save_queue
from spider.log_format import logger
from proxy_basic_config import url_parse_dict
from _request import valid
from get_proxies_base_spider import SpiderMain


class WorkSpider(SpiderMain):
    def __init__(self):
        super(WorkSpider, self).__init__()

    # Override the run method.
    # To use a custom request function, pass request=your_request_function
    # when calling craw(); otherwise the framework's default request is used.
    def run(self):
        start()  # start the worker and saver threads
        self.craw()


if __name__ == '__main__':
    work_spider = WorkSpider()
    work_spider.run()
    # Blocking
    work_queue.join()
    save_queue.join()
    # Done
    logger.info('All jobs finished, please check!')
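spider/threads.py itself is not listed here, but the two join() calls above only return once every queued task has been marked done. A minimal sketch of what start(), work_queue, and save_queue could look like, assuming daemon consumer threads and (func, args) tuples as queue items; the thread count and the _consume helper are assumptions.

import threading
from queue import Queue

work_queue = Queue()  # pages to fetch / proxies to validate
save_queue = Queue()  # validated results waiting to be persisted

def _consume(queue):
    # Hypothetical consumer loop; task_done() is what lets queue.join() unblock.
    while True:
        func, args = queue.get()
        try:
            func(*args)
        finally:
            queue.task_done()

def start(worker_num=8):
    # Daemon threads die with the main thread once both join() calls return.
    for _ in range(worker_num):
        t = threading.Thread(target=_consume, args=(work_queue,))
        t.daemon = True
        t.start()
    saver = threading.Thread(target=_consume, args=(save_queue,))
    saver.daemon = True
    saver.start()

Because the consumers are daemon threads, the process exits cleanly after the final logger.info call instead of hanging on their infinite loops.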