def start():
    """Program entry point: exercise the Redis, MySQL, FTP and CSV helper classes."""
    logger = LoggerUtils().loglog('../logs/test.log')

    # Exercise the methods in redis_utils.
    redis_client = RedisUtils(REDIS_HOST, REDIS_PORT, REDIS_PASSWORD)
    redis_client.set_str('hello', '11111')
    logger.info(redis_client.get_str('hello'))
    # a = redis.get_list('runoobkey')
    # for i in a:
    #     print(i.decode())

    # Exercise the methods of the mysql_utils class.
    mysql_conn = MysqlUtils(
        MYSQL_HOST,
        MYSQL_USERNAME,
        MYSQL_PASSWORD,
        MYSQL_PORT,
        MYSQL_DATABASE,
    )
    query_rows = mysql_conn.execute_query("select * from sys_user")
    logger.info(query_rows)

    # FTP helper test: any failure is logged and the run continues.
    try:
        ftp_client = FTPUtils(FTP_HOST, FTP_PORT, FTP_USERNAME, FTP_PASSWORD)
        os.chdir("d:/")
        ftp_client.upload_file('centos7_init.sh', FTP_DIR)
    except Exception as err:
        logger.info(err)

    # CSV helper usage test: dump the query result fetched above.
    csv_exporter = CSV_Utils()
    os.chdir("d:/")
    csv_exporter.export_sql_result('result.csv', query_rows)
def __init__(self, **kwargs):
    """
    Initialise the consumer from keyword options.

    The ``context``, ``cookie_file``, ``redis_db`` and ``tld`` entries are
    removed from ``kwargs``; a missing entry raises ``KeyError``.

    :return: :class:Consumer object
    :rtype: Consumer
    """
    self.context = kwargs.pop('context')
    self.__cookie_file = kwargs.pop('cookie_file')

    # Collect the redis options in the same order the keys are consumed.
    redis_options = {'db': kwargs.pop('redis_db')}
    redis_options['tld'] = kwargs.pop('tld')
    self.redis_handle = RedisUtils(**redis_options)
def __init__(self, **kwargs):
    """
    Initialise the producer from keyword options.

    The ``context``, ``mongo_db``, ``redis_db`` and ``tld`` entries are
    removed from ``kwargs``; a missing entry raises ``KeyError``.

    :return: :class:Producer object
    :rtype: Producer
    """
    self.context = kwargs.pop('context')
    self.__mongo_db = kwargs.pop('mongo_db')
    # No mongo handle is created here; the attribute starts out empty.
    self.mongo_handle = None

    # Collect the redis options in the same order the keys are consumed.
    redis_options = {'db': kwargs.pop('redis_db')}
    redis_options['tld'] = kwargs.pop('tld')
    self.redis_handle = RedisUtils(**redis_options)
class Consumer(object):
    """
    Consumer Class
    fetch task urls from redis, spider them and push the results back to redis
    """

    def __init__(self, **kwargs):
        """
        Initialise the consumer from keyword options.

        The ``context``, ``cookie_file``, ``redis_db`` and ``tld`` entries are
        removed from ``kwargs``; a missing entry raises ``KeyError``.

        :return: :class:Consumer object
        :rtype: Consumer
        """
        self.context = kwargs.pop('context')
        self.__cookie_file = kwargs.pop('cookie_file')
        self.redis_handle = RedisUtils(db=kwargs.pop('redis_db'),
                                       tld=kwargs.pop('tld'))

    def consume(self):
        """
        Run the consumer loop.

        Returns immediately when there is no redis connection. Otherwise it
        loops forever: fetch one task url, update the shared counters in
        ``self.context`` and spider the url unless redis reports it blocked.
        Ordinary exceptions are logged and followed by a 10 second pause
        (reconnecting redis if needed); ``KeyboardInterrupt`` and
        ``SystemExit`` propagate so the worker can be stopped.

        :return: None
        """
        if not self.redis_handle.connected:
            logger.error('no redis connection found in consumer! exit.')
            return

        while True:
            # Set only after this iteration has incremented
            # live_spider_counts, so that the finally clause never decrements
            # a count it did not add (e.g. when fetch_one_task() raises).
            counted = False
            try:
                url = self.redis_handle.fetch_one_task()
                with self.context['lock']:
                    self.context['live_spider_counts'].value += 1
                    counted = True
                    self.context['task_counts'].value -= 1
                logger.info('get task url: %s' % url)
                logger.info('%d tasks left' % self.context['task_counts'].value)

                if not self.redis_handle.is_blocked(URL(url)):
                    self.start_spider(url, self.__cookie_file)
            except Exception:
                logger.exception('consumer exception!')
                if not self.redis_handle.connected:
                    logger.error('redis disconnected! reconnecting...')
                    self.redis_handle.connect()
                time.sleep(10)
            finally:
                if counted:
                    with self.context['lock']:
                        self.context['live_spider_counts'].value -= 1

    def start_spider(self, url, cookie_file=None):
        """
        Spider one url and store every result in redis.

        :param url: url passed to ``SpiderPage``
        :param cookie_file: forwarded to ``SpiderPage`` as ``cookie_file``
        :return: None
        """
        results = SpiderPage(url, cookie_file=cookie_file).spider()
        with self.context['lock']:
            self.context['result_counts'].value += len(results)
        for result in results:
            self.redis_handle.insert_result(result)
buil scanned pattern cache for redis from mongodb Copyright (c) 2016-2017 [email protected] (http://twi1ight.com/) See the file 'doc/COPYING' for copying permission """ import sys sys.path.append('../') from core.utils.mongo_utils import MongoUtils from core.utils.redis_utils import RedisUtils from core.utils.url import URL reload(sys) sys.setdefaultencoding('utf-8') m = MongoUtils() r = RedisUtils() def build_saved_cache(): for doc in m.query({}, {"_id": 0, "method": 1, "url": 1}): url = URL(doc['url']) r.set_url_saved(doc['method'], url) for doc in m.query({}, {"_id": 0, "method": 1, "url": 1}, is_target=False): url = URL(doc['url']) r.set_url_saved(doc['method'], url) if __name__ == '__main__': build_saved_cache()
help='Mongodb database name, default "tspider"') db.add_argument('--redis-db', metavar='NUMBER', dest='redis_db', type=int, default=RedisConf.db, help='Redis db index, default 0') args = parser.parse_args() if not any([args.url, args.file, args.keepon]): parser.exit(parser.format_help()) return args if __name__ == '__main__': args = cmdparse() redis_handle = RedisUtils(db=args.redis_db) if args.keepon: redis_handle.restore_startup_params(args) logger.info(args) if os.path.exists(TMPDIR_PATH): for f in os.listdir(TMPDIR_PATH): os.remove(os.path.join(TMPDIR_PATH, f)) tspider_context = { 'live_spider_counts': Value('i', 0), 'task_counts': Value('i', 0), 'result_counts': Value('i', 0), 'task_done': Event(), 'lock': Lock() } kwargs = { 'tld': args.tld,
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on 2016/8/7 16:17
add blacklist domain or subdomain in runtime

Copyright (c) 2016-2017 [email protected] (http://twi1ight.com/)
See the file 'doc/COPYING' for copying permission
"""
import sys

from core.utils.redis_utils import RedisUtils

if __name__ == '__main__':
    usage = 'usage: block_domain.py db target.com'

    if len(sys.argv) != 3:
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(usage)
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    try:
        db = int(sys.argv[1])
    except ValueError:
        # The db index must be an integer; show the usage text instead of a traceback.
        print(usage)
        sys.exit(1)

    domain = sys.argv[2]
    r = RedisUtils(db=db)
    r.add_blocklist(domain)
    print('add success!')
class Producer(object):
    """
    Producer Class
    make targets for consumer
    save results to mongodb
    """

    def __init__(self, **kwargs):
        """
        Initialise the producer from keyword options.

        The ``context``, ``mongo_db``, ``redis_db`` and ``tld`` entries are
        removed from ``kwargs``; a missing entry raises ``KeyError``.

        :return: :class:Producer object
        :rtype: Producer
        """
        self.context = kwargs.pop('context')
        self.__mongo_db = kwargs.pop('mongo_db')
        self.mongo_handle = None
        self.redis_handle = RedisUtils(db=kwargs.pop('redis_db'),
                                       tld=kwargs.pop('tld'))

    def produce(self):
        """
        Run the producer loop.

        Creates the mongodb handle, returns immediately when redis or mongodb
        is not connected, and otherwise loops forever: fetch one result from
        redis, decrement ``result_counts`` and hand the result to
        :meth:`proc_req`. Ordinary exceptions are logged and followed by a
        10 second pause (reconnecting redis/mongodb if needed);
        ``KeyboardInterrupt`` and ``SystemExit`` propagate so the worker can
        be stopped. After every iteration ``task_done`` is set once all three
        shared counters are zero.

        :return: None
        """
        # mongodb with multipleprocessing must be init after fork
        self.mongo_handle = MongoUtils(db=self.__mongo_db)
        if not self.redis_handle.connected or not self.mongo_handle.connected:
            logger.error('no redis/mongodb connection found! exit.')
            return

        while True:
            try:
                _, req = self.redis_handle.fetch_one_result()
                with self.context['lock']:
                    self.context['result_counts'].value -= 1
                logger.debug('got req, %d results left' % self.context['result_counts'].value)
                self.proc_req(req)
            except Exception:
                logger.exception('produce exception!')
                if not self.redis_handle.connected:
                    logger.error('redis disconnected! reconnecting...')
                    self.redis_handle.connect()
                if not self.mongo_handle.connected:
                    logger.error('mongodb disconnected! reconnecting...')
                    self.mongo_handle.connect()
                time.sleep(10)
            finally:
                with self.context['lock']:
                    # Nothing queued, nothing being spidered, nothing pending.
                    if (self.context['result_counts'].value == 0
                            and self.context['live_spider_counts'].value == 0
                            and self.context['task_counts'].value == 0):
                        self.context['task_done'].set()

    def proc_req(self, req):
        """
        Process one serialized request fetched from redis.

        Saves the request to mongodb when its (method, url) pair is not yet
        marked as saved in redis and, for GET requests on target hosts,
        creates follow-up tasks. Malformed input is logged and dropped.

        :param req: JSON text expected to decode to an object with ``url``
                    and ``method`` entries
        :return: None
        """
        try:
            data = json.loads(req)
        except (TypeError, ValueError):
            # ValueError covers invalid JSON, TypeError a non-string req.
            logger.exception('json loads req error: %s' % req)
            return

        if not isinstance(data, dict):
            # Valid JSON of the wrong shape would otherwise raise
            # AttributeError below and trigger the caller's 10s back-off.
            logger.error('req is not a json object: %s' % req)
            return

        urlstring = data.get('url', '')
        if not urlstring:
            logger.error('empty url found!')
            return
        url = URL(urlstring)

        method = data.get('method', '')
        if not method:
            logger.error('not method found!')
            return

        # save to mongodb
        data.update({
            'pattern': url.pattern,
            'hostname': url.hostname,
            'domain': url.domain
        })
        target = self.redis_handle.is_target(url)

        if not self.redis_handle.is_url_saved(method, url):
            logger.debug('redis saved pattern not found!')
            self.mongo_handle.save(data, is_target=target)
            self.redis_handle.set_url_saved(method, url)
        else:
            logger.debug('redis saved pattern found!')

        if not target:
            logger.debug('%s is not target' % url.hostname)
            return

        # todo post req
        if method == 'POST':
            logger.debug('POST not support now')
        elif method == 'GET':
            # new host found, add index page to task queue
            if self.redis_handle.get_hostname_reqcount(url.hostname) == 0:
                self.create_task_from_url(URL(url.index_page), add_whitelist=False)
            # check url validation inside create_url_task
            self.create_task_from_url(url, add_whitelist=False)
        else:
            # not GET nor POST
            logger.error('HTTP Verb %s found!' % method)
            logger.debug(data)

    def create_task_from_url(self, url, **kwargs):
        """
        Ask redis to create a task for ``url`` and count it when it does.

        :param url: url handed to ``redis_handle.create_task_from_url``
        :param kwargs: extra keyword options forwarded unchanged
        :return: None
        """
        with self.context['lock']:
            if self.redis_handle.create_task_from_url(url, **kwargs):
                self.context['task_counts'].value += 1

    def create_task_from_file(self, fileobj):
        """
        create task from file

        Every non-blank line is stripped and turned into a task; the file
        object is closed when the loop ends.

        :param fileobj: open file-like object yielding one url per line
        :return: None
        """
        with fileobj:
            for line in fileobj:
                line = line.strip()
                if not line:
                    continue
                self.create_task_from_url(URL(line))