def to_queue(self, filename, typs):
    """
    Read file data to yield serializable data
    :param filename: absolute path to the file
    :param typs: only 1 or 2; typs = 1 means hot news, typs = 2 means full news
    """
    data = []
    fn = os.path.basename(filename)
    try:
        with open(filename) as fp:
            data.extend([_line.strip() for _line in fp])
        data.append(fn)
        dumps = simplejson.dumps(dict(zip(self.keys, data)))
    except (IOError, simplejson.JSONDecodeError) as e:
        logger.info(
            'Pickle data from file error: redis key <{}>, type <{}>, msg <{}>, '
            'filename <{}>, file <{}>'.format(
                self.queues(typs), e.__class__, e, filename, _abs(__file__)))
    else:
        if len(self.keys) == len(data):
            self.push(dumps, typs)
        else:
            logger.info(
                'Pickle data to redis error: redis key <{}>, msg <file number fail>, '
                'filename <{}>, file <{}>'.format(
                    self.queues(typs), filename, _abs(__file__)))
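# A hedged sketch of the record shape to_queue() builds: the file's lines are
# zipped with self.keys and the basename is appended under the final key,
# which to_file() later pops as 'fn'. All key names except 'fn' are
# assumptions for illustration.
def example_serialized_record():
    keys = ['title', 'url', 'content', 'fn']        # hypothetical self.keys
    lines = ['Some title', 'http://example.com', 'Body text']
    return dict(zip(keys, lines + ['aaa.txt']))     # this dict is what gets json-dumped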
def ssh_command(self, cmd):
    ssh = transfer.spawn('ssh root@%s "%s"' % (self._host, cmd))
    ret = 0
    try:
        i = ssh.expect(['password:', 'continue connecting(yes/no)?'], timeout=5)
        if i == 0:
            ssh.sendline(self._password)
        elif i == 1:
            ssh.sendline('yes\n')
            ssh.expect('password:')
            ssh.sendline(self._password)
    except transfer.EOF as e:
        ret = -1
        logger.info('Run ssh command error: cmd <{}>, type <{}>, msg <{}>, file <{}>'.format(
            cmd, e.__class__, e, _abs(__file__)))
    except transfer.TIMEOUT as e:
        ret = -2
        logger.info('Run ssh command error: cmd <{}>, type <{}>, msg <{}>, file <{}>'.format(
            cmd, e.__class__, e, _abs(__file__)))
    finally:
        ssh.close()
    return ret
def dispatch_job(self, default_type, interval, kw_values):
    """
    dispatch jobs by `default_type` and `interval`
    :param default_type: int, dispatch job type
    :param interval: int, interval time in minutes
    :param kw_values: dict, site name
    """
    interval = '*/%s' % interval
    _kwargs = {'site_name': kw_values}
    if default_type == 1:
        # full dispatch job, default interval 5 minutes
        hour = '6-9'
    elif default_type == 2:
        # full dispatch job, default interval 8 minutes
        hour = '10-19'
    elif default_type == 3:
        # full dispatch job, default interval 10 minutes
        hour = '20-23,0-5'
    else:
        logger.info('full jobs schedule time type <{}> failed: file <{}>'.format(
            default_type, _abs(__file__)))
        raise ValueError('unknown dispatch job type: %s' % default_type)
    app.add_job(self.schedule, trigger='cron', kwargs=_kwargs,
                minute=interval, hour=hour, misfire_grace_time=10)
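# A hedged sketch of what dispatch_job() registers: default_type=2 with
# interval=8 yields a cron trigger equivalent to "*/8 10-19 * * *", i.e.
# self.schedule(site_name=...) runs every 8 minutes between 10:00 and 19:59.
# The site dict below is illustrative, not a real configuration.
def example_dispatch(bs):
    bs.dispatch_job(2, 8, {'some_site': 'some_site_config'})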
def put(self, local, remote):
    """
    Push a local file to a remote path
    :param local: absolute file path on the local machine
    :param remote: absolute directory path on the remote machine
    """
    self.ssh_command('mkdir -p %s' % remote)
    child = transfer.spawn('scp %s root@%s:%s' % (local, self._host, remote))
    try:
        while True:
            index = child.expect(
                ["root@%s's password:" % self._host, transfer.TIMEOUT])
            if index == 0:
                child.sendline('%s\n' % self._password)
                break
            elif index == 1:
                # no password prompt yet, keep waiting
                pass
    except (transfer.EOF, transfer.TIMEOUT) as e:
        logger.info('Transfer file error: type <{}>, msg <{}>, file <{}>'.format(
            e.__class__, e, _abs(__file__)))
    finally:
        child.interact()
        child.close()
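# A hedged usage sketch; `goosy` is assumed to be an instance of the
# pexpect-style transfer class defined above (the scheduler code refers to it
# as self.goosy). Paths follow the docstring examples in this module.
def example_transfer(goosy):
    goosy.put('/data/csf_hot_news/20160411/aaa.txt', '/data/csf_hot_news/20160411/')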
def dispatch_full_jobs():
    # This job runs once a day: at midnight another job cancels it, then restarts it here
    bs = BaseSched()
    if bs.is_migrate is None:
        return
    interval_sites = []
    remain_sites = []
    most_sites = bs.most_sites
    intervals = Intervals().intervals
    try:
        for site, interval_type_dict in intervals.iteritems():
            for type_key, interval in interval_type_dict.iteritems():
                kw_values = most_sites[site]
                bs.dispatch_job(type_key, interval, kw_values)
                interval_sites.append('{}:<{}>'.format(site, kw_values))
        rest_sites_keys = set(most_sites.keys()) - set(intervals.keys())
        for _rest_keys in rest_sites_keys:
            # add jobs for the remaining sites, split across the three time windows
            sites_of_related = most_sites[_rest_keys]
            bs.dispatch_job(1, 5, sites_of_related)
            bs.dispatch_job(2, 8, sites_of_related)
            bs.dispatch_job(3, 10, sites_of_related)
            remain_sites.append('{}:<{}>'.format(_rest_keys, sites_of_related))
    except Exception as e:
        info = (e.__class__, e, _abs(__file__))
        logger.info('Dispatch full jobs error: type <{}>, msg <{}>, file <{}>'.format(*info))
def rem(self, default_key=None, *value):
    """
    remove members from the set of the specified key
    :param default_key: None|string, set key in redis
    :param value: members to remove
    """
    set_key = default_key or self.scrapy_filter_key
    try:
        self.redis.srem(set_key, *value)
    except (AuthenticationError, BusyLoadingError, ConnectionError, DataError,
            InvalidResponse, ReadOnlyError, RedisError, ResponseError,
            TimeoutError, WatchError) as e:
        logger.info('Remove value from Redis error: key <{}>, type <{}>, msg <{}>, file <{}>'.format(
            set_key, e.__class__, e, _abs(__file__)))
def dispatch_sgp_jobs():
    bs = BaseSched()
    if bs.is_migrate is not None:
        return
    try:
        bs.schedule(site_name=bs.sgp_sites)
    except Exception as e:
        info = (e.__class__, e, _abs(__file__))
        logger.info('Dispatch Sgp jobs error: type <{}>, msg <{}>, file <{}>'.format(*info))
def put(self, key_name, filename):
    """
    :param key_name: absolute key name on Amazon S3, e.g. data/csf_hot_news/20160411/aaa.txt
    :param filename: local absolute filename path, e.g. /data/csf_hot_news/20160411/aaa.txt
    """
    try:
        bucket = self.get_buck()
        key = bucket.new_key(key_name)
        key.set_contents_from_filename(filename)
    except Exception as e:
        logger.info('Upload file to S3 error: type <{}>, msg <{}>, file <{}>'.format(
            e.__class__, e, _abs(__file__)))
def receive_files():
    """ Receive messages from the redis queues, then convert them to files """
    self = JobBase()
    hot_path = self.hot_news_path
    full_path = self.full_news_path
    try:
        if self.is_migrate is True:
            self.uptf.convert_message(hot_path, mq_typ=1)
            self.uptf.convert_message(full_path, mq_typ=2)
    except Exception as e:
        info = (e.__class__, e, _abs(__file__))
        logger.info('Receive message from redis to yield file error: type <{}>, msg <{}>, file <{}>'.format(*info))
def get(self, default_key=None):
    """
    get all members from the set of the specified key
    :param default_key: None|string, set key in redis
    """
    set_key = default_key or self.scrapy_filter_key
    try:
        return self.redis.smembers(set_key)
    except (AuthenticationError, BusyLoadingError, ConnectionError, DataError,
            InvalidResponse, ReadOnlyError, RedisError, ResponseError,
            TimeoutError, WatchError) as e:
        logger.info('Get value from Redis error: key <{}>, type <{}>, msg <{}>, file <{}>'.format(
            set_key, e.__class__, e, _abs(__file__)))
        return set()
def set(self, default_key=None, *value):
    """
    add md5 values of news urls or titles to the set
    :param default_key: None|string, set key in redis
    :param value: string, md5 values
    """
    set_key = default_key or self.scrapy_filter_key
    try:
        self.redis.sadd(set_key, *value)
    except (AuthenticationError, BusyLoadingError, ConnectionError, DataError,
            InvalidResponse, ReadOnlyError, RedisError, ResponseError,
            TimeoutError, WatchError) as e:
        logger.info('Set value to Redis error: key <{}>, type <{}>, msg <{}>, file <{}>'.format(
            set_key, e.__class__, e, _abs(__file__)))
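# A hedged sketch of the dedup flow that set()/get()/rem() appear to support:
# hash the news url or title, keep the md5 in the redis set under
# scrapy_filter_key, and skip items already seen. `rb` is assumed to be an
# instance of the redis wrapper defining set()/get().
def example_filter_duplicate(rb, url):
    import hashlib
    digest = hashlib.md5(url).hexdigest()
    if digest in rb.get():
        return True          # already crawled, skip it
    rb.set(None, digest)     # record under the default scrapy_filter_key
    return False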
def crawl_proxy_ip():
    if JobBase().is_migrate is not True:
        return
    try:
        total_proxy = HttpProxy().run()
        redis = RedisBase().redis
        scrapy_proxy_ip_key = news_config.settings['SCRAPY_PROXY_IP_KEY']
        if total_proxy:
            redis.delete(scrapy_proxy_ip_key)
            redis.rpush(scrapy_proxy_ip_key, *total_proxy)
    except Exception as e:
        info = (e.__class__, e, _abs(__file__))
        logger.info('Crawl proxy ip error: type <{}>, msg <{}>, file <{}>'.format(*info))
def get_message(self, typ):
    """
    Pop one message from the related redis queue
    :param typ: only 1 or 2; typ = 1 means `sgp_hot_mq`, typ = 2 means `sgp_news_mq`
    """
    _queue = self.queues(typ)
    try:
        return self.redis.rpop(_queue)
    except (ConnectionError, DataError, ResponseError, TimeoutError, InvalidResponse) as e:
        info = (_queue, e.__class__, e, _abs(__file__))
        logger.info('Get message from Queue error: redis key <{}>, type <{}>, msg <{}>, file <{}>'.format(*info))
        return []
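# A hedged consumer sketch: rpop returns None once the queue is empty, so a
# caller can drain a queue like this. `mq` is assumed to be an instance of the
# class defining get_message().
def example_drain_queue(mq, typ=1):
    messages = []
    while True:
        message = mq.get_message(typ)
        if not message:
            break
        messages.append(message)
    return messages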
def get(self, key_name, filename=None):
    """
    :param key_name: key name on Amazon S3, e.g. data/csf_hot_news/20160411/aaa.txt
    :param filename: local absolute filename path to store to, e.g. /data/csf_hot_news/20160411/aaa.txt
    :return: `boto.s3.key.Key` class instance
    """
    try:
        bucket = self.get_buck()
        key = bucket.get_key(key_name)
        if key and filename is not None:
            key.get_contents_to_filename(filename)
        return key
    except Exception as e:
        logger.info('Get file from S3 error: type <{}>, msg <{}>, file <{}>'.format(
            e.__class__, e, _abs(__file__)))
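# A hedged sketch of a put()/get() round trip with the boto wrappers above;
# `s3` is assumed to be the bucket wrapper instance (referred to elsewhere as
# self.bucket). Key and file paths follow the docstring examples.
def example_s3_roundtrip(s3):
    s3.put('data/csf_hot_news/20160411/aaa.txt', '/data/csf_hot_news/20160411/aaa.txt')
    return s3.get('data/csf_hot_news/20160411/aaa.txt')   # boto Key instance, or None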
def to_file(self, message, news_path):
    """
    Make serializable data yield a file
    :param message: json-formatted serializable data from the redis queue
    :param news_path: absolute directory path where the message will be stored as a file
    """
    filename = None
    try:
        data = simplejson.loads(message)
        filename = data.pop('fn')
        lines = [data[_key] for _key in self.keys[:-1]]
        with open(news_path + filename, 'w') as fp:
            lines_seq = '\n'.join(lines).encode('u8')
            fp.writelines(lines_seq)
    except (KeyError, IOError, simplejson.JSONDecodeError) as e:
        logger.info('Yield data to file from redis error: type <{}>, msg <{}>, filename <{}>, file <{}>'.format(
            e.__class__, e, filename, _abs(__file__)))
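# A hedged round-trip sketch: to_queue() and to_file() are inverses. to_queue()
# zips a file's lines with self.keys (the last key holding the basename 'fn')
# and pushes the JSON onto redis; to_file() pops 'fn' back out and rewrites the
# remaining values one per line under news_path. `mq` is assumed to combine
# the queue methods shown in this module; paths follow the docstring examples.
def example_roundtrip(mq):
    mq.to_queue('/data/csf_hot_news/20160411/aaa.txt', 1)
    message = mq.get_message(1)
    if message:
        mq.to_file(message, '/data/csf_hot_news/20160411/')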
def clean_redis():
    """ cron job: clean redis data """
    self = JobBase()
    if self.is_migrate is not True:
        return
    try:
        filtering_key = self.config['REDIS_FILTER_KEY']
        required_scrapy, required_filtering = get_md5_from_mongo(self)
        # Clean data from `REDIS_FILTER_KEY`
        self.redis.rem(filtering_key, *required_filtering)
    except Exception as e:
        info = (e.__class__, e, _abs(__file__))
        logger.info('Clean redis data error: type <{}>, msg <{}>, file <{}>'.format(*info))
def push(self, message, typ):
    """
    Two queues are used here: `sgp_hot_mq` takes hot news and `sgp_news_mq`
    takes full news for part of the foreign sites, pushed from the Amazon SGP server
    :param message: json string of serialized data for the related queue
    :param typ: if typ = 1, the queue is `sgp_hot_mq`, else `sgp_news_mq`
    """
    _queue = self.queues(typ)
    try:
        self.redis.lpush(_queue, message)
    except (ConnectionError, DataError, ResponseError, TimeoutError, InvalidResponse) as e:
        info = (_queue, e.__class__, e, _abs(__file__))
        logger.info('Push message to Queue error: redis key <{}>, type <{}>, msg <{}>, file <{}>'.format(*info))
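# A hedged ordering sketch: push() uses LPUSH while get_message() uses RPOP,
# so together they behave as a FIFO queue; the oldest message is popped first.
# The payloads below are minimal illustrations, not real messages.
def example_queue_order(mq):
    mq.push('first', 1)
    mq.push('second', 1)
    return mq.get_message(1)   # -> 'first'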
def subscribe(self, channel_typ):
    messages = []
    channel = self.select_channel(channel_typ)
    pub = self.redis.pubsub()
    pub.subscribe(channel)
    for _message in pub.listen():
        # listen() blocks: when there is no message, the loop waits here
        try:
            data = _message['data']
            if isinstance(data, basestring):
                msg = simplejson.loads(data)
                if 'exit' in msg:
                    break
                messages.append(msg)
        except (simplejson.JSONDecodeError, KeyError) as e:
            logger.info('Subscribe message error: redis channel <{}>, type <{}>, msg <{}>, file <{}>'.format(
                channel, e.__class__, e, _abs(__file__)))
    return messages
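# A hedged sketch of the matching publisher side: subscribe() leaves its
# listen() loop when a JSON message containing 'exit' arrives, so a controller
# can stop it like this. Both arguments are assumptions: `redis_client` is a
# redis-py client and `channel` is whatever select_channel() returns.
def example_stop_subscriber(redis_client, channel):
    redis_client.publish(channel, simplejson.dumps(['exit']))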
def transport(self, dir_path, filename, which):
    """
    :param self: Base class instance
    :param dir_path: hot news or full news path
    :param filename: just the file name
    :param which: int, if which is 1, transfer hot news, else transfer full news
    """
    local_path = dir_path + filename
    try:
        if self.is_filtering(filename):
            s3_key = self.s3_key(dir_path, filename)
            self.bucket.put(s3_key, local_path)
            if self.is_migrate is True:
                self.goosy.put(local_path, dir_path)
            if self.is_migrate is None:
                # on the SGP server, transfer the news file through redis instead
                self.ptq.send_message(local_path, which)
    except Exception as e:
        logger.info('Transfer file between two PC or Upload S3 or Push message to redis Queue on SGP server error: '
                    'type <{}>, msg <{}>, file <{}>'.format(e.__class__, e, _abs(__file__)))
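# A hedged summary of transport()'s routing: every file passing is_filtering()
# is uploaded to S3; with is_migrate True it is also scp'd to the peer host
# (self.goosy), and with is_migrate None (the SGP server) its content is
# pushed onto the redis queue instead. `base` is assumed to be the Base
# instance that transport() expects.
def example_transport_routing(base):
    transport(base, base.hot_news_path, 'aaa.txt', 1)   # 1 -> hot news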
# This software is distributed under the 3-clause BSD License.
# ___________________________________________________________________________
#
# Utility classes for working with the logger
#

import logging

from pyutilib.misc import LogHandler

# __file__ fails if script is called in different ways on Windows
# __file__ fails if someone does os.chdir() before
# sys.argv[0] also fails because it does not always contain the path
from os.path import dirname as _dir, abspath as _abs
import inspect

_pyomo_base = _dir(_dir(_dir(_abs(inspect.getfile(inspect.currentframe())))))

#
# Set up the root Pyomo namespace logger
#
_logger = logging.getLogger('pyomo')
_logger.addHandler(
    LogHandler(_pyomo_base,
               verbosity=lambda: _logger.isEnabledFor(logging.DEBUG)))
_logger.setLevel(logging.WARNING)


class LoggingIntercept(object):
    """Context manager for intercepting messages sent to a log stream

    This class is designed to enable easy testing of log messages.
    """
for indent, par in paragraphs:
    if not indent:
        indent = ''
    # Bulleted lists get indented with a hanging indent
    if par and len(par[0]) > 1 and par[0][0] in '-*':
        hang = ' ' * 4
    else:
        hang = ''
    self.stream.write(
        '%s\n' % (
            textwrap.fill(
                ' '.join(par),
                width=self.wrap,
                initial_indent=self.subsequent_indent + indent,
                subsequent_indent=self.subsequent_indent + indent + hang),
        ))

#
# Set up default logging for PyUtilib
#

# __file__ fails if script is called in different ways on Windows
# __file__ fails if someone does os.chdir() before
# sys.argv[0] also fails because it does not always contain the path
from os.path import dirname as _dir, abspath as _abs
import inspect

_pyutilib_base = _dir(_dir(_dir(_abs(inspect.getfile(inspect.currentframe())))))

_logger = logging.getLogger('pyutilib')
_logger.setLevel(logging.WARNING)
_logger.addHandler(
    LogHandler(_pyutilib_base,
               verbosity=lambda: _logger.isEnabledFor(logging.DEBUG)))