def aggmon_data_store(argv):
    """Entry point of the data store daemon.

    Parses ``argv``, configures logging, creates and starts a ``DataStore``,
    binds a zmq PULL socket, starts an ``RPCThread`` for control commands,
    optionally registers its state with a dispatcher, subscribes to the
    configured message buses and then loops forever, decoding each received
    JSON message and putting it on ``store.queue``.

    :param argv: list of command line arguments (without the program name).

    The function does not return under normal operation: a "quit" command
    terminates the process through ``os._exit(0)``. Startup failures end the
    process with ``sys.exit(1)``. Any exception inside the receive loop is
    printed and ends the loop.
    """
    global component
    ap = argparse.ArgumentParser()
    ap.add_argument('-g', '--group', default="universe", action="store",
                    help="group/cluster served by this daemon instance")
    ap.add_argument('-C', '--cmd-port', default="tcp://0.0.0.0:5511", action="store",
                    help="RPC command port")
    ap.add_argument('-D', '--dispatcher', default="", action="store",
                    help="agg_control dispatcher RPC command port")
    # type=int: without it a user supplied value stays a string and the
    # TTL computation below would repeat the string instead of multiplying.
    ap.add_argument('-e', '--expire', type=int, default=180, action="store",
                    help="days for expiring value metrics")
    ap.add_argument('-b', '--backend', default="mongodb", action="store",
                    help="database backend(s), comma separated. Default is 'mongodb'.")
    ap.add_argument('-H', '--host', default="localhost", action="store",
                    help="data store host")
    ap.add_argument('-n', '--port', default=None, action="store",
                    help="data store port")
    ap.add_argument('-d', '--dbname', default="metricdb", action="store",
                    help="database name")
    ap.add_argument('-P', '--prefix', default="gmetric", action="store",
                    help="collections prefix")
    ap.add_argument('-u', '--user', default="", action="store",
                    help="user name")
    ap.add_argument('-p', '--passwd', default="", action="store",
                    help="password")
    ap.add_argument('-l', '--log', default="info", action="store",
                    help="logging: info, debug, ...")
    ap.add_argument('-L', '--listen', default="tcp://0.0.0.0:5550", action="store",
                    help="zmq pull port to listen on")
    ap.add_argument('-M', '--msgbus', default=[], action="append",
                    help="subscription port(s) for message bus. can be used multiple times.")
    ap.add_argument('-s', '--stats', default=False, action="store_true",
                    help="print statistics info")
    ap.add_argument('-v', '--verbose', type=int, default=0, action="store",
                    help="verbosity")
    pargs = ap.parse_args(argv)

    # Look the level up by name instead of eval()-ing command line input.
    # An unknown level name still raises AttributeError, as before.
    log_level = getattr(logging, pargs.log.upper())
    FMT = "%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s"
    logging.basicConfig(stream=sys.stderr, level=log_level, format=FMT)

    # Make sure the global exists even when no dispatcher is configured;
    # the receive loop tests it against None.
    component = None

    pargs.backend = pargs.backend.split(",")
    if pargs.port:
        # TODO: get rid of this and move it into (shared) config
        pargs.port = pargs.port.split(",")
    else:
        pargs.port = [None, None]

    # open DB
    try:
        store = DataStore(pargs.backend, pargs.host, pargs.port, pargs.dbname,
                          pargs.user, pargs.passwd, pargs.group,
                          coll_prefix=pargs.prefix,
                          value_metrics_ttl=pargs.expire * 24 * 3600)
    except Exception as e:
        log.error("Failed to create DataStore: %r" % e)
        sys.exit(1)
    store.start()

    context = zmq.Context()
    # Socket to receive messages on
    receiver = context.socket(zmq.PULL)
    receiver.setsockopt(zmq.RCVHWM, 40000)
    recv_port = zmq_socket_bind_range(receiver, pargs.listen)
    # Explicit check instead of assert, which is stripped under "python -O".
    if recv_port is None:
        log.error("Failed to bind PULL socket on %s" % pargs.listen)
        sys.exit(1)

    def subscribe_collectors(__msg):
        """RPC handler: send a "subscribe" RPC to every configured message bus."""
        for msgb in pargs.msgbus:
            log.info("subscribing to all msgs at '%s'" % msgb)
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "subscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port))

    def unsubscribe_and_quit(__msg):
        """RPC handler: unsubscribe from every message bus, then exit the process."""
        for msgb in pargs.msgbus:
            log.info("unsubscribing from '%s'" % msgb)
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "unsubscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port))
        os._exit(0)

    rpc = RPCThread(context, listen=pargs.cmd_port)
    rpc.start()
    rpc.register_rpc("quit", unsubscribe_and_quit, early_reply=True)
    rpc.register_rpc("resubscribe", subscribe_collectors)

    if len(pargs.dispatcher) > 0:
        me_addr = zmq_own_addr_for_uri(pargs.dispatcher)
        me_listen = "tcp://%s:%d" % (me_addr, recv_port)
        me_rpc = "tcp://%s:%d" % (me_addr, rpc.port)
        state = get_kwds(component="data_store", cmd_port=me_rpc,
                         listen=me_listen, group=pargs.group)
        component = ComponentState(context, pargs.dispatcher, state=state)
        rpc.register_rpc("resend_state", component.reset_timer)

    # subscribe to message bus
    subscribe_collectors(None)

    tstart = None
    log.info("Started msg receiver on %s" % pargs.listen)
    count = 0
    while True:
        try:
            s = receiver.recv()
            log.debug("received msg on PULL port: %r" % s)
            msg = json.loads(s)

            cmd = None
            if "_COMMAND_" in msg:
                log.info("_COMMAND_ received: msg = %r" % msg)
                cmd = msg["_COMMAND_"]

            if cmd is not None:
                if cmd["cmd"] == "quit":
                    log.info("Stopping data_store on 'quit' command.")
                    # raw exit!!!
                    os._exit(0)
                elif cmd["cmd"] == "resend_state":
                    log.info("State resend requested.")
                    if component is not None:
                        component.reset_timer()
                    continue

            store.queue.put(msg)

            if count == 0 or (cmd is not None and cmd["cmd"] == "reset-stats"):
                tstart = time.time()
                count = 0
            count += 1
            if component is not None:
                component.update({"stats.msgs_recvd": count})

            if (pargs.stats and count % 10000 == 0) or \
               (cmd is not None and cmd["cmd"] == "show-stats"):
                tend = time.time()
                elapsed = tend - tstart
                # Guard against a zero interval; a ZeroDivisionError here
                # would be caught below and terminate the receive loop.
                rate = float(count) / elapsed if elapsed > 0 else 0.0
                sys.stdout.write("%d msgs in %f seconds, %f msg/s\n" %
                                 (count, elapsed, rate))
                sys.stdout.flush()
        except Exception as e:
            # Single pre-formatted argument: same output under Python 2's
            # print statement and Python 3's print function.
            print("Exception in msg receiver: %r" % e)
            break
    log.info("THE END")
def aggmon_jobagg(argv):
    """Entry point of the per-job aggregator daemon.

    Parses ``argv``, configures logging, creates and starts a
    ``JobAggregator`` for the given job id, binds a zmq PULL socket, starts
    an ``RPCThread`` for control commands, subscribes to the configured
    message buses, optionally registers its state with a dispatcher and then
    loops, decoding each received JSON message and putting it on
    ``jagg.queue``.

    :param argv: list of command line arguments (without the program name).

    The receive loop ends on a "quit" command message or on any exception
    (which is printed and also sets ``jagg.stopping``). The "quit" RPC
    terminates the process through ``os._exit(0)``. Startup failures end the
    process with ``sys.exit(1)``.
    """
    global component
    ap = argparse.ArgumentParser()
    ap.add_argument('-C', '--cmd-port', default="tcp://0.0.0.0:5501", action="store",
                    help="RPC command port")
    ap.add_argument('-D', '--dispatcher', default="", action="store",
                    help="agg_control dispatcher RPC command port")
    ap.add_argument('-j', '--jobid', default="", action="store",
                    help="jobid for which this instance does aggregation")
    ap.add_argument('-l', '--log', default="info", action="store",
                    help="logging: info, debug, ...")
    ap.add_argument('-L', '--listen', default="tcp://127.0.0.1:5560", action="store",
                    help="zmq pull port to listen on")
    ap.add_argument('-M', '--msgbus', default=[], action="append",
                    help="subscription port(s) for message bus. can be used multiple times.")
    ap.add_argument('-s', '--stats', default=False, action="store_true",
                    help="print statistics info")
    ap.add_argument('-v', '--verbose', type=int, default=0, action="store",
                    help="verbosity")
    pargs = ap.parse_args(argv)

    # Look the level up by name instead of eval()-ing command line input.
    # An unknown level name still raises AttributeError, as before.
    log_level = getattr(logging, pargs.log.upper())
    FMT = "%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s"
    logging.basicConfig(stream=sys.stderr, level=log_level, format=FMT)

    component = None

    if len(pargs.jobid) == 0:
        log.error("jobid argument can not be empty!")
        sys.exit(1)

    context = zmq.Context()

    try:
        jagg = JobAggregator(pargs.jobid, context)
    except Exception as e:
        log.error("Failed to create JobAggregator: %r" % e)
        sys.exit(1)
    jagg.start()

    # Socket to receive messages on
    receiver = context.socket(zmq.PULL)
    receiver.setsockopt(zmq.RCVHWM, 40000)
    recv_port = zmq_socket_bind_range(receiver, pargs.listen)
    # Explicit check instead of assert, which is stripped under "python -O".
    if recv_port is None:
        log.error("Failed to bind PULL socket on %s" % pargs.listen)
        sys.exit(1)

    def aggregate_rpc(msg):
        """RPC handler: run one aggregation and update the RPC/sent counters."""
        num_sent = jagg.do_aggregate_and_send(msg)
        # Without a dispatcher there is no component state to keep counters
        # in; the aggregation itself must still happen.
        if component is None:
            return
        agg_rpcs = component.state.get("stats.agg_rpcs", 0) + 1
        aggs_sent = component.state.get("stats.aggs_sent", 0) + num_sent
        component.update({"stats.agg_rpcs": agg_rpcs,
                          "stats.aggs_sent": aggs_sent})

    def show_mcache(msg):
        """RPC handler: return the aggregator's metric caches."""
        return jagg.metric_caches

    def subscribe_collectors(__msg):
        """RPC handler: subscribe to this job's messages on every message bus."""
        for msgb in pargs.msgbus:
            log.info("subscribing to msgs of job %s at %s" % (pargs.jobid, msgb))
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "subscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port),
                     J=pargs.jobid)

    def unsubscribe_and_quit(__msg):
        """RPC handler: unsubscribe from every message bus, then exit the process."""
        # unsubscribe from message bus
        for msgb in pargs.msgbus:
            log.info("unsubscribing jobid %s from %s" % (pargs.jobid, msgb))
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "unsubscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port))
        os._exit(0)

    rpc = RPCThread(context, listen=pargs.cmd_port)
    rpc.start()
    rpc.register_rpc("agg", aggregate_rpc)
    rpc.register_rpc("quit", unsubscribe_and_quit, early_reply=True)
    rpc.register_rpc("resubscribe", subscribe_collectors)
    rpc.register_rpc("show_mcache", show_mcache)

    # subscribe to message bus
    subscribe_collectors(None)

    if len(pargs.dispatcher) > 0:
        me_addr = zmq_own_addr_for_uri(pargs.dispatcher)
        me_listen = "tcp://%s:%d" % (me_addr, recv_port)
        me_rpc = "tcp://%s:%d" % (me_addr, rpc.port)
        state = get_kwds(component="job_agg", cmd_port=me_rpc,
                         listen=me_listen, jobid=pargs.jobid)
        component = ComponentState(context, pargs.dispatcher, state=state)
        rpc.register_rpc("resend_state", component.reset_timer)

    tstart = None
    log.info("Started msg receiver on %s" % pargs.listen)
    count = 0
    while True:
        try:
            s = receiver.recv()
            #log.debug("received msg on PULL port: %r" % s)
            msg = json.loads(s)

            cmd = None
            if "_COMMAND_" in msg:
                cmd = msg["_COMMAND_"]

            if cmd is not None:
                if cmd["cmd"] == "quit":
                    log.info("Stopping job aggregator for jobid %s on 'quit' command." % pargs.jobid)
                    break
                elif cmd["cmd"] == "resend_state":
                    log.info("State resend requested.")
                    if component is not None:
                        component.reset_timer()
                    continue

            jagg.queue.put(msg)

            if count == 0 or (cmd is not None and cmd["cmd"] == "reset-stats"):
                tstart = time.time()
                count = 0
            count += 1
            # component stays None when no dispatcher was given; calling
            # update() unguarded would raise and end the loop after the
            # very first message.
            if component is not None:
                component.update({"stats.val_msgs_recvd": count})

            if (pargs.stats and count % 10000 == 0) or \
               (cmd is not None and cmd["cmd"] == "show-stats"):
                tend = time.time()
                elapsed = tend - tstart
                # Guard against a zero interval; a ZeroDivisionError here
                # would be caught below and terminate the receive loop.
                rate = float(count) / elapsed if elapsed > 0 else 0.0
                sys.stderr.write("%d msgs in %f seconds, %f msg/s\n" %
                                 (count, elapsed, rate))
                sys.stderr.flush()
        except Exception as e:
            # Single pre-formatted argument: same output under Python 2's
            # print statement and Python 3's print function.
            print("Exception in msg receiver: %r" % e)
            jagg.stopping = True
            break
    time.sleep(0.1)
    print("%d messages received" % count)
def aggmon_collector(argv):
    """Entry point of the collector daemon.

    Parses ``argv``, configures logging, then creates a ``SubscriberQueue``
    (with two message pre-processing hooks), a ``MsgTagger``, an
    ``AggPubThread`` and an ``RPCThread``, registers the subscription and
    tagging RPCs, optionally registers its state with a dispatcher and
    optionally subscribes to another message bus. The main thread then
    keeps calling ``subq.join(0.1)`` until that raises.

    :param argv: list of command line arguments (without the program name).

    A failure while creating the core objects ends the process with
    ``os._exit(1)``; the "quit" RPC ends it with ``os._exit(0)``.
    """
    global component
    ap = argparse.ArgumentParser()
    ap.add_argument('-C', '--cmd-port', default="tcp://127.0.0.1:5556", action="store",
                    help="RPC command port")
    ap.add_argument('-D', '--dispatcher', default="", action="store",
                    help="agg_control dispatcher RPC command port")
    ap.add_argument('-g', '--group', default="universe", action="store",
                    help="group for this message bus. Default: /universe")
    ap.add_argument('-l', '--log', default="info", action="store",
                    help="logging: info, debug, ...")
    ap.add_argument('-L', '--listen', default="tcp://127.0.0.1:5555", action="store",
                    help="zmq pull port to listen on")
    ap.add_argument('-M', '--msgbus', default="", action="store",
                    help="subscription port for other message bus")
    ap.add_argument('-s', '--stats', default=False, action="store_true",
                    help="print statistics info")
    ap.add_argument('-S', '--state-file', default="agg_collector.state", action="store",
                    help="file to store tagger rules and subscriptions")
    ap.add_argument('-v', '--verbose', type=int, default=0, action="store",
                    help="verbosity")
    pargs = ap.parse_args(argv)

    # Look the level up by name instead of eval()-ing command line input.
    # An unknown level name still raises AttributeError, as before.
    log_level = getattr(logging, pargs.log.upper())
    FMT = "%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s"
    logging.basicConfig(stream=sys.stderr, level=log_level, format=FMT)

    state = []
    subs = {}
    tags = {}
    # EF 6.7.16 disabled state loading
    #if len(pargs.state_file) > 0:
    #    state = load_state(pargs.state_file)
    # With state loading disabled, state is always empty and this branch
    # is never taken; kept so loading can be switched back on.
    if len(state) >= 2:
        subs = state[0]
        tags = state[1]

    def spoofed_host(msg):
        """Pre-hook: replace a "x:y" host field by its second ":"-separated part."""
        # treat spoofed hosts already here
        if "H" in msg and ":" in msg["H"]:
            msg["H"] = msg["H"].split(":")[1]
        elif "HOST" in msg and ":" in msg["HOST"]:
            msg["HOST"] = msg["HOST"].split(":")[1]
        ##
        ## For debugging duplicates...
        ##
        #if "N" in msg and msg["N"] == "cpu_user":
        #    log.info("cpu_user val: %r" % msg)
        return msg

    def convert_str_int_float(msg):
        """Pre-hook: turn a string "V" value into int or float when it parses."""
        # convert string values to int or float
        if "V" in msg:
            val = msg["V"]
            if isinstance(val, basestring):
                if val.isdigit():
                    msg["V"] = int(val)
                else:
                    try:
                        msg["V"] = float(val)
                    except ValueError:
                        # not numeric: leave the string untouched
                        pass
        return msg

    def save_subs_tags(msg):
        """RPC post-hook; currently a no-op because state saving is disabled."""
        # EF 6.7.16 disabled state saving
        #save_state(pargs.state_file, [pubsub.subs, tagger.tags])
        pass

    # Named quit_rpc locally so the builtin quit() is not shadowed; the RPC
    # itself is still registered under the name "quit".
    def quit_rpc(msg):
        """RPC handler: flag the subscriber queue as stopping and exit the process."""
        subq.stopping = True
        # raw exit for now
        os._exit(0)

    try:
        context = zmq.Context()
        subq = SubscriberQueue(context, pargs.listen,
                               pre=[spoofed_host, convert_str_int_float])
        tagger = MsgTagger(tags=tags)
        pubsub = AggPubThread(context, subq.queue, subs=subs, tagger=tagger.do_tag)
        rpc = RPCThread(context, listen=pargs.cmd_port)
        rpc.start()
    except Exception:
        log.error(traceback.format_exc())
        log.error("Failed to initialize something essential. Exiting.")
        os._exit(1)

    rpc.register_rpc("subscribe", pubsub.subscribe, post=save_subs_tags)
    rpc.register_rpc("unsubscribe", pubsub.unsubscribe, post=save_subs_tags)
    rpc.register_rpc("show_subs", pubsub.show_subscriptions)
    rpc.register_rpc("reset_subs", pubsub.reset_subscriptions, post=save_subs_tags)
    rpc.register_rpc("add_tag", tagger.add_tag, post=save_subs_tags)
    rpc.register_rpc("remove_tag", tagger.remove_tag, post=save_subs_tags)
    rpc.register_rpc("reset_tags", tagger.reset_tags, post=save_subs_tags)
    rpc.register_rpc("show_tags", tagger.show_tags)
    rpc.register_rpc("quit", quit_rpc, early_reply=True)

    pubsub.start()
    subq.start()

    if len(pargs.dispatcher) > 0:
        me_addr = zmq_own_addr_for_uri(pargs.dispatcher)
        me_listen = "tcp://%s:%d" % (me_addr, subq.port)
        me_rpc = "tcp://%s:%d" % (me_addr, rpc.port)
        state = get_kwds(component="collector", cmd_port=me_rpc,
                         listen=me_listen, group=pargs.group)
        component = ComponentState(context, pargs.dispatcher, state=state)
        rpc.register_rpc("resend_state", component.reset_timer)

    if len(pargs.msgbus) > 0:
        # Single pre-formatted argument: same output under Python 2's print
        # statement and Python 3's print function.
        print("subscribing to all msgs from %s" % pargs.msgbus)
        msg = {"TARGET": pargs.listen}
        send_rpc(context, pargs.msgbus, "subscribe", **msg)

    while True:
        try:
            subq.join(0.1)
        except Exception as e:
            print("main thread exception: %r" % e)
            break