def aggmon_jobagg(argv):
    """Run the job aggregator daemon for one job id.

    Parses ``argv``, configures logging, starts a ``JobAggregator`` for the
    given ``--jobid``, binds a ZeroMQ PULL socket (``--listen``), starts an
    ``RPCThread`` on ``--cmd-port``, subscribes to every ``--msgbus`` endpoint
    and then loops, decoding each received message as JSON and putting it on
    ``jagg.queue``.

    The receive loop ends on a ``quit`` command message or on any exception
    raised while handling a message (in which case ``jagg.stopping`` is set).

    Args:
        argv: list of command line arguments (without the program name).

    Exits the process (``SystemExit``) when the arguments are invalid, the
    job id is empty, the aggregator cannot be created or the receiver socket
    cannot be bound.
    """
    global component

    ap = argparse.ArgumentParser()
    ap.add_argument('-C', '--cmd-port', default="tcp://0.0.0.0:5501", action="store",
                    help="RPC command port")
    ap.add_argument('-D', '--dispatcher', default="", action="store",
                    help="agg_control dispatcher RPC command port")
    ap.add_argument('-j', '--jobid', default="", action="store",
                    help="jobid for which this instance does aggregation")
    ap.add_argument('-l', '--log', default="info", action="store",
                    help="logging: info, debug, ...")
    ap.add_argument('-L', '--listen', default="tcp://127.0.0.1:5560", action="store",
                    help="zmq pull port to listen on")
    ap.add_argument('-M', '--msgbus', default=[], action="append",
                    help="subscription port(s) for message bus. can be used multiple times.")
    ap.add_argument('-s', '--stats', default=False, action="store_true",
                    help="print statistics info")
    ap.add_argument('-v', '--verbose', type=int, default=0, action="store",
                    help="verbosity")
    pargs = ap.parse_args(argv)

    # Look the level up by name instead of eval()-ing a command line string;
    # anything that is not a numeric logging level is rejected up front.
    log_level = getattr(logging, pargs.log.upper(), None)
    if not isinstance(log_level, int):
        ap.error("invalid log level: %r" % pargs.log)
    FMT = "%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s"
    logging.basicConfig(stream=sys.stderr, level=log_level, format=FMT)

    # Stays None unless a dispatcher is configured further down.
    component = None

    if len(pargs.jobid) == 0:
        log.error("jobid argument can not be empty!")
        sys.exit(1)

    context = zmq.Context()

    try:
        jagg = JobAggregator(pargs.jobid, context)
    except Exception as e:
        log.error("Failed to create JobAggregator: %r", e)
        sys.exit(1)
    jagg.start()

    # Socket to receive messages on
    receiver = context.socket(zmq.PULL)
    receiver.setsockopt(zmq.RCVHWM, 40000)
    recv_port = zmq_socket_bind_range(receiver, pargs.listen)
    # Explicit check instead of assert: asserts are stripped under -O and the
    # port is needed to build the subscription target below.
    if recv_port is None:
        log.error("Failed to bind receiver socket on %s", pargs.listen)
        sys.exit(1)

    def aggregate_rpc(msg):
        # "agg" RPC: run one aggregation; counters are only kept when a
        # dispatcher component exists (it may also not be created yet when
        # an early RPC arrives).
        if component is None:
            jagg.do_aggregate_and_send(msg)
            return
        agg_rpcs = component.state.get("stats.agg_rpcs", 0) + 1
        num_sent = jagg.do_aggregate_and_send(msg)
        aggs_sent = component.state.get("stats.aggs_sent", 0) + num_sent
        component.update({"stats.agg_rpcs": agg_rpcs, "stats.aggs_sent": aggs_sent})

    def show_mcache(msg):
        # "show_mcache" RPC: hand back the aggregator's metric caches.
        return jagg.metric_caches

    def subscribe_collectors(__msg):
        # Ask every message bus to push this job's messages to our PULL port.
        for msgb in pargs.msgbus:
            log.info("subscribing to msgs of job %s at %s", pargs.jobid, msgb)
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "subscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port), J=pargs.jobid)

    def unsubscribe_and_quit(__msg):
        # Unsubscribe from every message bus, then terminate immediately.
        for msgb in pargs.msgbus:
            log.info("unsubscribing jobid %s from %s", pargs.jobid, msgb)
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "unsubscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port))
        os._exit(0)

    rpc = RPCThread(context, listen=pargs.cmd_port)
    rpc.start()
    rpc.register_rpc("agg", aggregate_rpc)
    rpc.register_rpc("quit", unsubscribe_and_quit, early_reply=True)
    rpc.register_rpc("resubscribe", subscribe_collectors)
    rpc.register_rpc("show_mcache", show_mcache)

    # subscribe to message bus
    subscribe_collectors(None)

    if len(pargs.dispatcher) > 0:
        me_addr = zmq_own_addr_for_uri(pargs.dispatcher)
        me_listen = "tcp://%s:%d" % (me_addr, recv_port)
        me_rpc = "tcp://%s:%d" % (me_addr, rpc.port)
        state = get_kwds(component="job_agg", cmd_port=me_rpc, listen=me_listen,
                         jobid=pargs.jobid)
        component = ComponentState(context, pargs.dispatcher, state=state)
        rpc.register_rpc("resend_state", component.reset_timer)

    tstart = None
    log.info("Started msg receiver on %s", pargs.listen)
    count = 0
    while True:
        try:
            s = receiver.recv()
            msg = json.loads(s)

            cmd = None
            if "_COMMAND_" in msg:
                cmd = msg["_COMMAND_"]
            cmd_name = cmd["cmd"] if cmd is not None else None

            if cmd_name == "quit":
                log.info("Stopping job aggregator for jobid %s on 'quit' command.",
                         pargs.jobid)
                break
            elif cmd_name == "resend_state":
                log.info("State resend requested.")
                if component is not None:
                    component.reset_timer()
                continue

            jagg.queue.put(msg)
            if count == 0 or cmd_name == "reset-stats":
                tstart = time.time()
                count = 0
            count += 1
            # Without a dispatcher there is no component to report to; the
            # unguarded call used to abort the loop on the first message.
            if component is not None:
                component.update({"stats.val_msgs_recvd": count})
            if (pargs.stats and count % 10000 == 0) or cmd_name == "show-stats":
                tend = time.time()
                elapsed = tend - tstart
                # Guard against a zero interval (stats requested within the
                # same clock tick as the reset).
                rate = float(count) / elapsed if elapsed > 0 else 0.0
                sys.stderr.write("%d msgs in %f seconds, %f msg/s\n" %
                                 (count, elapsed, rate))
                sys.stderr.flush()
        except Exception as e:
            print("Exception in msg receiver: %r" % e)
            jagg.stopping = True
            break
    time.sleep(0.1)
    print("%d messages received" % count)
def aggmon_data_store(argv):
    """Run the data store daemon.

    Parses ``argv``, configures logging, creates and starts a ``DataStore``
    from the backend/host/port/db options, binds a ZeroMQ PULL socket
    (``--listen``), starts an ``RPCThread`` on ``--cmd-port``, subscribes to
    every ``--msgbus`` endpoint and then loops, decoding each received message
    as JSON and putting it on ``store.queue``.

    A ``quit`` command message terminates the process immediately via
    ``os._exit(0)``; any exception raised while handling a message ends the
    receive loop.

    Args:
        argv: list of command line arguments (without the program name).

    Exits the process (``SystemExit``) when the arguments are invalid, the
    data store cannot be created or the receiver socket cannot be bound.
    """
    global component

    ap = argparse.ArgumentParser()
    ap.add_argument('-g', '--group', default="universe", action="store",
                    help="group/cluster served by this daemon instance")
    ap.add_argument('-C', '--cmd-port', default="tcp://0.0.0.0:5511", action="store",
                    help="RPC command port")
    ap.add_argument('-D', '--dispatcher', default="", action="store",
                    help="agg_control dispatcher RPC command port")
    # type=int: without it a user supplied value is a string and the TTL
    # computation below would repeat the string instead of multiplying.
    ap.add_argument('-e', '--expire', type=int, default=180, action="store",
                    help="days for expiring value metrics")
    ap.add_argument('-b', '--backend', default="mongodb", action="store",
                    help="database backend(s), comma separated. Default is 'mongodb'.")
    ap.add_argument('-H', '--host', default="localhost", action="store",
                    help="data store host")
    ap.add_argument('-n', '--port', default=None, action="store",
                    help="data store port")
    ap.add_argument('-d', '--dbname', default="metricdb", action="store",
                    help="database name")
    ap.add_argument('-P', '--prefix', default="gmetric", action="store",
                    help="collections prefix")
    ap.add_argument('-u', '--user', default="", action="store",
                    help="user name")
    ap.add_argument('-p', '--passwd', default="", action="store",
                    help="password")
    ap.add_argument('-l', '--log', default="info", action="store",
                    help="logging: info, debug, ...")
    ap.add_argument('-L', '--listen', default="tcp://0.0.0.0:5550", action="store",
                    help="zmq pull port to listen on")
    ap.add_argument('-M', '--msgbus', default=[], action="append",
                    help="subscription port(s) for message bus. can be used multiple times.")
    ap.add_argument('-s', '--stats', default=False, action="store_true",
                    help="print statistics info")
    ap.add_argument('-v', '--verbose', type=int, default=0, action="store",
                    help="verbosity")
    pargs = ap.parse_args(argv)

    # Look the level up by name instead of eval()-ing a command line string;
    # anything that is not a numeric logging level is rejected up front.
    log_level = getattr(logging, pargs.log.upper(), None)
    if not isinstance(log_level, int):
        ap.error("invalid log level: %r" % pargs.log)
    FMT = "%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s"
    logging.basicConfig(stream=sys.stderr, level=log_level, format=FMT)

    pargs.backend = pargs.backend.split(",")
    if pargs.port:
        pargs.port = pargs.port.split(",")
    # TODO: get rid of this and move it into (shared) config
    else:
        pargs.port = [None, None]

    # open DB
    try:
        store = DataStore(pargs.backend, pargs.host, pargs.port, pargs.dbname,
                          pargs.user, pargs.passwd, pargs.group,
                          coll_prefix=pargs.prefix,
                          value_metrics_ttl=pargs.expire * 24 * 3600)
    except Exception as e:
        log.error("Failed to create DataStore: %r", e)
        sys.exit(1)
    store.start()

    context = zmq.Context()

    # Socket to receive messages on
    receiver = context.socket(zmq.PULL)
    receiver.setsockopt(zmq.RCVHWM, 40000)
    recv_port = zmq_socket_bind_range(receiver, pargs.listen)
    # Explicit check instead of assert: asserts are stripped under -O and the
    # port is needed to build the subscription target below.
    if recv_port is None:
        log.error("Failed to bind receiver socket on %s", pargs.listen)
        sys.exit(1)

    def subscribe_collectors(__msg):
        # Ask every message bus to push all messages to our PULL port.
        for msgb in pargs.msgbus:
            log.info("subscribing to all msgs at '%s'", msgb)
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "subscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port))

    def unsubscribe_and_quit(__msg):
        # Unsubscribe from every message bus, then terminate immediately.
        for msgb in pargs.msgbus:
            log.info("unsubscribing from '%s'", msgb)
            me_addr = zmq_own_addr_for_uri(msgb)
            send_rpc(context, msgb, "unsubscribe",
                     TARGET="tcp://%s:%d" % (me_addr, recv_port))
        os._exit(0)

    rpc = RPCThread(context, listen=pargs.cmd_port)
    rpc.start()
    rpc.register_rpc("quit", unsubscribe_and_quit, early_reply=True)
    rpc.register_rpc("resubscribe", subscribe_collectors)

    if len(pargs.dispatcher) > 0:
        me_addr = zmq_own_addr_for_uri(pargs.dispatcher)
        me_listen = "tcp://%s:%d" % (me_addr, recv_port)
        me_rpc = "tcp://%s:%d" % (me_addr, rpc.port)
        state = get_kwds(component="data_store", cmd_port=me_rpc, listen=me_listen,
                         group=pargs.group)
        component = ComponentState(context, pargs.dispatcher, state=state)
        rpc.register_rpc("resend_state", component.reset_timer)

    # subscribe to message bus
    subscribe_collectors(None)

    tstart = None
    log.info("Started msg receiver on %s", pargs.listen)
    count = 0
    while True:
        try:
            s = receiver.recv()
            log.debug("received msg on PULL port: %r", s)
            msg = json.loads(s)

            cmd = None
            if "_COMMAND_" in msg:
                log.info("_COMMAND_ received: msg = %r", msg)
                cmd = msg["_COMMAND_"]
            cmd_name = cmd["cmd"] if cmd is not None else None

            if cmd_name == "quit":
                log.info("Stopping data_store on 'quit' command.")
                # raw exit!!! The process ends here; nothing after this
                # call runs.
                os._exit(0)
            elif cmd_name == "resend_state":
                log.info("State resend requested.")
                if component is not None:
                    component.reset_timer()
                continue

            store.queue.put(msg)
            if count == 0 or cmd_name == "reset-stats":
                tstart = time.time()
                count = 0
            count += 1
            if component is not None:
                component.update({"stats.msgs_recvd": count})
            if (pargs.stats and count % 10000 == 0) or cmd_name == "show-stats":
                tend = time.time()
                elapsed = tend - tstart
                # Guard against a zero interval (stats requested within the
                # same clock tick as the reset).
                rate = float(count) / elapsed if elapsed > 0 else 0.0
                sys.stdout.write("%d msgs in %f seconds, %f msg/s\n" %
                                 (count, elapsed, rate))
                sys.stdout.flush()
        except Exception as e:
            print("Exception in msg receiver: %r" % e)
            break
    log.info("THE END")