def __init__(self, name=ev("PUBLISHER_NAME", "kombu-publisher"), auth_url=ev("BROKER_URL", "redis://localhost:6379/0"), ssl_options={}, max_general_failures=-1): # infinite retries """ Available Transports: https://github.com/celery/kombu#transport-comparison """ self.state = "not_ready" self.name = name self.auth_url = auth_url self.ssl_options = ssl_options self.exchange = None self.queue = None self.declare_entities = [] self.conn = None self.channel = None self.producer = None self.num_setup_failures = 0 self.num_publish_failures = 0 self.max_general_failures = max_general_failures self.exchange_name = "" self.exchange_type = "direct" self.queue_name = "" self.routing_key = "" self.serializer = "json"
def __init__(self, name="message-processor", sub_auth_url=ev("SUB_BROKER_URL", "redis://localhost:6379/0"), sub_ssl_options={}, sub_serializer="application/json", sub_silent=False, pub_auth_url=ev("PUB_BROKER_URL", "redis://localhost:6379/0"), pub_ssl_options={}, pub_serializer="json", pub_silent=False): self.name = name self.recv_msgs = [] self.sub_auth_url = sub_auth_url self.pub_auth_url = pub_auth_url self.sub_ssl_options = sub_ssl_options self.pub_ssl_options = pub_ssl_options self.sub_serializer = sub_serializer self.pub_serializer = pub_serializer self.pub_queue_name = None self.sub = None self.pub = None self.exchange = None self.exchange_name = "" self.queue = None self.queue_name = "" self.routing_key = None self.pub_routing_key = None self.pub_hook_version = 1 self.sub_verbose = not sub_silent self.pub_verbose = not pub_silent
def capture_arp_packets():
    """capture_arp_packets

    Capture ``ARP`` packets and call the ``handle_packets`` method

    Change the network interface by ``export CAP_DEVICE=eth0``

    """
    dev = ev("CAP_DEVICE", "lo")

    """
    Ignore ports for forwarding to consolidators:

    Redis VM: 6379, 16379
    RabbitMQ VM: 5672, 15672, 25672
    """

    # http://biot.com/capstats/bpf.html
    default_filter = "arp"
    custom_filter = ev("NETWORK_FILTER", default_filter)

    log.info(("starting device={} filter={}")
             .format(dev, custom_filter))

    scapy.sniff(filter=custom_filter, prn=handle_packets)

    log.info("done")
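# The capture functions in this section assume a ``handle_packets`` callback
# is already defined; the real project wires this into its packet-processing
# pipeline, so the version below is only a hypothetical stand-in for local
# testing that logs a one-line summary of each sniffed frame.
def handle_packets(packet):
    # scapy passes every captured packet to the prn callback
    log.info("captured packet={}".format(packet.summary()))
# end of handle_packets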
def example_capture():
    """example_capture

    An example capture script

    Change the network interface by ``export CAP_DEVICE=eth0``

    """
    dev = ev("CAP_DEVICE", "lo")

    """
    Ignore ports for forwarding to consolidators:

    Redis Internal VM: 6379, 16379
    RabbitMQ Internal VM: 5672, 15672, 25672
    """

    # http://biot.com/capstats/bpf.html
    custom_filter = ("(udp and portrange 10000-17001) "
                     "or (tcp and portrange 80) "
                     "or arp "
                     "or icmp")

    log.info(("starting device={} filter={}")
             .format(dev, custom_filter))

    scapy.sniff(filter=custom_filter, prn=handle_packets)

    log.info("done")
def __init__(self, name=ev("SUBSCRIBER_NAME", "celery-subscriber"), auth_url=ev("BROKER_URL", "redis://localhost:6379/0"), app=None, ssl_options={}, transport_options={}, worker_log_format="%(asctime)s: %(levelname)s %(message)s", **kwargs): """ Available Brokers: http://docs.celeryproject.org/en/latest/getting-started/brokers/index.html Redis: http://docs.celeryproject.org/en/latest/getting-started/brokers/redis.html RabbitMQ: http://docs.celeryproject.org/en/latest/getting-started/brokers/rabbitmq.html SQS: http://docs.celeryproject.org/en/latest/getting-started/brokers/sqs.html """ self.state = "not_ready" self.name = name self.auth_url = auth_url self.ssl_options = ssl_options self.transport_options = transport_options self.subscriber_app = None # allow passing in an initialized Celery application if app: self.subscriber_app = app else: self.subscriber_app = Celery() # update the celery configuration from the kwargs dictionary self.subscriber_app.conf.update(kwargs) # make sure to set the broker_url self.subscriber_app.conf.broker_url = self.auth_url self.subscriber_app.conf.worker_log_format = worker_log_format self.exchange = None self.consume_from_queues = []
def __init__(self, name=ev("SUBSCRIBER_NAME", "kombu-subscriber"), auth_url=ev("BROKER_URL", "redis://localhost:6379/0"), ssl_options={}, max_general_failures=-1): # infinite retries """ Available Brokers: http://docs.celeryproject.org/en/latest/getting-started/brokers/index.html Redis: http://docs.celeryproject.org/en/latest/getting-started/brokers/redis.html RabbitMQ: http://docs.celeryproject.org/en/latest/getting-started/brokers/rabbitmq.html SQS: http://docs.celeryproject.org/en/latest/getting-started/brokers/sqs.html """ self.state = "not_ready" self.name = name self.auth_url = auth_url self.ssl_options = ssl_options self.conn = None self.new_conn = None self.channel = None self.consumer = None self.process_message_callback = None self.drain_time = 1.0 self.num_setup_failures = 0 self.num_consume_failures = 0 self.max_general_failures = max_general_failures self.exchange = None self.exchange_name = "" self.routing_key = "" self.serializer = "json" self.queue = None self.queue_name = "" self.consume_from_queues = []
def build_ssl_options(ca_cert="", keyfile="", certfile="", ssl_required="0"): use_ca_certs = ev("SSL_CA_CERT", ca_cert) use_keyfile = ev("SSL_KEYFILE", keyfile) use_certfile = ev("SSL_CERTFILE", certfile) use_ssl_required = ev("SSL_REQUIRED", ssl_required) == "1" ssl_options = {} if use_ca_certs: ssl_options["ca_certs"] = use_ca_certs if use_keyfile: ssl_options["keyfile"] = use_keyfile if use_certfile: ssl_options["certfile"] = use_certfile if use_ssl_required: ssl_options["cert_reqs"] = ssl.CERT_REQUIRED return ssl_options
def setUp(self):
    if self.debug:
        print("setUp")

    # state trips in the custom classes
    os.environ["TEST_STOP_DONE"] = "1"

    self.last_pub_msg = None
    self.last_sub_msg = None
    self.pub = None
    self.sub = None
    self.pub_msgs = []
    self.sub_msgs = []

    self.exchange_name = ev("TEST_EXCHANGE", "test.events")
    self.routing_key = ev("TEST_ROUTING_KEY", "test.events.conversions")
    self.queue_name = ev("TEST_QUEUE", "test.events.conversions")

    self.exchange = None
    self.queue = None

    self.last_sub_callback = None
def relay_callback(body, message):

    pub_auth_url = ev("RELAY_WORKER_BROKER_URL",
                      "pyamqp://*****:*****@localhost:5672//")
    pub_backend_url = ev("RELAY_BACKEND_URL",
                         "redis://localhost:6379/12")
    path_to_config_module = ev("RELAY_CONFIG_MODULE",
                               "ecomm_app.ecommerce.celeryconfig_pub_sub")

    app = ecomm_app.ecommerce.tasks.get_celery_app(
        name=ev("RELAY_NAME", "ecomm-relay"),
        auth_url=pub_auth_url,
        backend_url=pub_backend_url,
        path_to_config_module=path_to_config_module)

    task_name = ev("RELAY_TASK_NAME",
                   "ecomm_app.ecommerce.tasks.handle_user_conversion_events")

    now = datetime.now().isoformat()
    body = {"account_id": 999,
            "subscription_id": 321,
            "stripe_id": 876,
            "created": now,
            "product_id": "JJJ",
            "version": 1,
            "org_msg": body,
            "msg_id": str(uuid.uuid4())}

    source_info = {"msg_proc": ev("RELAY_NAME", "ecomm_relay")}

    log.info(("Sending broker={} "
              "body={}").format(app.conf.broker_url, body))

    result = app.send_task(task_name, (body, source_info))

    if "simulate_processing_lag" in body:
        log.info(("task - {} - simulating processing "
                  "lag={} sleeping").format(
                    task_name,
                    body["simulate_processing_lag"]))
        time.sleep(float(body["simulate_processing_lag"]))
    # end of handling adding artificial lag for testing Celery

    log.info(("Done with msg_id={} result={}").format(
        body["msg_id"], result.get()))

    # now that the message has been
    # sent to the celery ecomm worker
    # we can ack the message which
    # deletes it from the source queue
    # the message processor uses
    message.ack()
recv_msgs = []


def handle_message(body, message):
    log.info(("callback received msg "
              "body={}").format(body))
    recv_msgs.append(body)
    message.ack()
# end of handle_message


# Initialize KombuSubscriber
# http://docs.celeryproject.org/en/latest/getting-started/brokers/sqs.html
# https://github.com/celery/kombu/blob/master/kombu/transport/SQS.py
aws_key = ev("SQS_AWS_ACCESS_KEY", "not_a_key")
aws_secret = ev("SQS_AWS_SECRET_KEY", "not_a_secret")
sqs_auth_url = ev("BROKER_URL", "sqs://{}:{}@".format(aws_key, aws_secret))

transport_options = {}
ssl_options = {}
sub = KombuSubscriber("kombu-sqs-subscriber",
                      sqs_auth_url,
                      ssl_options)

# sample: "sqs://*****:*****@"
# ^ from the doc: 'you must remember to include the "@" at the end.'

# Now consume:
seconds_to_consume = 10.0
serializer = "application/json"
queue = "test1"
exchange = "test1"
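# A sketch of draining the SQS queue with the values above; the trailing
# arguments assume the same consume() signature used by the other
# KombuSubscriber examples in this section, and routing_key=None is an
# assumption for the SQS transport.
sub.consume(callback=handle_message,
            queue=queue,
            exchange=exchange,
            routing_key=None,
            serializer=serializer,
            time_to_wait=seconds_to_consume)

log.info("consumed messages={}".format(len(recv_msgs)))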
recv_msgs = []


def handle_message(body, message):
    log.info(("callback received msg "
              "body={}").format(body))
    recv_msgs.append(body)
    message.ack()
# end of handle_message


# Initialize KombuSubscriber
ssl_options = {}
sub = KombuSubscriber("kombu-mixin-subscriber",
                      ev("SUB_BROKER_URL",
                         "amqp://*****:*****@localhost:5672//"),
                      ssl_options)

# Now consume:
seconds_to_consume = 10.0
heartbeat = 60
serializer = "application/json"
exchange = ev("CONSUME_EXCHANGE", "reporting.payments")
routing_key = ev("CONSUME_ROUTING_KEY", "reporting.payments")
queue = ev("CONSUME_QUEUE", "reporting.payments")
sub.consume(callback=handle_message,
            queue=queue,
            exchange=exchange,
            routing_key=routing_key,
            serializer=serializer,
            # the trailing arguments assume the consume() signature shown
            # in the test helpers later in this section
            heartbeat=heartbeat,
            time_to_wait=seconds_to_consume)
import logging

from kombu.mixins import ConsumerProducerMixin
from celery_connectors.utils import ev
from celery_connectors.utils import build_msg
from celery_connectors.utils import get_exchange_from_msg
from celery_connectors.utils import get_routing_key_from_msg
from celery_connectors.run_publisher import run_publisher

# Credits and inspirations from these great sources:
#
# https://github.com/celery/kombu/blob/master/examples/rpc-tut6/rpc_server.py
# https://gist.github.com/oubiwann/3843016
# https://gist.github.com/eavictor/ee7856581619ac60643b57987b7ed580#file-mq_kombu_rpc_server-py
# https://github.com/Skablam/kombu-examples
# https://gist.github.com/mlavin/6671079

log = logging.getLogger(ev("APP_NAME", "relay"))


class RelayWorker(ConsumerProducerMixin):

    def __init__(self,
                 name="relay",
                 conn=None,
                 callback=None,
                 task_queues=[],
                 prefetch_count=1,
                 relay_exchange=None,
                 relay_exchange_type=None,
                 relay_routing_key=None,
                 relay_queue=None,
                 relay_broker_url=None,
                 relay_ssl_options={},
def publish_processed_network_packets(name="not-set", task_queue=None, result_queue=None, need_response=False, shutdown_msg="SHUTDOWN"): """ # Redis/RabbitMQ/SQS messaging endpoints for pub-sub routing_key = ev("PUBLISH_EXCHANGE", "reporting.accounts") queue_name = ev("PUBLISH_QUEUE", "reporting.accounts") auth_url = ev("PUB_BROKER_URL", "redis://localhost:6379/0") serializer = "json" """ # these keys need to be cycled to prevent # exploiting static keys filter_key = ev("IGNORE_KEY", INCLUDED_IGNORE_KEY) forward_host = ev("FORWARD_HOST", "127.0.0.1") forward_port = int(ev("FORWARD_PORT", "80")) include_filter_key = ev("FILTER_KEY", "") if not include_filter_key and filter_key: include_filter_key = filter_key filter_keys = [filter_key] log.info(("START consumer={} " "forward={}:{} with " "key={} filters={}").format(name, forward_host, forward_port, include_filter_key, filter_key)) forward_skt = None not_done = True while not_done: if not forward_skt: forward_skt = connect_forwarder(forward_host=forward_host, forward_port=forward_port) next_task = task_queue.get() if next_task: if str(next_task) == shutdown_msg: # Poison pill for shutting down log.info(("{}: DONE CALLBACK " "Exiting msg={}").format(name, next_task)) task_queue.task_done() break # end of handling shutdown case try: log.debug(("{} parsing").format(name)) source = next_task.source packet = next_task.payload if not packet: log.error(("{} invalid task found " "{} missing payload").format(name, next_task)) break log.debug(("{} found msg from src={}").format(name, source)) network_data = parse_network_data( data_packet=packet, include_filter_key=include_filter_key, filter_keys=filter_keys) if network_data["status"] == VALID: if network_data["data_type"] == TCP \ or network_data["data_type"] == UDP \ or network_data["data_type"] == ARP \ or network_data["data_type"] == ICMP: log.info( ("{} valid={} packet={} " "data={}").format(name, network_data["id"], network_data["data_type"], network_data["target_data"])) if not forward_skt: forward_skt = connect_forwarder( forward_host=forward_host, forward_port=forward_port) if forward_skt: if network_data["stream"]: sent = False while not sent: try: log.info("sending={}".format( network_data["stream"])) send_msg( forward_skt, network_data["stream"].encode( "utf-8")) sent = True except Exception as e: sent = False time.sleep(0.5) try: forward_skt.close() forward_skt = None except Exception as w: forward_skt = None forward_skt = connect_forwarder( forward_host=forward_host, forward_port=forward_port) # end of reconnecting log.info("sent={}".format( network_data["stream"])) if need_response: log.info("receiving") cdr_res = forward_skt.recv(1024) log.info(("cdr - res{}").format(cdr_res)) else: log.info(("{} EMPTY stream={} " "error={} status={}").format( name, network_data["stream"], network_data["err"], network_data["status"])) else: log.info(("{} not_supported valid={} " "packet data_type={} status={}").format( name, network_data["id"], network_data["data_type"], network_data["status"])) elif network_data["status"] == FILTERED: log.info(("{} filtered={} status={}").format( name, network_data["filtered"], network_data["status"])) else: if network_data["status"] == INVALID: log.info(("{} invalid={} packet={} " "error={} status={}").format( name, network_data["id"], network_data["data_type"], network_data["error"], network_data["status"])) else: log.info(("{} unknown={} packet={} " "error={} status={}").format( name, network_data["id"], network_data["data_type"], network_data["error"], 
network_data["status"])) # end of if valid or not data except KeyboardInterrupt as k: log.info(("{} stopping").format(name)) break except Exception as e: log.error(("{} failed packaging packet to forward " "with ex={}").format(name, e)) break # end of try/ex during payload processing # end of if found a next_task log.info(("Consumer: {} {}").format(name, next_task)) task_queue.task_done() if need_response: answer = "processed: {}".format(next_task()) result_queue.put(answer) # end of while if forward_skt: try: forward_skt.close() log.info("CLOSED connection") forward_skt = None except Exception: log.info("CLOSED connection") # end of cleaning up forwarding socket log.info("{} Done".format(name)) return
def build_new_deep_neural_network_from_env_variables():
    """build_new_deep_neural_network_from_env_variables

    Build a new deep neural network from environment variables:

    ``CSV_FILE`` - file to process created during prepare dataset
    ``CSV_META_FILE`` - metadata header file created during prepare dataset
    ``PREDICT_FEATURE`` - column to predict
    ``TEST_SIZE`` - split data into percentage of test to training

    """
    csv_file = ev("CSV_FILE", "/tmp/cleaned_attack_scans.csv")
    meta_file = ev("CSV_META_FILE", "/tmp/cleaned_metadata.json")
    predict_feature = ev("PREDICT_FEATURE", "label_value")
    test_size = float(ev("TEST_SIZE", "0.20"))

    if not os.path.exists(csv_file):
        log.error(("missing csv_file={}").format(csv_file))
        sys.exit(1)

    res = build_training_request(csv_file=csv_file,
                                 meta_file=meta_file,
                                 predict_feature=predict_feature,
                                 test_size=test_size)

    if res["status"] != VALID:
        log.error(("Stopping for status={} "
                   "errors: {}").format(res["status"], res["err"]))
        sys.exit(1)
    else:
        log.info(("built_training_request={} "
                  "features={} ignore={}").format(
                    res["status"],
                    res["features_to_process"],
                    res["ignore_features"]))
    # end of validating the training request

    log.info("ready for training")

    log.info("creating Keras - sequential model")

    # create the model
    model = Sequential()
    model.add(
        Dense(8,
              input_dim=len(res["features_to_process"]),
              kernel_initializer="uniform",
              activation="relu"))
    model.add(
        Dense(6,
              kernel_initializer="uniform",
              activation="relu"))
    model.add(
        Dense(1,
              kernel_initializer="uniform",
              activation="sigmoid"))

    log.info("compiling model")

    # compile the model
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    log.info("fitting model - please wait")

    # fit the model
    model.fit(res["X_train"],
              res["Y_train"],
              validation_data=(res["X_test"], res["Y_test"]),
              epochs=50,
              batch_size=2,
              verbose=1)

    # evaluate the model
    scores = model.evaluate(res["X_test"], res["Y_test"])
    log.info(("Accuracy: {}").format(scores[1] * 100))
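# Not part of the original script: a minimal sketch of persisting the
# trained network so it can be reloaded later, using standard Keras calls;
# the output paths below are placeholders.
model_json_file = "/tmp/deep-neural-network.json"
model_weights_file = "/tmp/deep-neural-network-weights.h5"

# serialize the architecture to JSON and the weights to HDF5 (needs h5py)
with open(model_json_file, "w") as f:
    f.write(model.to_json())
model.save_weights(model_weights_file)

log.info(("saved model json={} weights={}").format(
    model_json_file, model_weights_file))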
from kombu import Exchange, Queue
from spylunking.log.setup_logging import build_colorized_logger
from celery_connectors.utils import ERROR
from celery_connectors.utils import ev
from celery_connectors.utils import build_sample_msgs
from celery_connectors.utils import calc_backoff_timer
from celery_connectors.build_ssl_options import build_ssl_options

# Credits and inspirations from these great sources:
#
# https://github.com/celery/kombu/blob/master/examples/rpc-tut6/rpc_server.py
# https://gist.github.com/oubiwann/3843016
# https://gist.github.com/eavictor/ee7856581619ac60643b57987b7ed580#file-mq_kombu_rpc_server-py
# https://github.com/Skablam/kombu-examples
# https://gist.github.com/mlavin/6671079

name = ev("APP_NAME", "robopubsub")
log = build_colorized_logger(
    name=name)

broker_url = ev("PUB_BROKER_URL", "pyamqp://*****:*****@localhost:5672//")
exchange_name = ev("PUBLISH_EXCHANGE", "ecomm.api")
exchange_type = ev("PUBLISH_EXCHANGE_TYPE", "topic")
routing_key = ev("PUBLISH_ROUTING_KEY", "ecomm.api.west")
queue_name = ev("PUBLISH_QUEUE", "ecomm.api.west")
prefetch_count = int(ev("PREFETCH_COUNT", "1"))
priority_routing = {"high": queue_name,
                    "low": queue_name}
use_exchange = Exchange(exchange_name, type=exchange_type)
use_routing_key = routing_key
use_queue = Queue(queue_name, exchange=use_exchange, routing_key=routing_key)
import logging
from kombu import Connection
from celery_connectors.utils import ev
from celery_connectors.relay_json_to_celery_worker import RelayJSONtoCeleryWorker

# Credits and inspirations from these great sources:
#
# https://github.com/celery/kombu/blob/master/examples/rpc-tut6/rpc_server.py
# https://gist.github.com/oubiwann/3843016
# https://gist.github.com/eavictor/ee7856581619ac60643b57987b7ed580#file-mq_kombu_rpc_server-py
# https://github.com/Skablam/kombu-examples
# https://gist.github.com/mlavin/6671079

log = logging.getLogger(ev("APP_NAME", "jtoc"))


def run_jtoc_relay(broker_url,
                   ssl_options={},
                   transport_options={},
                   task_queues=[],
                   callback=None,
                   prefetch_count=1,
                   relay_broker_url=None,
                   relay_backend_url=None,
                   relay_exchange=None,
                   relay_routing_key=None,
                   relay_handler=None,
                   celery_app=None,
                   *args,
                   **kwargs):
from kombu import Exchange, Queue
from spylunking.log.setup_logging import build_colorized_logger
from celery_connectors.utils import ev
from celery_connectors.utils import build_sample_msgs
from celery_connectors.build_ssl_options import build_ssl_options
from celery_connectors.run_publisher import run_publisher

# Credits and inspirations from these great sources:
#
# https://github.com/celery/kombu/blob/master/examples/rpc-tut6/rpc_server.py
# https://gist.github.com/oubiwann/3843016
# https://gist.github.com/eavictor/ee7856581619ac60643b57987b7ed580#file-mq_kombu_rpc_server-py
# https://github.com/Skablam/kombu-examples
# https://gist.github.com/mlavin/6671079

name = ev("APP_NAME", "robopub")
log = build_colorized_logger(
    name=name)

broker_url = ev("PUB_BROKER_URL", "pyamqp://*****:*****@localhost:5672//")
exchange_name = ev("PUBLISH_EXCHANGE", "")
exchange_type = ev("PUBLISH_EXCHANGE_TYPE", "")
routing_key = ev("PUBLISH_ROUTING_KEY", "reporting.accounts")
queue_name = ev("PUBLISH_QUEUE", "reporting.accounts")
priority_routing = {"high": queue_name}
use_exchange = Exchange(exchange_name, type=exchange_type)
use_routing_key = routing_key
use_queue = Queue(queue_name, exchange=use_exchange, routing_key=routing_key)

task_queues = [
    use_queue
]
from kombu import Exchange, Queue
from spylunking.log.setup_logging import build_colorized_logger
from celery_connectors.utils import ev
from celery_connectors.build_ssl_options import build_ssl_options
from celery_connectors.run_jtoc_relay import run_jtoc_relay

# Credits and inspirations from these great sources:
#
# https://github.com/celery/kombu/blob/master/examples/rpc-tut6/rpc_server.py
# https://gist.github.com/oubiwann/3843016
# https://gist.github.com/eavictor/ee7856581619ac60643b57987b7ed580#file-mq_kombu_rpc_server-py
# https://github.com/Skablam/kombu-examples
# https://gist.github.com/mlavin/6671079

name = ev("APP_NAME", "jtoc_relay")
log = build_colorized_logger(name=name)

broker_url = ev("SUB_BROKER_URL", "pyamqp://*****:*****@localhost:5672//")
exchange_name = ev("CONSUME_EXCHANGE", "ecomm.api")
exchange_type = ev("CONSUME_EXCHANGE_TYPE", "topic")
routing_key = ev("CONSUME_ROUTING_KEY", "ecomm.api.west")
queue_name = ev("CONSUME_QUEUE", "ecomm.api.west")
prefetch_count = int(float(ev("PREFETCH_COUNT", "1")))
priority_routing = {"high": queue_name,
                    "low": queue_name}
use_exchange = Exchange(exchange_name, type=exchange_type)
use_queue = Queue(queue_name, exchange=use_exchange, routing_key=routing_key)
task_queues = [use_queue]
ssl_options = build_ssl_options()
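# Not from the original script: a sketch of handing the values above to
# run_jtoc_relay using the parameter names from its definition earlier in
# this section; the RELAY_* environment variable names, their defaults, and
# the relay_exchange value here are assumptions for illustration.
run_jtoc_relay(broker_url=broker_url,
               ssl_options=ssl_options,
               task_queues=task_queues,
               prefetch_count=prefetch_count,
               relay_broker_url=ev("RELAY_BROKER_URL",
                                   "redis://localhost:6379/0"),
               relay_backend_url=ev("RELAY_BACKEND_URL",
                                    "redis://localhost:6379/1"),
               relay_exchange=exchange_name,
               relay_routing_key=routing_key)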
def test_rabbitmq_consuming(self): # Integration Test the Subscriber Processor # This test just fills the queue for processing num_to_consume = 50000 num_sent = 0 num_to_send = num_to_consume msgs_to_send = [] msgs_by_id = {} self.exchange_name = ev("LOAD_TEST_EXCHANGE", "reporting") self.routing_key = ev("LOAD_TEST_ROUTING_KEY", "reporting.accounts") self.queue_name = ev("LOAD_TEST_QUEUE", "reporting.accounts") log.info(("Publishing {}/{} " "ex={} rk={} broker={}").format(num_sent, num_to_send, self.exchange_name, self.routing_key, self.pub_auth_url)) pub_retry = True not_done_publishing = True test_values = {"test_name": "large messages"} if len(msgs_to_send) == 0: while len(msgs_to_send) != num_to_send: test_msg = self.build_user_conversion_event_msg(test_values) msgs_to_send.append(test_msg) msgs_by_id[test_msg["msg_id"]] = False # end of building messages before slower publishing calls while not_done_publishing: if (num_sent % 1000 == 0) and num_sent > 0: log.info(("Published {} for " "{}/{} messages").format( get_percent_done(num_sent, num_to_send), num_sent, num_to_send)) # end of if print for tracing msg_body = None if num_sent < len(msgs_to_send): msg_body = msgs_to_send[num_sent] self.publish(body=msg_body, exchange=self.exchange_name, routing_key=self.routing_key, queue=self.queue_name, priority=0, ttl=None, serializer=self.pub_serializer, retry=pub_retry) num_sent += 1 if num_sent >= num_to_send: log.info(("Published {} ALL " "{}/{} messages").format( get_percent_done(num_sent, num_to_send), num_sent, num_to_send)) not_done_publishing = False elif num_sent >= len(msgs_to_send): log.info(("Published {} all " "{}/{} messages").format( get_percent_done(num_sent, len(msgs_to_send)), num_sent, num_to_send)) not_done_publishing = False # if should stop # end of not_done_publishing assert (num_sent == num_to_consume) os.system("list-queues.sh") log.info("") log.info(("display messages in the queues " "with routing_key={} again with:").format(self.routing_key)) log.info("list-queues.sh") log.info("")
def __init__(self): """__init__""" self.recv_msgs = [] # save every nth number of messages self.save_after_num = int( ev("SAVE_AFTER_NUM", "100")) # shutdown after this number of messages self.stop_after_num = int( ev("STOP_AFTER_NUM", "-1")) if self.save_after_num < 0: self.save_after_num = 1 if self.stop_after_num < 0: self.stop_after_num = None # shutdown if this file is found self.stop_for_file = ev( "STOP_FILE", "/tmp/stop-recording-csv") self.dataset_name = ev( "DS_NAME", "netdata") self.save_dir = ev( "DS_DIR", "/tmp") self.save_to_file = ev( "OUTPUT_CSV", "{}/{}-{}.csv".format( self.save_dir, self.dataset_name, rnow("%Y-%m-%d-%H-%M-%S"))) self.archive_file = ev( "ARCHIVE_JSON", "{}/packets-{}-{}.json".format( self.save_dir, self.dataset_name, rnow("%Y-%m-%d-%H-%M-%S"))) self.debug = bool(ev( "DEBUG_PACKETS", "0") == "1") self.df = None self.last_df = None self.eth_keys = {"eth_id": "id"} self.ip_keys = {"ip_id": "id"} self.ipvsix_keys = {"ipvsix_id": "id"} self.icmp_keys = {"icmp_id": "id"} self.arp_keys = {"arp_id": "id"} self.tcp_keys = {"tcp_id": "id"} self.udp_keys = {"udp_id": "id"} self.dns_keys = {"dns_id": "id"} self.raw_keys = {"raw_id": "id"} self.pad_keys = {"pad_id": "id"} self.all_keys = {} self.all_keys_list = [] self.all_eth = [] self.all_ip = [] self.all_ipvsix = [] self.all_icmp = [] self.all_arp = [] self.all_tcp = [] self.all_udp = [] self.all_dns = [] self.all_raw = [] self.all_pad = [] self.all_flat = [] self.all_rows = [] # noqa https://github.com/jay-johnson/antinex-client/blob/5fbcefaaed3d979b3c0829447b61592d5910ef22/antinex_client/build_ai_client_from_env.py#L19 self.client = build_ai_client_from_env() # the client uses environment variables: # noqa https://github.com/jay-johnson/antinex-client/blob/5fbcefaaed3d979b3c0829447b61592d5910ef22/antinex_client/consts.py#L23 # here is an example of what to export: # noqa https://github.com/jay-johnson/antinex-client/blob/master/examples/example-prediction.env self.request_dict = {} if ANTINEX_PUBLISH_ENABLED: if os.path.exists(ANTINEX_PUBLISH_REQUEST_FILE): with open(ANTINEX_PUBLISH_REQUEST_FILE, "r") as f: self.request_dict = json.loads(f.read())
#!/usr/bin/env python

from spylunking.log.setup_logging import build_colorized_logger
from celery_connectors.utils import ev
from celery_connectors.message_processor import MessageProcessor

name = "msg-proc"
log = build_colorized_logger(name=name)

log.info("Start - {}".format(name))

# want to change where you're subscribing vs publishing?
sub_ssl_options = {}
sub_auth_url = ev("SUB_BROKER_URL", "redis://localhost:6379/0")
pub_ssl_options = {}
pub_auth_url = ev("PUB_BROKER_URL", "redis://localhost:6379/0")

# start the message processor
msg_proc = MessageProcessor(name=name,
                            sub_auth_url=sub_auth_url,
                            sub_ssl_options=sub_ssl_options,
                            pub_auth_url=pub_auth_url,
                            pub_ssl_options=pub_ssl_options)

# configure where this is consuming:
queue = ev("CONSUME_QUEUE", "user.events.conversions")

# Relay Publish Hook - is disabled for this example
# where is it sending handled messages using a publish-hook or auto-caching:
exchange = None
routing_key = None
def handle_relay(self, body={}, message={}, relay_exchange=None, relay_routing_key=None, serializer="json", src_exchange=None, src_routing_key=None): """ Allow derived classes to customize how they 'handle_relay' """ task_result = None last_step = "validating" if not relay_exchange and not relay_routing_key: log.error(("Relay is misconfigured: please set either a " "relay_exchange={} or a relay_routing_key={}").format( relay_exchange, relay_routing_key)) return False try: last_step = "setting up base relay payload" base_relay_payload = {"org_msg": body, "relay_name": self.name} if self.verbose: log.debug("build relay_payload") last_step = "building relay payload" relay_payload = build_msg(base_relay_payload) if self.verbose: log.info(("relay ex={} rk={} id={}").format( relay_exchange, relay_routing_key, relay_payload["msg_id"])) last_step = "setting up task" task_name = ev( "RELAY_TASK_NAME", "ecomm_app.ecommerce.tasks." + "handle_user_conversion_events") if "task_name" in body: task_name = body["task_name"] now = datetime.datetime.now().isoformat() use_msg_id = "" if "msg_id" in body: use_msg_id = body["msg_id"] else: use_msg_id = relay_payload["msg_id"] source_info = { "relay": self.name, "src_exchange": src_exchange, "src_routing_key": src_routing_key } publish_body = { "account_id": 999, "subscription_id": 321, "stripe_id": 876, "created": now, "product_id": "JJJ", "version": 1, "r_id": relay_payload["msg_id"], "msg_id": use_msg_id } if self.verbose: log.info(("relay msg_id={} body={} " "broker={} backend={}").format( use_msg_id, publish_body, self.relay_broker_url, self.relay_transport_options)) else: log.info(("relay msg_id={} body={}").format( use_msg_id, str(publish_body)[0:30])) last_step = "send start - app" # http://docs.celeryproject.org/en/latest/reference/celery.html#celery.Celery app = Celery( broker=self.relay_broker_url, backend=self.relay_backend_url, transport_otions=self.relay_transport_options, task_ignore_result=True) # needed for cleaning up task results # these are targeted at optimizing processing on long-running tasks # while increasing reliability # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_prefetch_multiplier app.conf.worker_prefetch_multiplier = 1 # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_heartbeat app.conf.broker_heartbeat = 240 # seconds # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_connection_max_retries app.conf.broker_connection_max_retries = None # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-task_acks_late app.conf.task_acks_late = True # http://docs.celeryproject.org/en/latest/userguide/calling.html#calling-retry task_publish_retry_policy = { "interval_max": 1, "max_retries": 120, # None - forever "interval_start": 0.1, "interval_step": 0.2 } app.conf.task_publish_retry_policy = task_publish_retry_policy last_step = "send start - task={}".format(task_name) with app.producer_or_acquire(producer=None) as producer: """ http://docs.celeryproject.org/en/latest/reference/celery.app.task.html#celery.app.task.Task.apply_async retry (bool) – If enabled sending of the task message will be retried in the event of connection loss or failure. Default is taken from the task_publish_retry setting. Note that you need to handle the producer/connection manually for this to work. 
With a redis backend connection on restore of a broker the first time it appears to hang here indefinitely: task_result.get() Please avoid getting the relay task results until this is fixed """ task_result = app.send_task(task_name, (publish_body, source_info), retry=True, producer=producer, expires=300) # end of app producer block last_step = "send done - task={}".format(task_name) if task_result: log.info(("relay done with msg_id={}").format(body["msg_id"])) if "relay_simulate_processing_lag" in body["data"]: relay_sleep_duration = \ body["data"]["relay_simulate_processing_lag"] log.info(("task - {} - simulating processing lag " "sleep={} seconds").format(task_name, relay_sleep_duration)) time.sleep(float(relay_sleep_duration)) # end of handling adding artifical lag for testing Celery if self.verbose: if "msg_id" in body: log.info( ("relay done - " "msg_id={} r_id={}").format(use_msg_id, relay_payload["msg_id"])) else: log.info(("relay done - " "msg_id={} r_id={}" "body={}").format(use_msg_id, relay_payload["msg_id"], str(body)[0:30])) # end of logging except Exception as e: log.error(("Task Relay failed: with ex={} when sending " "to relay_exchange={} relay_routing_key={} " "last_step={}").format(e, relay_exchange, relay_routing_key, last_step)) return False # end of try/ex return True
conn_attrs = {
    "task_default_queue": "celery.redis.sub",
    "task_default_exchange": "celery.redis.sub",
    # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_prefetch_multiplier
    "worker_prefetch_multiplier": 1,  # consume 1 message at a time
    # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_prefetch_multiplier
    "prefetch_count": 3,  # consume 1 message at a time per worker (3 workers)
    # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_heartbeat
    "broker_heartbeat": 240,  # in seconds
    # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_connection_max_retries
    "broker_connection_max_retries": None,  # None is forever
    # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-task_acks_late
    "task_acks_late": True,  # on consume do not send an immediate ack back
    "task_publish_retry_policy": task_publish_retry_policy
}

app = Celery()
sub = Subscriber("redis-subscriber",
                 ev("SUB_BROKER_URL", "redis://localhost:6379/0"),
                 app,
                 ssl_options,
                 **conn_attrs)

# Now consume:
queue = ev("CONSUME_QUEUE", "reporting.accounts")
sub.consume(callback=handle_message,
            queue=queue,
            exchange=None,
            routing_key=None,
            prefetch_count=conn_attrs["prefetch_count"])

log.info("End - {}".format(name))
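# The consume() call above assumes a ``handle_message`` callback was defined
# earlier in the full script; a minimal sketch matching the handlers shown
# in the other subscriber examples in this section:
def handle_message(body, message):
    log.info(("callback received msg "
              "body={}").format(body))
    message.ack()
# end of handle_message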
def prepare_new_dataset(): """prepare_new_dataset""" clean_dir = ev( "OUTPUT_DIR", "/tmp") clean_file = ev( "CLEANED_FILE", "{}/cleaned_attack_scans.csv".format( clean_dir)) fulldata_file = ev( "FULLDATA_FILE", "{}/fulldata_attack_scans.csv".format( clean_dir)) dataset_dir = ev( "DS_DIR", "/opt/antinex/datasets") csv_glob_path = ev( "DS_GLOB_PATH", "{}/*/*.csv".format( dataset_dir)) pipeline_files = find_all_pipeline_csvs( csv_glob_path=csv_glob_path) post_proc_rules = { "drop_columns": [ "src_file", "raw_id", "raw_load", "raw_hex_load", "raw_hex_field_load", "pad_load", "eth_dst", # need to make this an int "eth_src", # need to make this an int "ip_dst", # need to make this an int "ip_src" # need to make this an int ], "predict_feature": "label_name" } label_rules = { "set_if_above": 85, "labels": ["not_attack", "attack"], "label_values": [0, 1] } log.info("building csv") save_node = build_csv( pipeline_files=pipeline_files, fulldata_file=fulldata_file, clean_file=clean_file, post_proc_rules=post_proc_rules, label_rules=label_rules) if save_node["status"] == VALID: log.info("Successfully process datasets:") if ev("SHOW_SUMMARY", "1") == "1": log.info(("Full csv: {}") .format(save_node["fulldata_file"])) log.info(("Full meta: {}") .format(save_node["fulldata_metadata_file"])) log.info(("Clean csv: {}") .format(save_node["clean_file"])) log.info(("Clean meta: {}") .format(save_node["clean_metadata_file"])) log.info("------------------------------------------") log.info(("Predicting Feature: {}") .format(save_node["feature_to_predict"])) log.info(("Features to Process: {}") .format(ppj(save_node["features_to_process"]))) log.info(("Ignored Features: {}") .format(ppj(save_node["ignore_features"]))) log.info("------------------------------------------") # end of show summary log.info("") log.info("done saving csv:") log.info("Full: {}".format( save_node["fulldata_file"])) log.info("Cleaned (no-NaNs in columns): {}".format( save_node["clean_file"])) log.info("") else: log.info("Failed to process datasets")
def run_main(need_response=False, callback=None): """run_main start the packet consumers and the packet processors :param need_response: should send response back to publisher :param callback: handler method """ stop_file = ev("STOP_FILE", "/opt/stop_recording") num_workers = int(ev("NUM_WORKERS", "1")) shutdown_msg = "SHUTDOWN" log.info("Start - {}".format(name)) log.info("Creating multiprocessing queue") tasks = multiprocessing.JoinableQueue() queue_to_consume = multiprocessing.Queue() host = "localhost" # Start consumers log.info("Starting Consumers to process queued tasks") consumers = start_consumers_for_queue( num_workers=num_workers, tasks=tasks, queue_to_consume=queue_to_consume, shutdown_msg=shutdown_msg, consumer_class=WorkerToProcessPackets, callback=callback) log.info("creating socket") skt = create_layer_2_socket() log.info("socket created") not_done = True while not_done: if not skt: log.info("Failed to create layer 2 socket") log.info("Please make sure to run as root") not_done = False break try: if os.path.exists(stop_file): log.info(("Detected stop_file={}").format(stop_file)) not_done = False break # stop if the file exists # Only works on linux packet = skt.recvfrom(65565) if os.path.exists(stop_file): log.info(("Detected stop_file={}").format(stop_file)) not_done = False break # stop if the file was created during a wait loop tasks.put(NetworkPacketTask(source=host, payload=packet)) except KeyboardInterrupt as k: log.info("Stopping") not_done = False break except Exception as e: log.error(("Failed reading socket with ex={}").format(e)) not_done = False break # end of try/ex during socket receving # end of while processing network packets log.info(("Shutting down consumers={}").format(len(consumers))) shutdown_consumers(num_workers=num_workers, tasks=tasks) # Wait for all of the tasks to finish if need_response: log.info("Waiting for tasks to finish") tasks.join() log.info("Done waiting for tasks to finish")
def build_csv( pipeline_files=[], fulldata_file=None, clean_file=None, post_proc_rules=None, label_rules=None, metadata_filename="metadata.json"): """build_csv :param pipeline_files: files to process :param fulldata_file: output all columns to this csv file :param clean_file: output all numeric-ready columns to this csv file :param post_proc_rules: rules after building the DataFrame :param label_rules: labeling rules :param metadata_filename: metadata """ save_node = { "status": INVALID, "pipeline_files": pipeline_files, "post_proc_rules": post_proc_rules, "label_rules": label_rules, "fulldata_file": fulldata_file, "fulldata_metadata_file": None, "clean_file": clean_file, "clean_metadata_file": None, "features_to_process": [], "feature_to_predict": None, "ignore_features": [], "df_json": {} } if not fulldata_file: log.error("missing fulldata_file - stopping") save_node["status"] = INVALID return save_node if not clean_file: log.error("missing clean_file - stopping") save_node["status"] = INVALID return save_node log.info("build_csv - START") common_headers, \ headers_dict = find_all_headers( pipeline_files=pipeline_files) log.info(("num common_headers={} headers={}") .format(len(common_headers), common_headers)) # since the headers can be different we rebuild a new one: hdrs = {} for h in common_headers: hdrs[h] = None features_to_process = [] feature_to_predict = None ignore_features = [] set_if_above = None labels = [] label_values = [] if label_rules: set_if_above = label_rules["set_if_above"] labels = label_rules["labels"] label_values = label_rules["label_values"] all_rows = [] num_done = 0 total_files = len(pipeline_files) for c in pipeline_files: log.info(("merging={}/{} csv={}") .format(num_done, total_files, c)) cf = pd.read_csv(c) log.info((" processing rows={}") .format(len(cf.index))) for index, row in cf.iterrows(): valid_row = True new_row = copy.deepcopy(hdrs) new_row["src_file"] = c for k in hdrs: if k in row: new_row[k] = row[k] # end of for all headers to copy in if label_rules: test_rand = random.randint(0, 100) if test_rand > set_if_above: new_row["label_value"] = label_values[1] new_row["label_name"] = labels[1] else: new_row["label_value"] = label_values[0] new_row["label_name"] = labels[0] # end of applying label rules if valid_row: all_rows.append(new_row) # end of for all rows in this file num_done += 1 # end of building all files into one list log.info(("fulldata rows={} generating df") .format(len(all_rows))) df = pd.DataFrame(all_rows) log.info(("df rows={} headers={}") .format(len(df.index), df.columns.values)) if ev("CONVERT_DF", "0") == "1": log.info("converting df to json") save_node["df_json"] = df.to_json() if clean_file: log.info(("writing fulldata_file={}") .format(fulldata_file)) df.to_csv(fulldata_file, sep=',', encoding='utf-8', index=False) log.info(("done writing fulldata_file={}") .format(fulldata_file)) if post_proc_rules: clean_metadata_file = "" feature_to_predict = "label_name" features_to_process = [] ignore_features = [] if label_rules: ignore_features = [feature_to_predict] if "drop_columns" in post_proc_rules: for p in post_proc_rules["drop_columns"]: if p in headers_dict: ignore_features.append(p) # post proce filter more features out # for non-int/float types for d in df.columns.values: add_this_one = True for i in ignore_features: if d == i: add_this_one = False break if add_this_one: features_to_process.append(d) # for all df columns we're not ignoring... 
# add them as features to process fulldata_metadata_file = "{}/fulldata_{}".format( "/".join(fulldata_file.split("/")[:-1]), metadata_filename) log.info(("writing fulldata metadata file={}") .format(fulldata_metadata_file)) header_data = {"headers": list(df.columns.values), "output_type": "fulldata", "pipeline_files": pipeline_files, "post_proc_rules": post_proc_rules, "label_rules": label_rules, "features_to_process": features_to_process, "feature_to_predict": feature_to_predict, "ignore_features": ignore_features, "created": rnow()} with open(fulldata_metadata_file, "w") as otfile: otfile.write(str(ppj(header_data))) keep_these = features_to_process keep_these.append(feature_to_predict) log.info(("creating new clean_file={} " "keep_these={} " "predict={}") .format(clean_file, keep_these, feature_to_predict)) # need to remove all columns that are all nan clean_df = df[keep_these].dropna( axis=1, how='all').dropna() cleaned_features = clean_df.columns.values cleaned_to_process = [] cleaned_ignore_features = [] for c in cleaned_features: if c == feature_to_predict: cleaned_ignore_features.append(c) else: keep_it = True for ign in ignore_features: if c == ign: cleaned_ignore_features.append(c) keep_it = False break # end of for all feaures to remove if keep_it: cleaned_to_process.append(c) # end of new feature columns log.info(("writing DROPPED clean_file={} " "features_to_process={} " "ignore_features={} " "predict={}") .format(clean_file, cleaned_to_process, cleaned_ignore_features, feature_to_predict)) write_clean_df = clean_df.drop( columns=cleaned_ignore_features ) log.info(("cleaned_df rows={}") .format(len(write_clean_df.index))) write_clean_df.to_csv( clean_file, sep=',', encoding='utf-8', index=False) clean_metadata_file = "{}/cleaned_{}".format( "/".join(clean_file.split("/")[:-1]), metadata_filename) log.info(("writing clean metadata file={}") .format(clean_metadata_file)) header_data = {"headers": list(write_clean_df.columns.values), "output_type": "clean", "pipeline_files": pipeline_files, "post_proc_rules": post_proc_rules, "label_rules": label_rules, "features_to_process": cleaned_to_process, "feature_to_predict": feature_to_predict, "ignore_features": cleaned_ignore_features, "created": rnow()} with open(clean_metadata_file, "w") as otfile: otfile.write(str(ppj(header_data))) else: for d in df.columns.values: add_this_one = True for i in ignore_features: if d == i: add_this_one = False break if add_this_one: features_to_process.append(d) # for all df columns we're not ignoring... 
# add them as features to process fulldata_metadata_file = "{}/fulldata_{}".format( "/".join(fulldata_file.split("/")[:-1]), metadata_filename) log.info(("writing fulldata metadata file={}") .format(fulldata_metadata_file)) header_data = {"headers": list(df.columns.values), "output_type": "fulldata", "pipeline_files": pipeline_files, "post_proc_rules": post_proc_rules, "label_rules": label_rules, "features_to_process": features_to_process, "feature_to_predict": feature_to_predict, "ignore_features": ignore_features, "created": rnow()} with open(fulldata_metadata_file, "w") as otfile: otfile.write(str(ppj(header_data))) keep_these = features_to_process keep_these.append(feature_to_predict) log.info(("creating new clean_file={} " "keep_these={} " "predict={}") .format(clean_file, keep_these, feature_to_predict)) # need to remove all columns that are all nan clean_df = df[keep_these].dropna( axis=1, how='all').dropna() cleaned_features = clean_df.columns.values cleaned_to_process = [] cleaned_ignore_features = [] for c in cleaned_features: if c == feature_to_predict: cleaned_ignore_features.append(c) else: keep_it = True for ign in ignore_features: if c == ign: cleaned_ignore_features.append(c) keep_it = False break # end of for all feaures to remove if keep_it: cleaned_to_process.append(c) # end of new feature columns log.info(("writing DROPPED clean_file={} " "features_to_process={} " "ignore_features={} " "predict={}") .format(clean_file, cleaned_to_process, cleaned_ignore_features, feature_to_predict)) write_clean_df = clean_df.drop( columns=cleaned_ignore_features ) log.info(("cleaned_df rows={}") .format(len(write_clean_df.index))) write_clean_df.to_csv( clean_file, sep=',', encoding='utf-8', index=False) clean_metadata_file = "{}/cleaned_{}".format( "/".join(clean_file.split("/")[:-1]), metadata_filename) log.info(("writing clean metadata file={}") .format(clean_metadata_file)) header_data = {"headers": list(write_clean_df.columns.values), "output_type": "clean", "pipeline_files": pipeline_files, "post_proc_rules": post_proc_rules, "label_rules": label_rules, "features_to_process": cleaned_to_process, "feature_to_predict": feature_to_predict, "ignore_features": cleaned_ignore_features, "created": rnow()} with open(clean_metadata_file, "w") as otfile: otfile.write(str(ppj(header_data))) # end of if/else save_node["clean_file"] = clean_file save_node["clean_metadata_file"] = clean_metadata_file log.info(("done writing clean_file={}") .format(clean_file)) # end of post_proc_rules save_node["fulldata_file"] = fulldata_file save_node["fulldata_metadata_file"] = fulldata_metadata_file save_node["status"] = VALID # end of writing the file save_node["features_to_process"] = features_to_process save_node["feature_to_predict"] = feature_to_predict save_node["ignore_features"] = ignore_features log.info("build_csv - END") return save_node
log.info(("Done with msg_id={} result={}").format(body["msg_id"], result.get())) # now that the message has been # sent to the celery ecomm worker # we can ack the message which # deletes it from the source queue # the message processor uses message.ack() # end of relay_callback # want to change where you're subscribing vs publishing? sub_ssl_options = {} sub_auth_url = ev("SUB_BROKER_URL", "pyamqp://*****:*****@localhost:5672//") pub_ssl_options = {} pub_auth_url = ev("PUB_BROKER_URL", "redis://localhost:6379/0") # start the message processor msg_proc = MessageProcessor(name=name, sub_auth_url=sub_auth_url, sub_ssl_options=sub_ssl_options, pub_auth_url=pub_auth_url, pub_ssl_options=pub_ssl_options) # configure where this is consuming: queue = ev("CONSUME_QUEUE", "user.events.conversions") # Relay Publish Hook - sending to Redis # where is it sending handled messages using a publish-hook or auto-caching:
def test_rabbitmq_consuming(self): # Integration Test the Consuming Worker with 50,0000 messages # This test just uses send_task for publishing num_to_consume = 50000 num_sent = 0 num_to_send = num_to_consume msgs_to_send = [] msgs_by_id = {} not_done_publishing = True test_values = {"test_name": "large messages"} if len(msgs_to_send) == 0: while len(msgs_to_send) != num_to_send: test_msg = self.build_user_conversion_event_msg(test_values) msgs_to_send.append(test_msg) msgs_by_id[test_msg["msg_id"]] = False # end of building messages before slower publishing calls pub_auth_url = ev("RELAY_WORKER_BROKER_URL", "pyamqp://*****:*****@localhost:5672//") path_to_config_module = "ecomm_app.ecommerce.celeryconfig_pub_sub" app = ecomm_app.ecommerce.tasks.get_celery_app( name="demo", auth_url=pub_auth_url, path_to_config_module=path_to_config_module) task_name = "ecomm_app.ecommerce.tasks.handle_user_conversion_events" source_id = {"msg_proc": ev("TEST_RELAY_NAME", "test_ecomm_relay")} result = None log.info(("Sending broker={}") .format(app.conf.broker_url)) while not_done_publishing: if (num_sent % 1000 == 0) and num_sent > 0: log.info(("Published {} for " "{}/{} messages") .format(get_percent_done(num_sent, num_to_send), num_sent, num_to_send)) # end of if print for tracing msg_body = None if num_sent < len(msgs_to_send): msg_body = msgs_to_send[num_sent] result = app.send_task(task_name, (msg_body, source_id)) num_sent += 1 if num_sent >= num_to_send: log.info(("Published {} ALL " "{}/{} messages") .format(get_percent_done(num_sent, num_to_send), num_sent, num_to_send)) not_done_publishing = False elif num_sent >= len(msgs_to_send): log.info(("Published {} all " "{}/{} messages result={}") .format(get_percent_done(num_sent, len(msgs_to_send)), num_sent, num_to_send, result)) not_done_publishing = False # if should stop # end of not_done_publishing assert(num_sent == num_to_consume) log.info("") os.system("list-queues.sh") log.info("")
def build_training_request( csv_file=ev("CSV_FILE", "/tmp/cleaned_attack_scans.csv"), meta_file=ev("CSV_META_FILE", "/tmp/cleaned_metadata.json"), predict_feature=ev("PREDICT_FEATURE", "label_value"), ignore_features=[ "label_name", "ip_src", # need to make this an int "ip_dst", # need to make this an int "eth_src", # need to make this an int "eth_dst" # need to make this an int ], seed=None, test_size=float(ev("TEST_SIZE", "0.20")), preproc_rules=None): """build_training_request :param csv_file: csv file built with prepare_dataset.py :param meta_file: metadata file built with prepare_dataset.py :param predict_feature: feature (column) to predict :param ignore_features: features to remove from the csv before the split of test + train data :param seed: integer to seed :param test_size: percent of records to split into test vs train :param preproc_rules: future preprocessing rules hooks """ last_step = "not started" res = { "status": INVALID, "err": "", "csv_file": csv_file, "meta_file": meta_file, "meta_data": None, "seed": None, "test_size": test_size, "predict_feature": predict_feature, "features_to_process": [], "ignore_features": ignore_features, "X_train": None, "X_test": None, "Y_train": None, "Y_test": None } try: last_step = ("building seed={}").format(seed) log.debug(last_step) use_seed = seed if not use_seed: use_seed = 9 res["seed"] = np.random.seed(use_seed) last_step = ("Loading csv={}").format(csv_file) log.info(last_step) if not os.path.exists(csv_file): res["status"] = ERROR res["err"] = ("Unable to find csv_file={}").format(csv_file) log.error(res["err"]) return res # end of checking for a valid csv file on disk if not os.path.exists(meta_file): res["status"] = ERROR res["err"] = ("Unable to find meta_file={}").format(meta_file) log.error(res["err"]) return res # end of checking for a valid metadata file on disk # load csv file into pandas dataframe df = pd.read_csv(csv_file) features_to_process = [] meta_data = {} try: last_step = ("opening metadata={}").format(meta_file) log.debug(last_step) meta_data = json.loads(open(meta_file, "r").read()) res["meta_data"] = meta_data if "post_proc_rules" in meta_data: if "drop_columns" in meta_data["post_proc_rules"]: log.debug(("Found drop_columns={}").format( meta_data["post_proc_rules"]["drop_columns"])) for ign in meta_data["post_proc_rules"]["drop_columns"]: ignore_features.append(ign) except Exception as e: res["error"] = ("Failed building ignore_features: " "ignore_features={} meta={} meta_data={} " "last_step='{}' ex='{}'").format( ignore_features, meta_file, meta_data, last_step, e) log.error(res["error"]) res["status"] = ERROR return res # end of trying to lookup the meta data file # for non-int/float features to ignore last_step = ("metadata={} df has " "columns={} ignore={}").format(meta_file, df.columns.values, ignore_features) log.info(last_step) for feature in df.columns.values: keep_it = True for ign in ignore_features: if feature == ign: keep_it = False if keep_it: if feature != predict_feature: features_to_process.append(feature) # end of for all features to process last_step = ("Done post-procecessing " "Predicting={} with features={} " "ignore_features={} records={}").format( predict_feature, features_to_process, ignore_features, len(df.index)) log.info(last_step) res["predict_feature"] = predict_feature res["ignore_features"] = [] for k in ignore_features: if k not in res["ignore_features"]: res["ignore_features"].append(k) res["features_to_process"] = [] for k in features_to_process: if k not in 
res["features_to_process"]: if k != predict_feature: res["features_to_process"].append(k) # split the data into training (res["X_train"], res["X_test"], res["Y_train"], res["Y_test"]) = train_test_split(df[features_to_process], df[predict_feature], test_size=test_size, random_state=res["seed"]) last_step = ("Done splitting rows={} into " "X_train={} X_test={} " "Y_train={} Y_test={}").format(len(df.index), len(res["X_train"]), len(res["X_test"]), len(res["Y_train"]), len(res["Y_test"])) log.info(("Success: {}").format(last_step)) res["err"] = "" res["status"] = VALID except Exception as e: res["status"] = ERROR res["err"] = ("Failed build_training_request " "step='{}' with ex='{}'").format(last_step, e) log.error(("build_training_request: {}").format(res["err"])) # end of try/ex return res
class BaseTestCase(unittest.TestCase): debug = False exchange_name = ev("TEST_EXCHANGE", "test.events") queue_name = ev("TEST_QUEUE", "test.events.conversions") routing_key = ev("TEST_ROUTING_KEY", "test.events.conversions") exchange = None queue = None rabbitmq_auth_url = ev("TEST_RABBITMQ_BROKER_URL", "pyamqp://*****:*****@localhost:5672//") redis_auth_url = ev("TEST_REDIS_BROKER_URL", "redis://localhost:6379/0") pub_auth_url = rabbitmq_auth_url sub_auth_url = rabbitmq_auth_url pub_ssl_options = {} sub_ssl_options = {} pub_attrs = {} sub_attrs = {} pub_serializer = "json" sub_serializer = "application/json" test_body = {} test_id = str(uuid.uuid4()).replace("-", "") test_body = {"account_id": 123, "subscription_id": 456, "stripe_id": 789, "product_id": "ABC"} pub_msgs = [] sub_msgs = [] last_pub_msg = None last_sub_msg = None last_sub_callback = None def setUp(self): if self.debug: print("setUp") # state trips in the custom classes os.environ["TEST_STOP_DONE"] = "1" self.last_pub_msg = None self.last_sub_msg = None self.pub = None self.sub = None self.pub_msgs = [] self.sub_msgs = [] self.exchange_name = ev("TEST_EXCHANGE", "test.events") self.routing_key = ev("TEST_ROUTING_KEY", "test.events.conversions") self.queue_name = ev("TEST_QUEUE", "test.events.conversions") self.exchange = None self.queue = None self.last_sub_callback = None # end of setUp def tearDown(self): if self.debug: print("tearDown") self.pub = None self.sub = None self.exchange = None self.queue = None self.last_sub_callback = None # end of tearDown def handle_message(self, body, msg): log.info(("test={} BASETEST handle_message got " "body={} msg={}") .format(self.test_id, body, msg)) if msg: msg.ack() # end of handle_message def connect_pub(self, auth_url=None, ssl_options={}, attrs={}): use_auth_url = self.pub_auth_url use_ssl_options = self.pub_ssl_options use_pub_attrs = self.pub_attrs if auth_url: use_auth_url = auth_url if len(ssl_options) > 0: use_ssl_options = ssl_options if len(ssl_options) > 0: use_pub_attrs = use_pub_attrs self.pub = Publisher("test-pub", use_auth_url, use_ssl_options) # end of connect_pub def connect_sub(self, auth_url=None, ssl_options={}, attrs={}): use_auth_url = self.sub_auth_url use_ssl_options = self.sub_ssl_options use_sub_attrs = self.sub_attrs if auth_url: use_auth_url = auth_url if len(ssl_options) > 0: use_ssl_options = ssl_options if len(ssl_options) > 0: use_sub_attrs = use_sub_attrs self.sub = KombuSubscriber("test-sub", use_auth_url, use_ssl_options) # end of connect_sub def build_msg(self, test_values={}): body = {"test_id": self.test_id, "date": datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"), "msg_id": str(uuid.uuid4()).replace("-", ""), "test_values": test_values} return body # end of build_msg def consume(self, callback=None, queue=queue, exchange=exchange, routing_key=routing_key, serializer="application/json", heartbeat=60, time_to_wait=5.0, forever=False, silent=True): if not callback: log.error(("Subscriber - Requires a callback handler for message" "processing with signature definition: " "def handle_message(self, body, message):") .format(self.sub_auth_url, self.sub_ssl_options)) assert(callback) # if not connected, just connect with defaults if not self.sub: self.connect_sub() if not self.sub: log.error(("Subscriber - Failed to connect " "to broker={} ssl={}") .format(self.sub_auth_url, self.sub_ssl_options)) assert(self.sub) if self.sub: self.sub.consume(callback=callback, queue=queue, exchange=exchange, routing_key=routing_key, serializer=serializer, 
heartbeat=heartbeat, forever=forever, time_to_wait=time_to_wait, silent=silent) else: log.info("Sub is None already - client should not call consume") # end of consume def publish(self, body=None, exchange=exchange, routing_key=routing_key, queue=queue, priority=0, ttl=None, serializer="json", retry=True, silent=True): # if no body for the message if not body: log.error(("Publisher - requires argument: " "body=some_dictionary to test")) assert(body) # if not connected, just connect with defaults if not self.pub: self.connect_pub() if not self.pub: log.error(("Publisher - Failed to connect " "to broker={} ssl={}") .format(self.pub_auth_url, self.pub_ssl_options)) assert(self.pub) if self.pub: self.pub.publish(body=body, exchange=exchange, routing_key=routing_key, queue=queue, serializer=serializer, priority=priority, ttl=ttl, retry=retry, silent=silent) else: log.info("Pub is None already - client should not call publish")