def main():
    """Estimate Hawkes-process parameters for each partial cascade read from Kafka.

    Consumes cascade messages from ``config["consumer_topic"]`` on a single
    partition, fits ``(p, beta)`` by maximum likelihood, and publishes the
    estimates (plus the ``G1``/``n_star`` terms used by the downstream
    random-forest corrector) to ``config["producer_topic"]`` on the same
    partition.  Blocks on the consumer forever; returns nothing.
    """
    args = init_parser()
    config = init_config(args)
    logger = get_logger(f'hawkes-{config["partition"]}',
                        broker_list=config["bootstrap_servers"],
                        debug=True)

    consumer = KafkaConsumer(bootstrap_servers=config["bootstrap_servers"])
    consumer.assign(
        [TopicPartition(config["consumer_topic"], config["partition"])])

    producer = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode("utf-8"),
        key_serializer=lambda v: json.dumps(v).encode("utf-8"))

    alpha = config["alpha"]
    mu = config["mu"]

    for message in consumer:
        # Payload arrives as Python-repr-like text; normalize quotes and
        # tuple parens so it parses as JSON.  json.loads replaces the
        # original eval(), which executed arbitrary code from untrusted
        # broker input (and matches how the predictor service parses the
        # same format).
        mess = message.value.decode().replace("'", '"') \
            .replace('(', '[').replace(')', ']')
        mess = json.loads(mess)

        cascade = np.array(mess["tweets"])
        tweet_id = mess["cid"]
        text = mess["msg"]
        T_obs = mess["T_obs"]

        # Timestamp of the last observed retweet: end of the observation window.
        t = cascade[-1, 0]

        # Maximum-likelihood estimate of the Hawkes parameters (p, beta).
        # (The old fixed-parameter loglikelihood() probe was dead code and
        # has been removed.)
        LL_MLE, MLE = compute_MLE(cascade, t, alpha, mu)
        p_est, beta_est = MLE

        # Expected final size N plus the (G1, n_star) terms the learner needs.
        N, G1, n_star = prediction([p_est, beta_est], cascade, alpha, mu, t)

        messfinal = {
            "type": "parameters",
            "cid": tweet_id,
            "msg": text,
            "n_obs": len(cascade),
            "n_supp": N,
            "params": list(MLE),
            "G1": G1,
            "n_star": n_star,
        }
        producer.send(config["producer_topic"],
                      key=T_obs,
                      value=messfinal,
                      partition=config["partition"])
        logger.info(
            "Predicted params p = {: .3f} and beta = {: .3f} for tweet {} at time {} on partition: {}"
            .format(p_est, beta_est, tweet_id, T_obs, config["partition"]))
def main():
    """Accumulate (X, W) training samples per observation window and retrain.

    Consumes sample messages keyed by observation window, keeps one
    RandomForestRegressor plus its growing training set per window, refits
    every ``update_size`` new samples, and publishes the pickled model to the
    ``models`` topic.  Blocks on the consumer forever; returns nothing.
    """
    import json  # local import: this chunk's file header is not visible here

    args = init_parser()
    config = init_config(args)

    consumer = KafkaConsumer(config["consumer_topic"],
                             bootstrap_servers=config["bootstrap_servers"])
    producer = KafkaProducer(bootstrap_servers=config["bootstrap_servers"])

    # One regressor and one training set per observation window (message key).
    regressors = defaultdict(RandomForestRegressor)
    train_X = defaultdict(list)
    train_y = defaultdict(list)

    # Set the frequency of trainings of each random forest
    update_size = config["update_size"]
    logger = get_logger('learner',
                        broker_list=config["bootstrap_servers"],
                        debug=True)

    for message in consumer:
        t = message.key
        # Normalize Python-repr-like text to JSON.  json.loads replaces the
        # original eval(), which executed arbitrary code from untrusted
        # broker input.
        value = message.value.decode().replace("'", '"') \
            .replace('(', '[').replace(')', ']')
        value = json.loads(value)

        inputs = value['X']  # (beta, n_star, G1)
        W = value['W']
        train_X[t].append(inputs)
        train_y[t].append(W)

        # Refit once every update_size samples for this window.
        if not len(train_X[t]) % update_size:
            regressors[t].fit(train_X[t], train_y[t])
            # Models are not JSON-serializable; ship them pickled.
            regressor_message = pickle.dumps({
                "type": "model",
                "regressor": regressors[t],
            })
            producer.send('models',
                          key=t,
                          value=regressor_message,
                          partition=message.partition)
            logger.info("Model {}s updated and sent".format(t))
""" # test_misc_trie.py """ import logging import os import unittest from ml.misc.trie import Trie from ml.utils.logger import get_logger LOGGER = get_logger(__name__) class TrieTests(unittest.TestCase): """ TrieTests includes all unit tests for ml.misc.trie module """ @classmethod def load_dict(cls, dict_path): results = [] if os.path.isfile(dict_path): with open(dict_path, 'rt') as fh: for word in fh: results.append(word.strip()) return results @classmethod def setUpClass(cls): """setup for all tests""" cls.test_path = os.path.dirname(os.path.realpath(__file__))
""" # ml.common.message_tasker.py @author: Jason Zhu @email: [email protected] @created: 2019-01-30 """ import logging from ml.config import get_uint from ml.utils.logger import get_logger DEBUG_LEVEL = get_uint('debug.level', logging.INFO) LOGGER = get_logger(__name__, level=DEBUG_LEVEL) LEVEL_LIMIT = get_uint('tasks.max_nested_level', 3) class MessageTasker(): """ MessageTasker processes a task list. """ def __init__(self, tasks, message={}, max_level=LEVEL_LIMIT): """ Constructor of ml.common.MessageTasker @param tasks: a list of tasks with recursive multi-layer sub-tasks that describes a tasks workflow. @param message: pre-processed message (dict). @param max_level: the maximum nested sub-tasks level.
""" test_common_svc_checker.py """ import json import os import pytest import unittest from mock import patch from ml.common.svc_checker import ServiceChecker from ml.utils.logger import get_logger LOGGER = get_logger('ml.' + __name__) class ServiceCheckerTester(unittest.TestCase): @classmethod def teardown_class(cls): pass def setUp(self): """setup for test""" self.test_path = os.path.dirname(os.path.realpath(__file__)) self.repo_path = os.path.dirname(self.test_path) self.proj_path = os.path.join(self.repo_path, 'ml') self.data_path = os.path.join(self.test_path, 'data') self.data_file = os.path.join(self.data_path, 'test_endpoints.json') self.data_file_failure = os.path.join( self.data_path, 'test_endpoints_failure_500.json') self.data_file_success = os.path.join(self.data_path,
def main():
    """Predict final cascade sizes; emit alerts, accuracy stats, and samples.

    Consumes three message types from the partition mapped to this service's
    observation window:

    * ``model``       — a pickled, freshly-trained RandomForestRegressor;
    * ``size``        — the true final size of a finished cascade;
    * ``parameters``  — MLE output from the Hawkes estimator.

    For each ``parameters`` message it predicts the total size (random forest
    when one is fitted, analytic Hawkes formula otherwise), publishes an
    alert, and — once the true size is also known — publishes an ARE stat and
    a training sample for the learner.  Blocks on the consumer forever.
    """
    args = init_parser()
    config = init_config(args)
    partition = config["obs_map"][config["obs_window"]]

    consumer = KafkaConsumer(
        bootstrap_servers=config["bootstrap_servers"],
        key_deserializer=lambda v: v.decode(),
    )
    consumer.assign([
        TopicPartition(topic, partition) for topic in config["consumer_topic"]
    ])

    producer_samples = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        key_serializer=str.encode)
    producer_alerts = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode('utf-8'))

    alert_limit = config["alert_limit"]

    regressor = RandomForestRegressor()
    sizes = defaultdict(dict)   # cid -> {"real": n_tot, "prediction": n_tot}
    forest_inputs = {}          # cid -> [beta, n_star, G1, n_obs]
    logger = get_logger(f'predictor-{partition}',
                        broker_list=config["bootstrap_servers"],
                        debug=True)

    for message in consumer:
        # 'size'/'parameters' messages are JSON-ish text; 'model' messages
        # are pickled.  Was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        try:
            mess = json.loads(message.value.decode().replace("'", '"'))
        except (UnicodeDecodeError, ValueError):
            # NOTE(security): pickle.loads on broker data executes arbitrary
            # code if the 'models' topic is writable by untrusted producers.
            mess = pickle.loads(message.value)

        ################### MODEL
        if mess['type'] == 'model':
            regressor = mess['regressor']
            logger.info("Updated model received")

        ################### SIZE
        t = message.key
        if mess['type'] == 'size':
            # When we receive the final size of a cascade, we store it
            tweet_id = mess['cid']
            sizes[tweet_id]["real"] = mess['n_tot']

        ################### PARAMETERS
        if mess['type'] == "parameters":
            G1 = mess['G1']
            n_star = mess['n_star']
            tweet_id = mess['cid']
            p, beta = mess['params']
            msg = mess['msg']
            n_obs = mess['n_obs']

            try:
                sklearn.utils.validation.check_is_fitted(regressor)
                # predict() needs a 2-D (n_samples, n_features) array; the
                # old 1-D tuple always raised, so the fitted-model path
                # silently never ran.  float() keeps n_tot JSON-serializable.
                n_tot = float(regressor.predict([[beta, n_star, G1]])[0])
            except Exception:
                # Not fitted yet (or predict failed): fall back to the
                # analytic Hawkes estimate.
                n_tot = n_obs + G1 / (1 - n_star)

            sizes[tweet_id]["prediction"] = n_tot
            forest_inputs[tweet_id] = [beta, n_star, G1, n_obs]

            alert_message = {
                'type': 'alert',
                'cid': tweet_id,
                'msg': msg,
                'T_obs': t,
                'n_tot': n_tot,
            }
            producer_alerts.send('alerts', key=None, value=alert_message)
            producer_alerts.flush()
            logger.info("Alert produced for tweet {} at time {}".format(
                tweet_id, t))

            if n_tot > alert_limit:
                logger.warning(
                    "Tweet {} may create an important cascade with {} retweets predicted"
                    .format(tweet_id, n_tot))

            # Both the real and the predicted size are known: emit accuracy
            # stats and a supervised sample for the learner.
            if len(sizes[tweet_id].keys()) == 2:
                true_size = sizes[tweet_id]["real"]
                pred_size = sizes[tweet_id]["prediction"]
                are = abs(pred_size - true_size) / true_size  # absolute relative error
                stat_message = {
                    'type': 'stat',
                    'cid': tweet_id,
                    'T_obs': t,
                    'ARE': are,
                }
                producer_alerts.send('stats', key=None, value=stat_message)
                producer_alerts.flush()

                beta, n_star, G1, n_obs = forest_inputs[tweet_id]
                # Target W: the correction factor the forest should learn.
                W = (true_size - n_obs) * (1 - n_star) / G1
                sample_message = {
                    'type': 'sample',
                    'cid': tweet_id,
                    'X': (beta, n_star, G1),
                    'W': W,
                }
                producer_samples.send('samples',
                                      key=args.obs_window,
                                      value=sample_message)
                producer_samples.flush()
                logger.info(
                    "Stats and sample produced for tweet {} at time {}".format(
                        tweet_id, t))
""" run_gevent.py """ from gevent import monkey # Dev Note: # Need `monkey.patch_all()` so the wsgi server can handle concurrent requests. # Have to patch at the top of the file here because it overrides certain python # modules with gevent modules (e.g. threading); otherwise, it may not work. # Not an ideal solution; probably choose a different WSGI server. monkey.patch_all() from gevent.pywsgi import WSGIServer # noqa: E402 from ml.app import app # noqa: E402 from ml.utils.logger import get_logger # noqa: E402 LOGGER = get_logger(__name__, level='INFO') LOGGER.info("starting wsgi server") # configure WSGI server http_server = WSGIServer(('localhost', 8081), app) http_server.serve_forever()