Example #1
import json

import numpy as np
from kafka import KafkaConsumer, KafkaProducer, TopicPartition

# Local helpers assumed importable from the surrounding project:
# init_parser, init_config, get_logger, loglikelihood, compute_MLE, prediction

def main():
    """
    Main predictor function
    """

    args = init_parser()
    config = init_config(args)
    logger = get_logger(f'hawkes-{config["partition"]}',
                        broker_list=config["bootstrap_servers"],
                        debug=True)
    consumer = KafkaConsumer(bootstrap_servers=config["bootstrap_servers"])
    consumer.assign(
        [TopicPartition(config["consumer_topic"], config["partition"])])
    producer = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode("utf-8"),
        key_serializer=lambda v: json.dumps(v).encode("utf-8"))

    alpha = config["alpha"]
    mu = config["mu"]

    for message in consumer:
        # Normalise the Python-repr payload into JSON-compatible text and
        # parse it safely instead of calling eval() on network input.
        mess = message.value.decode().replace("'", '"')
        mess = mess.replace('(', '[').replace(')', ']')
        mess = json.loads(mess)

        cascade = np.array(mess["tweets"])
        tweet_id = mess["cid"]
        text = mess["msg"]
        T_obs = mess["T_obs"]
        # Initial guess for the Hawkes parameters: p and beta
        # (beta = 1/3600 per second, i.e. a one-hour decay timescale).
        p, beta = 0.02, 1 / 3600
        t = cascade[-1, 0]  # time of the last observed retweet
        LL = loglikelihood((p, beta), cascade, t)  # baseline log-likelihood
        LL_MLE, MLE = compute_MLE(cascade, t, alpha, mu)
        p_est, beta_est = MLE
        N, G1, n_star = prediction([p_est, beta_est], cascade, alpha, mu, t)

        messfinal = {
            "type": "parameters",
            "cid": tweet_id,
            "msg": text,
            "n_obs": len(cascade),
            "n_supp": N,
            "params": list(MLE),
            "G1": G1,
            "n_star": n_star
        }

        producer.send(config["producer_topic"],
                      key=T_obs,
                      value=messfinal,
                      partition=config["partition"])

        logger.info(
            "Predicted params p = {: .3f} and beta = {: .3f} for tweet {} at time {} on partition: {}"
            .format(p_est, beta_est, tweet_id, T_obs, config["partition"]))
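For context, here is a minimal sketch of a producer that feeds this consumer, inferred from the fields the loop reads ("tweets", "cid", "msg", "T_obs"); the broker address, topic name, and sample values are assumptions, not taken from the codebase:

import json

from kafka import KafkaProducer

# Hypothetical broker and topic; the real names come from init_config().
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"))

producer.send("cascade_series", {
    "cid": "tw-123",              # cascade id
    "msg": "example tweet text",  # tweet text
    "T_obs": 600,                 # observation window (seconds)
    # (timestamp, magnitude) pairs, matching np.array(mess["tweets"]) above
    "tweets": [[0.0, 100.0], [42.0, 15.0], [300.0, 7.0]],
})
producer.flush()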
Example #2
import pickle

from collections import defaultdict

from kafka import KafkaConsumer, KafkaProducer
from sklearn.ensemble import RandomForestRegressor

# Local helpers assumed importable from the surrounding project:
# init_parser, init_config, get_logger

def main():
    """
    Main predictor function
    """

    args = init_parser()
    config = init_config(args)
    consumer = KafkaConsumer(config["consumer_topic"],
                             bootstrap_servers=config["bootstrap_servers"])
    producer = KafkaProducer(bootstrap_servers=config["bootstrap_servers"])

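    # One regressor, with its own growing training set, per observation-window
    # key arriving on the consumer topic.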
    regressors = defaultdict(RandomForestRegressor)
    train_X = defaultdict(list)
    train_y = defaultdict(list)

    # Set how often each random forest is retrained
    update_size = config["update_size"]

    logger = get_logger('learner',
                        broker_list=config["bootstrap_servers"],
                        debug=True)

    for message in consumer:

        t = message.key  # observation-window key (raw bytes; no key_deserializer set)

        # Normalise the Python-repr payload into JSON-compatible text and
        # parse it safely instead of calling eval() on network input.
        value = message.value.decode().replace("'", '"')
        value = value.replace('(', '[').replace(')', ']')
        value = json.loads(value)
        inputs = value['X']  # (beta, n_star, G1)
        W = value['W']

        train_X[t].append(inputs)
        train_y[t].append(W)

        # Retrain once every `update_size` new samples, on all data collected
        # so far for this observation window.
        if len(train_X[t]) % update_size == 0:

            regressors[t].fit(train_X[t], train_y[t])

            regressor_message = pickle.dumps({
                "type": "model",
                "regressor": regressors[t]
            })

            producer.send('models',
                          key=t,
                          value=regressor_message,
                          partition=message.partition)

            logger.info("Model {}s updated and sent".format(t))
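A downstream consumer restores the pickled model the same way Example #6 does. A minimal sketch, with the broker address assumed (the 'models' topic name matches the send above):

import pickle

from kafka import KafkaConsumer

consumer = KafkaConsumer('models', bootstrap_servers='localhost:9092')
for message in consumer:
    payload = pickle.loads(message.value)
    if payload['type'] == 'model':
        regressor = payload['regressor']  # a fitted RandomForestRegressor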
Example #3
"""
# test_misc_trie.py

"""
import logging
import os
import unittest

from ml.misc.trie import Trie
from ml.utils.logger import get_logger

LOGGER = get_logger(__name__)


class TrieTests(unittest.TestCase):
    """
    TrieTests includes all unit tests for ml.misc.trie module
    """
    @classmethod
    def load_dict(cls, dict_path):
        """Load a word list (one word per line), stripping whitespace."""
        results = []
        if os.path.isfile(dict_path):
            with open(dict_path, 'rt') as fh:
                for word in fh:
                    results.append(word.strip())
        return results

    @classmethod
    def setUpClass(cls):
        """setup for all tests"""
        cls.test_path = os.path.dirname(os.path.realpath(__file__))
Example #4
"""
# ml.common.message_tasker.py

@author: Jason Zhu
@email: [email protected]
@created: 2019-01-30

"""
import logging

from ml.config import get_uint
from ml.utils.logger import get_logger

DEBUG_LEVEL = get_uint('debug.level', logging.INFO)
LOGGER = get_logger(__name__, level=DEBUG_LEVEL)

LEVEL_LIMIT = get_uint('tasks.max_nested_level', 3)


class MessageTasker():
    """
    MessageTasker processes a task list.
    """
    def __init__(self, tasks, message={}, max_level=LEVEL_LIMIT):
        """
        Constructor of ml.common.MessageTasker

        @param tasks: a list of tasks, with recursively nested sub-tasks,
                      that describes a task workflow.
        @param message: pre-processed message (dict).
        @param max_level: the maximum nested sub-tasks level.
        """
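Based only on the constructor signature and docstring above, a hypothetical instantiation might look like the following; the task-list schema is an assumption, not taken from the codebase:

from ml.common.message_tasker import MessageTasker

# Hypothetical task list with one nested sub-task level.
tasks = [
    {"name": "parse", "sub_tasks": [{"name": "validate"}]},
    {"name": "publish"},
]
tasker = MessageTasker(tasks, message={"id": "msg-001"}, max_level=2)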
Example #5
"""
test_common_svc_checker.py
"""
import json
import os
import pytest
import unittest

from mock import patch

from ml.common.svc_checker import ServiceChecker
from ml.utils.logger import get_logger

LOGGER = get_logger('ml.' + __name__)


class ServiceCheckerTester(unittest.TestCase):
    @classmethod
    def teardown_class(cls):
        pass

    def setUp(self):
        """setup for test"""
        self.test_path = os.path.dirname(os.path.realpath(__file__))
        self.repo_path = os.path.dirname(self.test_path)
        self.proj_path = os.path.join(self.repo_path, 'ml')
        self.data_path = os.path.join(self.test_path, 'data')
        self.data_file = os.path.join(self.data_path, 'test_endpoints.json')
        self.data_file_failure = os.path.join(
            self.data_path, 'test_endpoints_failure_500.json')
        self.data_file_success = os.path.join(self.data_path,
Example #6
import json
import pickle

from collections import defaultdict

from kafka import KafkaConsumer, KafkaProducer, TopicPartition
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# Local helpers assumed importable from the surrounding project:
# init_parser, init_config, get_logger

def main():
    """
    Main predictor function
    """

    args = init_parser()
    config = init_config(args)

    partition = config["obs_map"][config["obs_window"]]
    consumer = KafkaConsumer(
        bootstrap_servers=config["bootstrap_servers"],
        key_deserializer=lambda v: v.decode(),
    )
    consumer.assign([
        TopicPartition(topic, partition) for topic in config["consumer_topic"]
    ])

    producer_samples = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        key_serializer=str.encode)

    producer_alerts = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode('utf-8'))

    alpha = config["alpha"]
    mu = config["mu"]
    alert_limit = config["alert_limit"]

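    # Unfitted placeholder; replaced by a trained model once a pickled
    # 'model' message arrives (see the MODEL branch below).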
    regressor = RandomForestRegressor()

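    # sizes[tweet_id] accumulates {"real": ..., "prediction": ...};
    # forest_inputs keeps the features needed to emit a training sample
    # once both values are known.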
    sizes = defaultdict(dict)
    forest_inputs = {}
    logger = get_logger(f'predictor-{partition}',
                        broker_list=config["bootstrap_servers"],
                        debug=True)

    for message in consumer:
        try:
            mess = message.value.decode().replace("'", '"')
            mess = json.loads(mess)
        except (UnicodeDecodeError, json.JSONDecodeError):
            # Not JSON: model updates arrive as pickled payloads.
            mess = pickle.loads(message.value)

        ###################   MODEL
        if mess['type'] == 'model':
            regressor = mess['regressor']
            logger.info("Updated model received")
            continue  # model messages carry no cascade fields

        t = message.key  # observation window (string, via key_deserializer)

        ###################   SIZE
        if mess['type'] == 'size':
            # When we receive the final size of a cascade, we store it
            tweet_id = mess['cid']
            sizes[tweet_id]["real"] = mess['n_tot']

        ###################   PARAMETERS
        if mess['type'] == "parameters":

            G1 = mess['G1']
            n_star = mess['n_star']
            tweet_id = mess['cid']
            p, beta = mess['params']
            msg = mess['msg']
            n_obs = mess['n_obs']

            try:
                check_is_fitted(regressor)  # raises NotFittedError if untrained
                # predict() expects a 2-D array of samples and returns an array.
                n_tot = regressor.predict([[beta, n_star, G1]])[0]
            except NotFittedError:
                # No trained model yet: fall back to the analytic estimate.
                n_tot = n_obs + G1 / (1 - n_star)

            sizes[tweet_id]["prediction"] = n_tot

            forest_inputs[tweet_id] = [beta, n_star, G1, n_obs]

            alert_message = {
                'type': 'alert',
                'cid': tweet_id,
                'msg': msg,
                'T_obs': t,
                'n_tot': n_tot,
            }

            producer_alerts.send('alerts', key=None, value=alert_message)
            producer_alerts.flush()
            logger.info("Alert produced for tweet {} at time {}".format(
                tweet_id, t))

            if n_tot > alert_limit:
                logger.warning(
                    "Tweet {} may create an important cascade with {} retweets predicted"
                    .format(tweet_id, n_tot))

        # Once both the real and the predicted size are known, emit stats
        # and a training sample.
        if len(sizes[tweet_id]) == 2:
            true_size = sizes[tweet_id]["real"]
            pred_size = sizes[tweet_id]["prediction"]
            are = abs(pred_size - true_size) / true_size

            stat_message = {
                'type': 'stat',
                'cid': tweet_id,
                'T_obs': t,
                'ARE': are
            }

            producer_alerts.send('stats', key=None, value=stat_message)
            producer_alerts.flush()
            beta, n_star, G1, n_obs = forest_inputs[tweet_id]

            # W is the correction factor that makes the analytic estimate
            # exact: n_obs + W * G1 / (1 - n_star) == true_size.
            W = (true_size - n_obs) * (1 - n_star) / G1

            sample_message = {
                'type': 'sample',
                'cid': tweet_id,
                'X': (beta, n_star, G1),
                'W': W
            }

            producer_samples.send('samples',
                                  key=args.obs_window,
                                  value=sample_message)
            producer_samples.flush()
            logger.info(
                "Stats and sample produced for tweet {} at time {}".format(
                    tweet_id, t))
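To see why W is a sensible regression target, substitute it back into the analytic fallback: n_obs + W * G1 / (1 - n_star) reduces to true_size exactly. A quick check with made-up numbers:

# Made-up values for illustration only.
n_obs, G1, n_star, true_size = 50, 120.0, 0.4, 350
W = (true_size - n_obs) * (1 - n_star) / G1  # same formula as above
assert abs(n_obs + W * G1 / (1 - n_star) - true_size) < 1e-9  # recovers 350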
Example #7
"""
run_gevent.py
"""
from gevent import monkey

# Dev Note:
# `monkey.patch_all()` is required so the WSGI server can handle concurrent
# requests. It must run at the top of the file because it replaces certain
# Python modules (e.g. threading) with gevent equivalents; patching any later
# may not take effect. Not ideal; consider a different WSGI server instead.
monkey.patch_all()

from gevent.pywsgi import WSGIServer  # noqa: E402
from ml.app import app  # noqa: E402
from ml.utils.logger import get_logger  # noqa: E402

LOGGER = get_logger(__name__, level='INFO')
LOGGER.info("starting wsgi server")

# configure WSGI server
http_server = WSGIServer(('localhost', 8081), app)
http_server.serve_forever()
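If, as the Dev Note suggests, a different WSGI server is preferred, one option (an assumption, not part of this codebase) is waitress, which handles concurrent requests with a thread pool and needs no monkey patching. A minimal sketch:

# run_waitress.py -- hypothetical alternative entry point
from waitress import serve

from ml.app import app
from ml.utils.logger import get_logger

LOGGER = get_logger(__name__, level='INFO')
LOGGER.info("starting wsgi server")

# Same bind address as the gevent version above.
serve(app, host='localhost', port=8081)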