Example 1
import unittest
import sys

sys.path.append('.')  # in case file is run from root dir
from cranial.re_iter import *
from cranial.common import logger

log = logger.get(name='test_re_iter')


def dummy_fn(x):
    return 2 * x


class TestReIter(unittest.TestCase):
    def test_ReGenerator(self):
        gen_fn = lambda: range(5)
        out = ReGenerator(gen_fn)
        actual = [_ for _ in out] + [_ for _ in out]
        expected = [_ for _ in range(5)] + [_ for _ in range(5)]
        self.assertListEqual(actual, expected,
                             'should repeat 0->4 sequence twice')

    def test_ReFilter(self):
        inpt = [0, 1, 2, 3, 4]
        out = ReFilter(iterable_input=inpt, fn=lambda x: x % 2)
        actual = [_ for _ in out] + [_ for _ in out]
        expected = [1, 3, 1, 3]
        self.assertListEqual(actual, expected,
                             'should leave only odd numbers, twice')
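Example 2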
from abc import abstractmethod, ABCMeta
import time
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from cranial.common import logger
from cranial.model_base import ModelBase, StatefulModel

log = logger.get(name='online_learning')


class TrainerBase(metaclass=ABCMeta):
    """
    Object responsible for defining when and how to update a model

        - `is_ready` is a method that is called every time a transform method of OnlineLearningWrapper is
            called. If it returns True, OnlineLearningWrapper will try to get training data from its accumulator
            and use it to update the model, or, in the case of remote updates, will try to load a saved state from a connector.

        - `update` is a method that defines how the update is performed: e.g. call model.update with the accumulated
            data, or start loading a remotely stored state
    """

    @abstractmethod
    def update(self, model, data):
        """
        Should take a model and training data as arguments and return an updated model. The updating logic can be
        anything: it may use the data, ignore it, or even re-instantiate the model from scratch; that is up to the
        developer and their needs.

        This method should also return True/False indicating whether the update was completed; this allows
        OnlineLearningWrapper to call update again, even outside of the schedule, to check whether it has completed
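
# A hypothetical, minimal sketch of the contract described above (the class
# name and the counting logic are illustrative, not part of the library):
class EveryNTrainer(TrainerBase):
    def __init__(self, every_n=100):
        self.calls = 0
        self.every_n = every_n

    def is_ready(self):
        # Called around every transform; signal an update attempt every N calls.
        self.calls += 1
        return self.calls % self.every_n == 0

    def update(self, model, data):
        # Delegate to the model's own update logic; True reports that the
        # update completed (see the docstring above).
        model.update(data)
        return True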
Example 3
"""
This file contains simple models that wrap around common gensim models such as LSI, TFIDF, etc.
"""
import gensim as g
import os
from cranial.re_iter import ReMap, DiskCache
from cranial.model_base import StatefulModel, ModelBase
from cranial.common import logger

log = logger.get(name='gensim_models', var='MODELS_LOGLEVEL')  # streaming log


class GensimDictionary(StatefulModel):
    name = 'gensim_dictionary'

    def __init__(self, dict_params: dict, **kwargs):
        """
        Wraps around gensim's Dictionary

        Parameters
        ----------
        dict_params
            kwargs to pass to gensim's Dictionary initialization

        kwargs
            any other kwargs to be passed to parent class __init__
        """
        super(GensimDictionary, self).__init__(**kwargs)
        self.params = dict_params
        self.state.model = None
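
# Hypothetical usage sketch; the parameter value is illustrative (prune_at is a
# standard keyword argument of gensim's Dictionary). The model still needs to
# be trained or loaded before use:
gensim_dict = GensimDictionary(dict_params={'prune_at': 2000000})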
Example 4
import requests
from time import time
from typing import Any, Callable, Dict, List, Optional  # noqa

from cranial.common import logger
from cranial.servicediscovery import base

log = logger.get()

MARATHON_URL = 'http://marathon.mesos:8080/v2/apps'


def get_services_with_predicate(predicate: Callable) -> List:
    """Return a list of all Marathon Services that satisfy the predicate."""
    response = requests.get(MARATHON_URL)
    if response.status_code == requests.codes.ok:
        services = [x for x in response.json()['apps'] if predicate(x)]
        if log:
            log.info('Got {} services from Marathon.'.format(len(services)))
        return services
    else:
        if log:
            log.warning('Bad response from Marathon Service Discovery.')
        return []
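
# Hypothetical usage: the predicate receives each Marathon app definition (a
# dict); here we keep only apps whose 'id' starts with an illustrative prefix.
cranial_apps = get_services_with_predicate(
    lambda app: app['id'].startswith('/cranial'))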


def get_tasks_for_service(service_id: str, portIndex: int = 0) -> List[str]:
    """Return a list of ip:portIndex for all tasks belonging to the service.

    The service_id is the string including the leading /, as given by the 'id'
    field for the service definition.
Example 5
import io
import logging
import os
import shutil
import subprocess
import tempfile
from typing import Dict

import boto3

from cranial.connectors import base
from cranial.common import logger

log = logger.get('S3_LOGLEVEL', name='s3_fetchers')

SOURCE_DIR = 'storage/source'
TARGET_DIR = 'storage/target'
MODEL_DIR = 'storage/model'


def cleanup_temp_data():
    """
    delete temp dirs
    """
    try:
        shutil.rmtree(SOURCE_DIR)
        log.info("Removed {}".format(SOURCE_DIR))
    except Exception as e:
        log.info(e)
    try:
        shutil.rmtree(TARGET_DIR)
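Example 6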
"""
tokenizers that use spacy
"""
import spacy

from cranial.common import logger
from cranial.re_iter import ReGenerator
from cranial.model_base import ModelBase
from cranial.models.tokenizers import add_n_grams

log = logger.get(name='tokenizers_spacy', var='MODELS_LOGLEVEL')  # streaming log


class SpacyWrapper(ModelBase):
    name = 'spacy_wrapper'

    def __init__(self, lang='en', in_field=None, out_field=None, batch_size=10000, n_threads=1, **spacy_load_params):
        """
        Use spaCy to transform text records into spacy document objects.

        Parameters
        ----------
        lang
            spaCy language model to load (passed to spacy.load)

        in_field
            name of the input field on each record to read text from

        out_field
            name of the output field on each record to write the resulting document to

        batch_size, n_threads
            passed to spaCy's nlp.pipe when processing a stream of texts

        spacy_load_params
            any other kwargs passed to spacy.load
        """
Example 7
import pickle
import json
from typing import Iterable
from cachetools import TTLCache

from cranial.common import logger

log = logger.get(name='cranial.keyvalue')

# Default database column type for values when blob is False
DEFAULT_TYPE = 'text'


class KeyValueDB(object):
    '''This object should mostly work like a dictionary, except that it reads
    from and writes to an external database. Subclasses can implement caching
    as appropriate.

    This class assumes that the database enforces uniqueness of the keys. I.e.,
    all SELECT queries will have LIMIT 1.

    >>> kv = get_mock()
    >>> kv['foo'] = 'a'
    >>> kv[1] = 'int'
    >>> kv.cache_clear()
    >>> kv['foo']
    'a'
    >>> kv[1]
    'int'
    >>> kv['bar'] = 'b'
    >>> kv.cache_clear()
Example 8
import logging
import io
import os

from google.cloud.storage import Client, Blob
from google.oauth2 import service_account

from cranial.connectors import base
from cranial.common import logger

log = logger.get('GOOGLECLOUDSTORAGE_LOGLEVEL', name='gcs_connector')


class InMemoryConnector(base.Connector):
    def __init__(self,
                 bucket,
                 prefix='',
                 binary=True,
                 do_read=False,
                 credentials=None,
                 project=None):
        super().__init__(base_address=prefix, binary=binary, do_read=do_read)

        params = {'project': project}
        if credentials:
            creds = service_account.Credentials.from_service_account_info(
                credentials)
            params['credentials'] = creds
        self.bucket = Client(**params).get_bucket(bucket)
        self.bucket_name = bucket
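
# Hypothetical usage (bucket name and prefix are illustrative); with no
# credentials argument the google-cloud-storage Client falls back to the
# environment's application-default credentials:
connector = InMemoryConnector(bucket='my-bucket', prefix='models/')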
Example 9
import unittest
import os
from cranial.common import logger
from cranial.model_base import State, StatefulModel, ModelBase

log = logger.get(name='test_model_base')


class DummyModel(ModelBase):
    def transform(self, record):
        return record * 2


class DummyStateful(StatefulModel):
    def transform(self, record):
        return record * self.state.n

    def train(self, iterable):
        c = 0
        for _ in iterable:
            c += 1
        self.state.n = c


class TestModelBase(unittest.TestCase):
    def test_State_save(self):
        s = State()
        s.foo = 'bar'
        s.save('tmp_state')
        actual = os.path.isfile('tmp_state')
        os.unlink('tmp_state')
Example 10
# END_TIME          end date for update data
"""
from docopt import docopt
import arrow
import os
import traceback
import json

from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3
from cranial.common import logger

# these are supposed to exist at the target location
from model import Model, Dataset, BUCKET, MODEL_PREFIX

log = logger.get(name='train', var='TRAIN_LOGLEVEL')  # streaming log


def train(opts, **kwargs):
    """
    Train a model
    Parameters
    ----------
    opts
        a config: nested dictionary of options
    Returns
    -------
    """
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    connector = S3InMemoryConnector(bucket=BUCKET, prefix=MODEL_PREFIX)
Example 11
import traceback
import os

# 3rd-party modules.
from docopt import docopt

# 1st-party modules.
from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3

# these are supposed to exist at the target location
from cranial.common import logger
from cranial.listeners.kafka import Listener as Kafka
from model import Model, Dataset, Consumer, BUCKET, MODEL_PREFIX

log = logger.get(name='kafka_process', var='MODELS_LOGLEVEL')


def run(opts):
    """ The server expects a \t separated ascii byte string of
          (user id, slug, yaml dict of additional parameters)
        and returns a \t separated list starting with the recommender algo
        version used, followed by a list of recommended slugs.

        @see parse_msg_list()
    """
    # make a var for convenience
    restart_every = opts['restart_all_every_n']
    restart_every = None if restart_every is None else int(restart_every)

    # init model, consumer and listener for the first time
Example 12
import arrow

# 3rd-party modules.
from docopt import docopt

# 1st-party modules.
from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3
from cranial.listeners.zmq import Listener as Zmq
from cranial.messaging.adapters import firehose_async
from cranial.common import logger

# these are supposed to exist at the target location
from model import Model, BUCKET, MODEL_PREFIX

log = logger.get(var='SERVING_LOGLEVEL', name='serving')


def start_model(opts, **kwargs):
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    if opts.get('try_load', True):
        try:
            connector = S3InMemoryConnector(bucket=BUCKET, prefix=MODEL_PREFIX)
            m.load(connector=connector)
        except Exception as e:
            msg = """Did not load a state, will run with default initialized state.
                       Reason: {}"""
            log.warning(msg.format(e))
    else:
        log.warning("Config file specified that model does not load saved state, make sure this is correct")
    return m
Example 13
from datetime import datetime
import io
import os
from tempfile import mkstemp
from typing import List, IO  # noqa

from cranial.connectors import base
from cranial.common import logger

log = logger.get(name='local_fetchers')  # streaming log


def file_readlines(fp, delete_after=False):
    """
    Memory-efficient iterator over the lines of a file (unlike the readlines()
    method, which reads the whole file into memory).

    Parameters
    ----------
    fp
        path to a local file, e.g. a decompressed file downloaded from an S3 key
    delete_after
        delete the file after it has been read
    Returns
    -------
        generator of lines
    """
    with open(fp) as f:
        while True:
            line = f.readline()
            if line:
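Example 14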
import unittest
from cranial.common import logger
from cranial.model_base import StatefulModel
from cranial.online_training import OnlineLearningWrapper, TrainerBase, \
    AccumulatorBase, CountSchedule

log = logger.get(name='test_online_learning')


class DummyModel(StatefulModel):
    def __init__(self):
        super(DummyModel, self).__init__()
        self.state.n = 0

    def transform(self, record):
        return record * self.state.n

    def update(self, iterable):
        for _ in iterable:
            self.state.n += 1
        return self

    def train(self, iterable):
        return self.update(iterable)


class DummyTrainer(TrainerBase):
    is_ready = True

    def update(self, model, data):
        model.state.n = 1
Example 15
import os
from time import sleep, time
from typing import Callable, Dict, List, Optional, Tuple  # noqa

from cachetools import cached, TTLCache
from docopt import docopt
from recordclass import RecordClass
import ujson as json

import cranial.messaging  # noqa; For Typing.
from cranial.messaging.base import Message, Notifier
import cranial.common.config as config
import cranial.common.logger as logger
from cranial.common.utils import dieIf, warnIf

logging = logger.get()

opts = docopt(__doc__)

if opts.get('--list'):
    import pkgutil
    import cranial.listeners as L
    import cranial.messaging as N
    print('Built-in Protocols\n==================')
    for pkg, name in [(L, "Listeners"), (N, "Notifiers")]:
        print("\n" + name + "\n----------------")
        prefix = pkg.__name__ + '.'
        for info in pkgutil.iter_modules(pkg.__path__, prefix):
            mod = info.name.split('.')[-1]
            if mod not in ['base', 'file']:
                print(mod)
Example 16
                    other by config only, make sure you also set different model names
--model_suffix=<s>  same config but modify model name by adding a suffix
"""
from docopt import docopt
import os
import traceback
import json

from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3
from cranial.common import logger

# these are supposed to exist at the target location
from model import Model, Dataset, Consumer, BUCKET, MODEL_PREFIX

log = logger.get(name='backfill_job', var='BACKFILL_LOGLEVEL')  # streaming log


def backfill(opts, **kwargs):
    """
    Process data with a model.
    Parameters
    ----------
    opts
        options dictionary; should have
            'input_data_params' - parameters to instantiate a dataset that creates a stream of raw data to process,
            'model_params' - parameters needed to instantiate the model,
            'output_data_params' - parameters of a consumer of the processed data, e.g. params for additional
                                    post-model transformations, parameters of the destination for outputting results, etc.
    """
Example 17
"""
Helper module for example applications. Mimics ZeroMQ Guide's zhelpers.h.
"""

import binascii
import os
from random import randint
from time import time

import zmq

from cranial.common import logger

log = logger.get('ZMQ_LOGLEVEL')
default_context = zmq.Context.instance()


def socket_set_hwm(socket, hwm=-1):
    """libzmq 2/3/4 compatible sethwm"""
    try:
        socket.sndhwm = socket.rcvhwm = hwm
    except AttributeError:
        socket.hwm = hwm
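
# Example usage (the socket type is illustrative): cap send/receive queues at
# 1000 messages on a new socket from the shared context.
push_sock = default_context.socket(zmq.PUSH)
socket_set_hwm(push_sock, 1000)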


def dump(msg_or_socket):
    """Receives all message parts from socket, printing each frame neatly"""
    if isinstance(msg_or_socket, zmq.Socket):
        # it's a socket: receive the current multipart message from it
        msg = msg_or_socket.recv_multipart()
    else:
Example 18
element at a time and not loading all available data into memory, but iteration can be done more than once.
"""

import time
import uuid
import os
import json
import random

# using pathos' ProcessPool because it relies on dill instead of pickle, so lambdas, closures and functools partials can be used
from pathos.pools import ProcessPool
from multiprocessing.pool import ThreadPool

from cranial.common import logger

log = logger.get(name='re_iter', var='REITER_LOGLEVEL')


class ReIterBase(object):
    """
    Base class; not meant to be used by itself.
    Defines convenience methods for re-iterable wrappers.
    """
    def __init__(self, iterable_input=None, name='', verbose=True):
        self.iterable_input = iterable_input
        self.name = name
        self.verbose = verbose
        self.iter_counter = 0
        self.item_counter = 0
        self.time_start = None
        self._curr_generator = None
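
# The core idea of this module, illustrated with ReGenerator (defined further
# down in the real module): wrap a generator *factory* so the stream can be
# consumed more than once without materializing it in memory.
squares = ReGenerator(lambda: (x * x for x in range(3)))
assert list(squares) == [0, 1, 4]
assert list(squares) == [0, 1, 4]  # a second pass works, unlike a plain generator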
Example 19
"""
base classes for models
"""
from abc import ABCMeta, abstractmethod
import os
from collections import OrderedDict
from cranial.common import logger
from cranial.re_iter import ReMap

log = logger.get(name='model_base', var='MODELS_LOGLEVEL')  # streaming log

# Optional packages
try:
    import dill as pickle
except ImportError:
    import pickle

try:
    import numpy as np
except ImportError as e:
    log.info("Failed to import optional package: {}: {}.".format(type(e), e))


class NoMatch(Exception):
    pass


class State(metaclass=ABCMeta):
    """ An object with save() & load() methods and any other properties.

    Notice that `Foo` below does NOT inherit from State.
Example 20
    IO,
    Optional,
    Set,  # noqa
    Tuple,
    Union,
    TYPE_CHECKING)  # noqa

from recordclass import structclass
from recordclass.recordobject import recordobject
import ujson as json

from cranial.common import logger
import cranial.servicediscovery.base as sd
from cranial.servicediscovery import marathon

log = logger.get('MESSAGING_LOGLEVEL')

StructClass = recordobject


class Serde(metaclass=ABCMeta):
    @classmethod
    def __subclasshook__(cls, ClassObject):
        """This hook cases `isinstance(x, Serde)` to be True for any x which is
        an object or class having both loads() and dumps() methods."""
        if cls is Serde:
            if any("loads" in B.__dict__ and "dumps" in B.__dict__
                   for B in ClassObject.__mro__):
                return True
        return NotImplemented
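
# Illustration of the hook above: any class whose MRO provides both loads() and
# dumps() is treated as a Serde without explicit inheritance. (JsonSerde is a
# hypothetical example class, not part of the library.)
class JsonSerde:
    @staticmethod
    def dumps(obj):
        return json.dumps(obj)

    @staticmethod
    def loads(s):
        return json.loads(s)

assert issubclass(JsonSerde, Serde)
assert isinstance(JsonSerde(), Serde)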
Example 21
"""
nlp models
"""
import collections
import os
import numpy as np
from collections import Counter

from cranial.common import logger
from cranial.model_base import StatefulModel

log = logger.get(name='nlp_models', var='MODELS_LOGLEVEL')


class BasicDictionary(StatefulModel):
    name = 'basic_dictionary'

    def __init__(self,
                 no_below_raw,
                 no_above_raw,
                 max_num_raw,
                 no_below,
                 no_above,
                 max_num,
                 filter_at=100000,
                 token_is_tuple=False,
                 protected_tokens=None,
                 **kwargs):
        """
        A custom class for creating a dictionary of tokens from given texts