import unittest
import sys
sys.path.append('.')  # in case file is run from root dir
from cranial.re_iter import *
from cranial.common import logger

log = logger.get(name='test_re_iter')


def dummy_fn(x):
    return 2 * x


class TestReIter(unittest.TestCase):

    def test_ReGenerator(self):
        gen_fn = lambda: range(5)
        out = ReGenerator(gen_fn)
        actual = [_ for _ in out] + [_ for _ in out]
        expected = [_ for _ in range(5)] + [_ for _ in range(5)]
        self.assertListEqual(actual, expected,
                             'should repeat 0->4 sequence twice')

    def test_ReFilter(self):
        inpt = [0, 1, 2, 3, 4]
        out = ReFilter(iterable_input=inpt, fn=lambda x: x % 2)
        actual = [_ for _ in out] + [_ for _ in out]
        expected = [1, 3, 1, 3]
        self.assertListEqual(actual, expected,
                             'should leave only odd numbers, twice')
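
    # A sketch of a companion test for ReMap, reusing dummy_fn above (which
    # is defined but otherwise unused in this excerpt). It assumes ReMap
    # accepts the same `iterable_input` and `fn` kwargs as ReFilter; check
    # the actual signature in cranial.re_iter before relying on it.
    def test_ReMap(self):
        inpt = [0, 1, 2]
        out = ReMap(iterable_input=inpt, fn=dummy_fn)
        actual = [_ for _ in out] + [_ for _ in out]
        expected = [0, 2, 4, 0, 2, 4]
        self.assertListEqual(actual, expected,
                             'should double each element, twice')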
from abc import abstractmethod, ABCMeta
import time
from collections import deque
from concurrent.futures import ThreadPoolExecutor

from cranial.common import logger
from cranial.model_base import ModelBase, StatefulModel

log = logger.get(name='online_learning')


class TrainerBase(metaclass=ABCMeta):
    """
    Object responsible for defining when and how to update a model.

    - `is_ready` is a method that will be called every time a transform
      method of OnlineLearningWrapper is called. If it returns True, then
      OnlineLearningWrapper will try to get training data from its
      accumulator and use it to update the model, or in the case of remote
      updates will try to load a saved state from a connector.

    - `update` is a method that defines how to update: call model.update
      with accumulated data, or start loading a remotely stored state.
    """

    @abstractmethod
    def update(self, model, data):
        """
        Should take a model and training data as arguments and return an
        updated model. The updating logic can be anything: it can use the
        data, not use the data, or even completely re-instantiate the
        model. It's up to the developer and their needs.

        This method should also return True/False indicating whether the
        update was completed; this allows OnlineLearningWrapper to call
        update again, even outside of schedule, to check again if the
        update was completed
""" This file has primitive models that wrap around gensim common models such as LSI, TFIDF, etc... """ import gensim as g import os from cranial.re_iter import ReMap, DiskCache from cranial.model_base import StatefulModel, ModelBase from cranial.common import logger log = logger.get(name='gensim_models', var='MODELS_LOGLEVEL') # streaming log class GensimDictionary(StatefulModel): name = 'gensim_dictionary' def __init__(self, dict_params: dict, **kwargs): """ Wraps around gensim's Similarity index Parameters ---------- sim_params kwargs to pass to gensim's Similarity initialization This must have `output_prefix` and `num_features` kwargs any other kwargs to be passed to parent class __init__ """ super(GensimDictionary, self).__init__(**kwargs) self.params = dict_params self.state.model = None
import requests
from time import time
from typing import Any, Callable, Dict, List, Optional  # noqa

from cranial.common import logger
from cranial.servicediscovery import base

log = logger.get()

MARATHON_URL = 'http://marathon.mesos:8080/v2/apps'


def get_services_with_predicate(predicate: Callable) -> List:
    """Return a list of all Marathon Services that satisfy the predicate."""
    response = requests.get(MARATHON_URL)
    if response.status_code == requests.codes.ok:
        services = [x for x in response.json()['apps'] if predicate(x)]
        if log:
            log.info('Got {} services from Marathon.'.format(len(services)))
        return services
    else:
        if log:
            log.warn('Bad response from Marathon Service Discovery.')
        return []


def get_tasks_for_service(service_id: str, portIndex: int = 0) -> List[str]:
    """Return a list of ip:portIndex for all tasks belonging to the service.

    The service_id is the string including the leading /, as given by the
    'id' field for the service definition.
import io
import logging
import os
import shutil
import subprocess
import tempfile
from typing import Dict

import boto3

from cranial.connectors import base
from cranial.common import logger

log = logger.get('S3_LOGLEVEL', name='s3_fetchers')

SOURCE_DIR = 'storage/source'
TARGET_DIR = 'storage/target'
MODEL_DIR = 'storage/model'


def cleanup_temp_data():
    """ delete temp dirs """
    try:
        shutil.rmtree(SOURCE_DIR)
        log.info("Removed {}".format(SOURCE_DIR))
    except Exception as e:
        log.info(e)

    try:
        shutil.rmtree(TARGET_DIR)
""" tokenizers that use spacy """ import spacy from cranial.common import logger from cranial.re_iter import ReGenerator from cranial.model_base import ModelBase from cranial.models.tokenizers import add_n_grams log = logger.get(name='tokenizers_spacy', var='MODELS_LOGLEVEL') # streaming log class SpacyWrapper(ModelBase): name = 'spacy_wrapper' def __init__(self, lang='en', in_field=None, out_field=None, batch_size=10000, n_threads=1, **spacy_load_params): """ Use spaCy to transform text records into spacy document objects. Parameters ---------- min_length min number of characters for a token stop_list list of tokens to exclude n_grams add n-grams, if n_grams=2, then 'a b c' -> 'a', 'b', 'c', 'a_b', 'b_c' """
import pickle
import json
from typing import Iterable

from cachetools import TTLCache

from cranial.common import logger

log = logger.get(name='cranial.keyvalue')

# Default Database Column type for values when blob is False
DEFAULT_TYPE = 'text'


class KeyValueDB(object):
    '''This object should mostly work like a dictionary, except it reads
    and writes from an external database. Subclasses can implement caching
    as appropriate.

    This class assumes that the database enforces uniqueness of the keys.
    I.e., all SELECT queries will have LIMIT 1.

    >>> kv = get_mock()
    >>> kv['foo'] = 'a'
    >>> kv[1] = 'int'
    >>> kv.cache_clear()
    >>> kv['foo']
    'a'
    >>> kv[1]
    'int'
    >>> kv['bar'] = 'b'
    >>> kv.cache_clear()
import logging
import io
import os

from google.cloud.storage import Client, Blob
from google.oauth2 import service_account

from cranial.connectors import base
from cranial.common import logger

log = logger.get('GOOGLECLOUDSTORAGE_LOGLEVEL', name='gcs_connector')


class InMemoryConnector(base.Connector):
    def __init__(self, bucket, prefix='', binary=True, do_read=False,
                 credentials=None, project=None):
        super().__init__(base_address=prefix, binary=binary, do_read=do_read)
        params = {'project': project}
        if credentials:
            creds = service_account.Credentials.from_service_account_info(
                credentials)
            params['credentials'] = creds
        self.bucket = Client(**params).get_bucket(bucket)
        self.bucket_name = bucket
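
    def _example_read(self, key: str) -> io.BytesIO:
        """Hypothetical helper, not in the original class: sketches how a
        blob could be pulled into memory with the google-cloud-storage API.
        get_blob() fetches the blob handle; download_as_string() returns
        its bytes, wrapped in BytesIO for a file-like interface."""
        blob = self.bucket.get_blob(key)
        return io.BytesIO(blob.download_as_string())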
import unittest
import os

from cranial.common import logger
from cranial.model_base import State, StatefulModel, ModelBase

log = logger.get(name='test_model_base')


class DummyModel(ModelBase):
    def transform(self, record):
        return record * 2


class DummyStateful(StatefulModel):
    def transform(self, record):
        return record * self.state.n

    def train(self, iterable):
        c = 0
        for _ in iterable:
            c += 1
        self.state.n = c


class TestModelBase(unittest.TestCase):
    def test_State_save(self):
        s = State()
        s.foo = 'bar'
        s.save('tmp_state')
        actual = os.path.isfile('tmp_state')
        os.unlink('tmp_state')
# END_TIME             end date for update data
"""
from docopt import docopt
import arrow
import os
import traceback
import json

from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3
from cranial.common import logger

# these are supposed to exist at the target location
from model import Model, Dataset, BUCKET, MODEL_PREFIX

log = logger.get(name='train', var='TRAIN_LOGLEVEL')  # streaming log


def train(opts, **kwargs):
    """
    Train a model

    Parameters
    ----------
    opts
        a config - nested dictionary with options

    Returns
    -------
    """
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    connector = S3InMemoryConnector(bucket=BUCKET, prefix=MODEL_PREFIX)
import traceback
import os

# 3rd-party modules.
from docopt import docopt

# 1st-party modules.
from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3
from cranial.common import logger
from cranial.listeners.kafka import Listener as Kafka

# these are supposed to exist at the target location
from model import Model, Dataset, Consumer, BUCKET, MODEL_PREFIX

log = logger.get(name='kafka_process', var='MODELS_LOGLEVEL')


def run(opts):
    """
    The server expects a \t separated ascii byte string of (user id, slug,
    yaml dict of additional parameters) and returns a \t separated list
    starting with the recommender algo version used, followed by a list of
    recommended slugs. @see parse_msg_list()
    """
    # make a var for convenience
    restart_every = opts['restart_all_every_n']
    restart_every = None if restart_every is None else int(restart_every)

    # init model, consumer and listener for the first time
import arrow

# 3rd-party modules.
from docopt import docopt

# 1st-party modules.
from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3
from cranial.listeners.zmq import Listener as Zmq
from cranial.messaging.adapters import firehose_async
from cranial.common import logger

# these are supposed to exist at the target location
from model import Model, BUCKET, MODEL_PREFIX

log = logger.get(var='SERVING_LOGLEVEL', name='serving')


def start_model(opts, **kwargs):
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    if opts.get('try_load', True):
        try:
            connector = S3InMemoryConnector(bucket=BUCKET,
                                            prefix=MODEL_PREFIX)
            m.load(connector=connector)
        except Exception as e:
            msg = """Did not load a state, will run with default
            initialized state. Reason: {}"""
            log.warning(msg.format(e))
    else:
        log.warning("Config file specified that model does not load saved "
                    "state; make sure this is correct.")
    return m
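

if __name__ == '__main__':
    # Hedged usage sketch, not from the original file (the real entry point
    # presumably parses a config via docopt): start_model only needs an
    # opts mapping with the keys read above.
    example_opts = {
        'model_name': 'demo',
        'model_params': {},
        'try_load': False,  # skip the S3 state load in this illustration
    }
    model = start_model(example_opts)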
from datetime import datetime
import io
import os
from tempfile import mkstemp
from typing import List, IO  # noqa

from cranial.connectors import base
from cranial.common import logger

log = logger.get(name='local_fetchers')  # streaming log


def file_readlines(fp, delete_after=False):
    """
    Memory-efficient iterator to read lines from a file (the readlines()
    method reads the whole file).

    Parameters
    ----------
    fp
        path to a decompressed file downloaded from an s3 key

    delete_after
        delete the file after it was read

    Returns
    -------
        generator of lines
    """
    with open(fp) as f:
        while True:
            line = f.readline()
            if line:
import unittest

from cranial.common import logger
from cranial.model_base import StatefulModel
from cranial.online_training import OnlineLearningWrapper, TrainerBase, \
    AccumulatorBase, CountSchedule

log = logger.get(name='test_online_learning')


class DummyModel(StatefulModel):
    def __init__(self):
        super(DummyModel, self).__init__()
        self.state.n = 0

    def transform(self, record):
        return record * self.state.n

    def update(self, iterable):
        for _ in iterable:
            self.state.n += 1
        return self

    def train(self, iterable):
        return self.update(iterable)


class DummyTrainer(TrainerBase):
    is_ready = True

    def update(self, model, data):
        model.state.n = 1
import os
from time import sleep, time
from typing import Callable, Dict, List, Optional, Tuple  # noqa

from cachetools import cached, TTLCache
from docopt import docopt
from recordclass import RecordClass
import ujson as json

import cranial.messaging  # noqa; For Typing.
from cranial.messaging.base import Message, Notifier
import cranial.common.config as config
import cranial.common.logger as logger
from cranial.common.utils import dieIf, warnIf

logging = logger.get()

opts = docopt(__doc__)

if opts.get('--list'):
    import pkgutil
    import cranial.listeners as L
    import cranial.messaging as N

    print('Built-in Protocols\n==================')
    for pkg, name in [(L, "Listeners"), (N, "Notifiers")]:
        print("\n" + name + "\n----------------")
        prefix = pkg.__name__ + '.'
        for info in pkgutil.iter_modules(pkg.__path__, prefix):
            mod = info.name.split('.')[-1]
            if mod not in ['base', 'file']:
                print(mod)
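
    # Sketch of the output shape (not from the original source; the exact
    # module list depends on what is installed). The kafka and zmq listeners
    # are imported elsewhere in this repo, so one would expect roughly:
    #
    #   Built-in Protocols
    #   ==================
    #
    #   Listeners
    #   ----------------
    #   kafka
    #   zmq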
    other by config only, make sure you also set different model names
    --model_suffix=<s>   same config but modify model name by adding a suffix
"""
from docopt import docopt
import os
import traceback
import json

from cranial.fetchers import S3InMemoryConnector
from cranial.fetchers.s3 import from_s3
from cranial.common import logger

# these are supposed to exist at the target location
from model import Model, Dataset, Consumer, BUCKET, MODEL_PREFIX

log = logger.get(name='backfill_job', var='BACKFILL_LOGLEVEL')  # streaming log


def backfill(opts, **kwargs):
    """
    processes data by model

    Parameters
    ----------
    opts
        options dictionary, should have:
        'input_data_params' - parameters to instantiate a dataset that
            creates a stream of raw data to process,
        'model_params' - model parameters needed to instantiate the model,
        'output_data_params' - parameters of a consumer of processed data,
            things like params for additional post-model transformations,
            parameters of the destination for outputting results, etc...
    """
""" Helper module for example applications. Mimics ZeroMQ Guide's zhelpers.h. """ import binascii import os from random import randint from time import time import zmq from cranial.common import logger log = logger.get('ZMQ_LOGLEVEL') default_context = zmq.Context.instance() def socket_set_hwm(socket, hwm=-1): """libzmq 2/3/4 compatible sethwm""" try: socket.sndhwm = socket.rcvhwm = hwm except AttributeError: socket.hwm = hwm def dump(msg_or_socket): """Receives all message parts from socket, printing each frame neatly""" if isinstance(msg_or_socket, zmq.Socket): # it's a socket, call on current message msg = msg_or_socket.recv_multipart() else:
element at a time and not loading all available data into memory, but
iteration can be done more than once.
"""
import time
import uuid
import os
import json
import random

# pathos is used here because it serializes with dill instead of pickle,
# so lambdas, closures and functools objects can be passed to workers.
from pathos.pools import ProcessPool
from multiprocessing.pool import ThreadPool

from cranial.common import logger

log = logger.get(name='re_iter', var='REITER_LOGLEVEL')


class ReIterBase(object):
    """
    Base class, do not use by itself.
    Defines convenience methods.
    """

    def __init__(self, iterable_input=None, name='', verbose=True):
        self.iterable_input = iterable_input
        self.name = name
        self.verbose = verbose
        self.iter_counter = 0
        self.item_counter = 0
        self.time_start = None
        self._curr_generator = None
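

# A minimal sketch (not from the original module) of the re-iteration
# pattern this file is built on: __iter__ creates a fresh generator on
# every call, so the object can be iterated repeatedly without keeping
# all the data in memory.
class _ReIterSketch:
    def __init__(self, gen_fn):
        # gen_fn: zero-argument callable returning a fresh iterable
        self.gen_fn = gen_fn

    def __iter__(self):
        return iter(self.gen_fn())

# list(_ReIterSketch(lambda: range(3))) yields [0, 1, 2] on every pass.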
""" base classes for models """ from abc import ABCMeta, abstractmethod import os from collections import OrderedDict from cranial.common import logger from cranial.re_iter import ReMap log = logger.get(name='model_base', var='MODELS_LOGLEVEL') # streaming log # Optional packages try: import dill as pickle except ImportError: import pickle try: import numpy as np except ImportError as e: log.info("Failed to import optional package: {}: {}.".format(type(e), e)) class NoMatch(Exception): pass class State(metaclass=ABCMeta): """ An object with save() & load() methods and any other properties. Notice that `Foo` below does NOT inherit from State.
                    IO, Optional, Set,  # noqa
                    Tuple, Union, TYPE_CHECKING)  # noqa

from recordclass import structclass
from recordclass.recordobject import recordobject
import ujson as json

from cranial.common import logger
import cranial.servicediscovery.base as sd
from cranial.servicediscovery import marathon

log = logger.get('MESSAGING_LOGLEVEL')

StructClass = recordobject


class Serde(metaclass=ABCMeta):
    @classmethod
    def __subclasshook__(cls, ClassObject):
        """This hook causes `isinstance(x, Serde)` to be True for any x
        which is an object or class having both loads() and dumps()
        methods."""
        if cls is Serde:
            if any("loads" in B.__dict__ and "dumps" in B.__dict__
                   for B in ClassObject.__mro__):
                return True
        return NotImplemented
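

# Quick illustration of the structural check above (not in the original
# file): a class defining both loads() and dumps() counts as a Serde
# without inheriting from it.
class _JsonSerde:
    @staticmethod
    def loads(s):
        return json.loads(s)

    @staticmethod
    def dumps(obj):
        return json.dumps(obj)

# isinstance(_JsonSerde(), Serde) evaluates to True via __subclasshook__.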
""" nlp models """ import collections import os import numpy as np from collections import Counter from cranial.common import logger from cranial.model_base import StatefulModel log = logger.get(name='nlp_models', var='MODELS_LOGLEVEL') class BasicDictionary(StatefulModel): name = 'basic_dictionary' def __init__(self, no_below_raw, no_above_raw, max_num_raw, no_below, no_above, max_num, filter_at=100000, token_is_tuple=False, protected_tokens=None, **kwargs): """ A custom class for creating a dictionary of tokens from given texts