コード例 #1
0
class AuthMiddleware(object):
    """Request middleware that authenticates callers by their auth token.

    Requests to ``/v1/auth`` and ``OPTIONS`` requests pass through without
    a token check; every other request must carry an ``Authorization``
    header.
    """

    def __init__(self, settings):
        """Create the session from the ``'aerospike'`` entry of *settings*."""
        aerospike_options = settings['aerospike']
        self.session = AerospikeSession(**aerospike_options)

    def process_request(self, req, resp):
        """Authenticate *req* and store the matching record in its context.

        On success the record found for the token is re-added to the session
        with a ``ttl`` of 3600 and placed in ``req.context['user']``.

        Raises:
            falcon.HTTPUnauthorized: when no token is supplied, or when the
                token lookup/renewal raises ``ValueError``.
        """
        token = req.get_header('Authorization')

        # Exempt requests: the auth route itself and any OPTIONS request.
        if req.path == '/v1/auth' or req.method == 'OPTIONS':
            return

        if token is None:
            raise falcon.HTTPUnauthorized(
                'Auth token required',
                'Please provide an auth token as part of the request.',
                href='http://docs.example.com/auth')

        try:
            matches = self.session.query(Auth).filter_by(token=token)
            user = matches.first()
            self.session.add(user, {'ttl': 3600})  # renew ttl
            req.context['user'] = user
        except ValueError:
            raise falcon.HTTPUnauthorized(
                'Authentication required',
                ('The provided auth token is not valid. '
                 'Please request a new token and try again.'),
                href='http://docs.example.com/auth',
                scheme='Token; UUID')
コード例 #2
0
 def _before(self, *args, **kwargs):
     """Record the current time in ``init_t`` and create the session."""
     self.init_t = time.time()
     aerospike_options = settings['aerospike']
     self.session = AerospikeSession(**aerospike_options)
コード例 #3
0
class FileToDb(GenericTask):
    """Load a tab-separated chunk file into the store, one example per row.

    The first decodable line of the file is the header. In every later row
    the first four columns are joined with dots to form the IP address and
    the remaining columns are paired with the header names from the fifth
    column on. The file is deleted once all rows have been read.
    """

    allowed_kwargs = ['file', 'interval', 'project', 'logger']

    def _before(self, *args, **kwargs):
        """Record the current time and create the session from settings."""
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def _get_file(self):
        """Open ``self.file`` for binary reading; the caller must close it."""
        return open(self.file, 'rb')

    def _fix_type(self, str):
        """Return *str* as an int if possible, else a float, else unchanged.

        The parameter name shadows the builtin ``str``; it is kept so the
        method signature stays unchanged.
        """
        for cast in (int, float):
            try:
                return cast(str)
            except ValueError:
                continue
        return str

    def execute(self, *args, **kwargs):
        """Parse every row of the chunk file and hand it to process_example.

        A row that fails to parse is logged and skipped. The file handle is
        always closed, even if reading the file raises; the file is removed
        only after the whole chunk has been read.
        """
        dt_epoch = datetime(1970, 1, 1)
        header = None

        datafile = self._get_file()
        try:
            for line in datafile:
                # Reset per row so a failure is never reported against the
                # IP address of a previously parsed row.
                ip_address = None

                try:
                    fields = line.decode('utf-8').strip().split('\t')

                    if header is None:
                        header = fields
                        continue

                    ip_address = '.'.join(fields[0:4])
                    values = [self._fix_type(value) for value in fields[4:]]
                    data = dict(zip(header[4:], values))
                    grouptime = datetime.strptime(str(data['grouptime']),
                                                  '%Y%m%d%H%M')
                    elapsed = grouptime - dt_epoch
                    common_data = {
                        'ip_address': ip_address,
                        'interval': self.interval,
                        'project': self.project.full_name,
                        # whole seconds between 1970-01-01 and grouptime
                        'period': int(elapsed.total_seconds()),
                    }
                    common_data.update(data)
                    self.process_example(Example(common_data))

                except Exception as e:
                    self.logger.info(str(e))
                    if ip_address is not None:
                        self.logger.info(
                            'unknown error with example {0}'.format(
                                ip_address))
        finally:
            # Release the handle even when iterating the file itself fails.
            datafile.close()

        self.logger.info('Chunk {0} processed'.format(self.file))

        # clean
        os.remove(self.file)

    def process_example(self, example):
        """Store *example* once for each interval of the project.

        For every interval a copy of the example is reduced to that interval;
        if the session already has a record for it, the stored record is
        added to the copy before saving. Any exception is logged and aborts
        the remaining intervals.
        """
        try:
            for interval in self.project.interval:
                current = copy.copy(example)
                current.reduce_interval(interval)
                _key, meta = self.session.exists(current)

                if meta is not None:
                    # Merge with what is already stored for this key.
                    db_example = self.session.query(Example).get(
                        current.__key__)
                    current = current + db_example
                self.session.add(current)
        except Exception as e:
            self.logger.error("Exception in process_record {0}".format(e))
コード例 #4
0
import sys
import os
sys.path.append(os.path.abspath('.'))
from model.persistence import AerospikeSession
from model.user import User
from model.project import Project
from model.example import Example
from settings import settings
import traceback
import time

# Bootstrap script: create the string indexes and an initial user.
# NOTE(review): no except/finally clause for this try is visible in this
# excerpt; the statement appears to be cut off.
try:

    session = AerospikeSession(**settings['aerospike'])

    # The trailing comma after print(...) is the Python 2 idiom for suppressing
    # the newline. NOTE(review): under Python 3 it only builds a discarded
    # tuple and the newline is still printed - confirm the target version.
    print("Creating indexes..."),
    # Arguments are presumably (namespace, set, bin, index name) - confirm
    # against the Aerospike client in use.
    session.aerospike.index_string_create(settings['aerospike']['namespace'], 'user', 'name', 'idx_name')
    session.aerospike.index_string_create(settings['aerospike']['namespace'], 'user', 'private_token', 'idx_private_token')
    session.aerospike.index_string_create(settings['aerospike']['namespace'], 'auth', 'token', 'idx_token')
    session.aerospike.index_string_create(settings['aerospike']['namespace'], 'project', 'full_name', 'idx_full_name')
    session.aerospike.index_string_create(settings['aerospike']['namespace'], 'project', 'user', 'idx_projectuser')
    session.aerospike.index_string_create(settings['aerospike']['namespace'], 'example', 'p_interval', 'idx_interval')
    session.aerospike.index_string_create(settings['aerospike']['namespace'], 'example', 'ip_address', 'idx_ip')
    print("ok")

    print("Creating user..."),
    # Build the initial user, set its password via make_password and save it.
    user = User({'name': 'jmpeso', 'email': '*****@*****.**'})
    user.password = user.make_password('1234')
    session.add(user)

    # Report the user's name together with its private_token.
    print('ok (auth token for {0} is {1})'.format(user.name, user.private_token))
コード例 #5
0
 def _before(self, *args, **kwargs):
     """Record the current time in ``init_t`` and create the session."""
     self.init_t = time.time()
     aerospike_options = settings["aerospike"]
     self.session = AerospikeSession(**aerospike_options)
コード例 #6
0
class OutlierDetector(GenericTask):
    """Store a Prediction for every example whose density is below epsilon.

    The examples of one project/interval are loaded into a numpy array,
    scored with a Gaussian fitted on three columns, and each row scoring
    below the project's ``cur_epsilon`` is saved as a ``Prediction``.
    """

    page_size = 1000
    limit = None
    allowed_kwargs = ["project", "logger", "interval"]

    def _before(self, *args, **kwargs):
        """Record the current time and create the session from settings."""
        self.init_t = time.time()
        self.session = AerospikeSession(**settings["aerospike"])

    def density(self, X):
        """Return the Gaussian density of each row of *X*.

        The distribution is fitted to *X* itself: per-column mean and a
        diagonal covariance built from the per-column variance.
        """
        mean = np.mean(X, 0)
        sigma2 = np.var(X, 0)
        cov = np.diag(sigma2)
        return multivariate_normal.pdf(X, mean=mean, cov=cov)

    def find_outliers(self, X, p, epsilons):
        """Count, for each epsilon, the rows of *X* whose density is below it.

        Returns a ``(counts, epsilons)`` tuple with counts in the same order
        as *epsilons*.
        """
        outliers = [X[p < e, :].shape[0] for e in epsilons]
        return (outliers, epsilons)

    def execute(self, *args, **kwargs):
        """Score all examples of the project/interval and save the outliers.
        """

        project_interval = "{0}_{1}".format(self.project.full_name, self.interval)

        labels, X = self.create_ndarray(
            self.session.query(Example).filter_by(p_interval=project_interval)
        )
        t2 = time.time()
        np.random.shuffle(X)

        # multivariate gauss
        t3 = time.time()
        columns_idx = [labels.index(c) for c in ("status_200", "as_bot", "as_badbot")]

        p = self.density(X[:, columns_idx])
        t4 = time.time()
        # The result is not used here; the call is kept so the
        # "find outliers" timing below still measures it.
        self.find_outliers(X, p, self.project.list_epsilons)
        t5 = time.time()

        self.logger.info("\tDimensions are {0}".format(X.shape))
        self.logger.info("\t{0}MB used in data".format(X.nbytes / 1024 / 1024))
        self.logger.info("\tshuffle done ({0:.3f}s):".format(t3 - t2))
        self.logger.info("\tmultivariate done ({0:.3f}s):".format(t4 - t3))
        self.logger.info("\tfind outliers done ({0:.3f}s):".format(t5 - t4))

        # Loop-invariant lookups, resolved once instead of once per row.
        ip_col = labels.index("ip_address")
        period_col = labels.index("period")
        cur_epsilon = self.project.cur_epsilon
        project_name = self.project.full_name

        # TAKE CARE, loop function not vectorized internally
        # NOTE(review): joining X with the float densities makes every row
        # float-typed, so "period" is passed on as a numpy float - confirm
        # the persistence layer accepts that.
        Xp = np.c_[X, p]
        for x in Xp:
            if x[-1] < cur_epsilon:
                prediction = Prediction(
                    {
                        "interval": self.interval,
                        "ip_address": self.int2ip(int(x[ip_col])),
                        "period": x[period_col],
                        "model": "gauss_multivariate",
                        "project": project_name,
                        "epsilon": float(cur_epsilon),
                        "pvalue": float(x[-1]),
                    }
                )
                self.session.add(prediction)

    def create_ndarray(self, resultset):
        """ Create a numpy ndarray with aerospike results via
            fromiter + data_generator. Returns labels and data separated.

            Labels come from the first example only; every example is
            assumed to yield exactly len(labels) values, otherwise the
            reshape fails.
        """
        labels = self.labels(resultset.first())
        X = np.fromiter(self.data_generator(resultset), np.uint32).reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """ Generator that returns each field of each example for filling
            numpy ndarray. Be aware of types because numpy ndarray must have
            all items of same dtype. Only int custom values are emitted.
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for value in example._custom_data.values():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """ Return all labels of an example. Numpy array has no labels,
            we manage in separate list (same order). If something changes
            here it must change in data_generator and vice versa.
        """
        labels = ["period", "interval", "ip_address"]
        labels.extend(
            name for name, value in example._custom_data.items()
            if isinstance(value, int)
        )
        return labels

    def ip2int(self, s):
        "Convert dotted IPv4 address to integer."
        # Plain loop instead of reduce(), which is not a builtin on Python 3.
        number = 0
        for octet in s.split("."):
            number = number << 8 | int(octet)
        return number

    def int2ip(self, ip):
        "Convert 32-bit integer to dotted IPv4 address."
        return ".".join(str(ip >> shift & 0xFF) for shift in (24, 16, 8, 0))
コード例 #7
0
 def __init__(self, settings):
     """Keep *settings* and create the session from its 'aerospike' entry."""
     self.settings = settings
     aerospike_options = settings['aerospike']
     self.session = AerospikeSession(**aerospike_options)
コード例 #8
0
class OutlierDetector(GenericTask):
    """Store a Prediction for every example whose density is below epsilon.

    The examples of one project/interval are loaded into a numpy array,
    scored with a Gaussian fitted on three columns, and each row scoring
    below the project's ``cur_epsilon`` is saved as a ``Prediction``.
    """

    page_size = 1000
    limit = None
    allowed_kwargs = [
        'project',
        'logger',
        'interval',
    ]

    def _before(self, *args, **kwargs):
        """Record the current time and create the session from settings."""
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def density(self, X):
        """Return the Gaussian density of each row of *X*.

        The distribution is fitted to *X* itself: per-column mean and a
        diagonal covariance built from the per-column variance.
        """
        mean = np.mean(X, 0)
        sigma2 = np.var(X, 0)
        cov = np.diag(sigma2)
        return multivariate_normal.pdf(X, mean=mean, cov=cov)

    def find_outliers(self, X, p, epsilons):
        """Count, for each epsilon, the rows of *X* whose density is below it.

        Returns a ``(counts, epsilons)`` tuple with counts in the same order
        as *epsilons*.
        """
        outliers = [X[p < e, :].shape[0] for e in epsilons]
        return (outliers, epsilons)

    def execute(self, *args, **kwargs):
        """Score all examples of the project/interval and save the outliers.
        """

        project_interval = "{0}_{1}".format(self.project.full_name,
                                            self.interval)

        labels, X = self.create_ndarray(
            self.session.query(Example).filter_by(p_interval=project_interval)
        )
        t2 = time.time()
        np.random.shuffle(X)

        # multivariate gauss
        t3 = time.time()
        columns_idx = [
            labels.index(c) for c in ('status_200', 'as_bot', 'as_badbot')
        ]

        p = self.density(X[:, columns_idx])
        t4 = time.time()
        # The result is not used here; the call is kept so the
        # "find outliers" timing below still measures it.
        self.find_outliers(X, p, self.project.list_epsilons)
        t5 = time.time()

        self.logger.info("\tDimensions are {0}".format(X.shape))
        self.logger.info("\t{0}MB used in data".format(X.nbytes / 1024 / 1024))
        self.logger.info("\tshuffle done ({0:.3f}s):".format(t3 - t2))
        self.logger.info("\tmultivariate done ({0:.3f}s):".format(t4 - t3))
        self.logger.info("\tfind outliers done ({0:.3f}s):".format(t5 - t4))

        # Loop-invariant lookups, resolved once instead of once per row.
        ip_col = labels.index('ip_address')
        period_col = labels.index('period')
        cur_epsilon = self.project.cur_epsilon
        project_name = self.project.full_name

        # TAKE CARE, loop function not vectorized internally
        # NOTE(review): joining X with the float densities makes every row
        # float-typed, so 'period' is passed on as a numpy float - confirm
        # the persistence layer accepts that.
        Xp = np.c_[X, p]
        for x in Xp:
            if x[-1] < cur_epsilon:
                prediction = Prediction({
                    'interval': self.interval,
                    'ip_address': self.int2ip(int(x[ip_col])),
                    'period': x[period_col],
                    'model': 'gauss_multivariate',
                    'project': project_name,
                    'epsilon': float(cur_epsilon),
                    'pvalue': float(x[-1]),
                })
                self.session.add(prediction)

    def create_ndarray(self, resultset):
        """ Create a numpy ndarray with aerospike results via
            fromiter + data_generator. Returns labels and data separated.

            Labels come from the first example only; every example is
            assumed to yield exactly len(labels) values, otherwise the
            reshape fails.
        """
        labels = self.labels(resultset.first())
        X = np.fromiter(self.data_generator(resultset),
                        np.uint32).reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """ Generator that returns each field of each example for filling
            numpy ndarray. Be aware of types because numpy ndarray must have
            all items of same dtype. Only int custom values are emitted.
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for value in example._custom_data.values():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """ Return all labels of an example. Numpy array has no labels,
            we manage in separate list (same order). If something changes
            here it must change in data_generator and vice versa.
        """
        labels = ['period', 'interval', 'ip_address']
        labels.extend(
            name for name, value in example._custom_data.items()
            if isinstance(value, int)
        )
        return labels

    def ip2int(self, s):
        "Convert dotted IPv4 address to integer."
        # Plain loop instead of reduce(), which is not a builtin on Python 3.
        number = 0
        for octet in s.split("."):
            number = number << 8 | int(octet)
        return number

    def int2ip(self, ip):
        "Convert 32-bit integer to dotted IPv4 address."
        return ".".join(str(ip >> shift & 0xFF) for shift in (24, 16, 8, 0))
コード例 #9
0
class SimpleStats(GenericTask):
    """Task that saves per-feature summary statistics for a project interval."""

    allowed_kwargs = ['project', 'interval', 'logger']

    def _before(self, *args, **kwargs):
        """Record the current time and create the session from settings."""
        self.init_t = time.time()
        aerospike_options = settings['aerospike']
        self.session = AerospikeSession(**aerospike_options)

    def execute(self, *args, **kwargs):
        """ Compute simple statistics over all examples of a project/interval.

            max, min, mean, std, var and median are computed in one pass over
            all feature columns; histogram and percentiles are computed per
            feature. Each feature is then saved as one record whose entries
            are named "<stat><interval>".
        """
        project_interval = "{0}_{1}".format(
            self.project.full_name, self.interval)
        resultset = self.session.query(Example).filter_by(
            p_interval=project_interval)
        labels, X = self.create_ndarray(resultset)

        # The first three columns are period, interval and ip_address.
        feature_columns = X[:, 3:]
        reducers = (
            ('max', np.amax),
            ('min', np.amin),
            ('mean', np.mean),
            ('std', np.std),
            ('var', np.var),
            ('median', np.median),
        )
        tmp_stats = dict(
            (stat_name, reducer(feature_columns, 0))
            for stat_name, reducer in reducers
        )

        # reshape by feature
        for xkey, feature_name in enumerate(labels[3:], 3):
            if feature_name == 'grouptime':
                continue

            feature_index = xkey - 3
            feature = Feature({
                'project': self.project.full_name,
                'name': feature_name,
            })
            column = X[:, xkey]
            hist, bin_hedges = np.histogram(column, bins=10)

            # cast to list are needed by persistence layer (array not supported)
            custom_data = {}
            custom_data['histogram{0}'.format(self.interval)] = [
                list(hist), list(bin_hedges)]
            custom_data['percentile{0}'.format(self.interval)] = list(
                np.percentile(column, [25, 50, 75]))

            for stat_name, stat_value in tmp_stats.items():
                key = '{0}{1}'.format(stat_name, self.interval)
                custom_data[key] = float(stat_value[feature_index])  # cast needed

            feature.add_custom_data(custom_data)
            self.session.add(feature)

    def create_ndarray(self, resultset):
        """ Build the label list and a 2-D uint32 array from *resultset*.

            Labels come from the first example; the flat stream produced by
            data_generator is reshaped to len(labels) columns.
        """
        labels = self.labels(resultset.first())
        flat = np.fromiter(self.data_generator(resultset), np.uint32)
        X = flat.reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """ Yield, example by example, the values that fill the ndarray:
            period, interval, the IP address as an integer, then every int
            custom value. Must stay in step with labels().
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for value in example._custom_data.values():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """ Return the column names for *example*, in the same order as the
            values emitted by data_generator(). Must stay in step with it.
        """
        labels = ['period', 'interval', 'ip_address']
        labels.extend(
            name for name, value in example._custom_data.items()
            if isinstance(value, int)
        )
        return labels

    def ip2int(self, s):
        "Convert dotted IPv4 address to integer."
        number = 0
        for octet in s.split("."):
            number = number << 8 | int(octet)
        return number

    def int2ip(self, ip):
        "Convert 32-bit integer to dotted IPv4 address."
        return ".".join(str(ip >> shift & 0xFF) for shift in (24, 16, 8, 0))
コード例 #10
0
import sys
import os

# Make the current working directory importable BEFORE the project imports
# below run; appending it after them has no effect on whether they resolve.
sys.path.append(os.path.abspath('.'))

from settings import settings
from tasks import *
from model.project import Project
from model.persistence import AerospikeSession

# Run the simple_stats task for one project with interval=720.
session = AerospikeSession(**settings['aerospike'])
project = session.query(Project).filter_by(full_name='jmpeso/my_project').first()
simple_stats(project=project, interval=720)
# Alternative task, currently disabled:
#outlier_detector(project=project, interval=30)
コード例 #11
0
class FileToDb(GenericTask):
    """Load a tab-separated chunk file into the store, one example per row.

    The first decodable line of the file is the header. In every later row
    the first four columns are joined with dots to form the IP address and
    the remaining columns are paired with the header names from the fifth
    column on. The file is deleted once all rows have been read.
    """

    allowed_kwargs = [
        'file',
        'interval',
        'project',
        'logger'
    ]

    def _before(self, *args, **kwargs):
        """Record the current time and create the session from settings."""
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def _get_file(self):
        """Open ``self.file`` for binary reading; the caller must close it."""
        return open(self.file, 'rb')

    def _fix_type(self, str):
        """Return *str* as an int if possible, else a float, else unchanged.

        The parameter name shadows the builtin ``str``; it is kept so the
        method signature stays unchanged.
        """
        for cast in (int, float):
            try:
                return cast(str)
            except ValueError:
                continue
        return str

    def execute(self, *args, **kwargs):
        """Parse every row of the chunk file and hand it to process_example.

        A row that fails to parse is logged and skipped. The file handle is
        always closed, even if reading the file raises; the file is removed
        only after the whole chunk has been read.
        """
        dt_epoch = datetime(1970, 1, 1)
        header = None

        datafile = self._get_file()
        try:
            for line in datafile:
                # Reset per row so a failure is never reported against the
                # IP address of a previously parsed row.
                ip_address = None

                try:
                    fields = line.decode('utf-8').strip().split('\t')

                    if header is None:
                        header = fields
                        continue

                    ip_address = '.'.join(fields[0:4])
                    values = [self._fix_type(value) for value in fields[4:]]
                    data = dict(zip(header[4:], values))
                    grouptime = datetime.strptime(str(data['grouptime']), '%Y%m%d%H%M')
                    elapsed = grouptime - dt_epoch
                    common_data = {
                        'ip_address': ip_address,
                        'interval': self.interval,
                        'project': self.project.full_name,
                        # whole seconds between 1970-01-01 and grouptime
                        'period': int(elapsed.total_seconds()),
                    }
                    common_data.update(data)
                    self.process_example(Example(common_data))

                except Exception as e:
                    self.logger.info(str(e))
                    if ip_address is not None:
                        self.logger.info(
                            'unknown error with example {0}'
                            .format(ip_address))
        finally:
            # Release the handle even when iterating the file itself fails.
            datafile.close()

        self.logger.info('Chunk {0} processed'.format(self.file))

        # clean
        os.remove(self.file)

    def process_example(self, example):
        """Store *example* once for each interval of the project.

        For every interval a copy of the example is reduced to that interval;
        if the session already has a record for it, the stored record is
        added to the copy before saving. Any exception is logged and aborts
        the remaining intervals.
        """
        try:
            for interval in self.project.interval:
                current = copy.copy(example)
                current.reduce_interval(interval)
                _key, meta = self.session.exists(current)

                if meta is not None:
                    # Merge with what is already stored for this key.
                    db_example = self.session.query(Example).get(current.__key__)
                    current = current + db_example
                self.session.add(current)
        except Exception as e:
            self.logger.error(
                "Exception in process_record {0}"
                .format(e))
コード例 #12
0
import sys
import os

sys.path.append(os.path.abspath('.'))
from model.persistence import AerospikeSession
from model.user import User
from model.project import Project
from model.example import Example
from settings import settings
import traceback
import time

# Bootstrap script: create the string indexes.
# NOTE(review): this excerpt ends in the middle of the last call below and
# shows no except/finally clause for the try.
try:

    session = AerospikeSession(**settings['aerospike'])

    # The trailing comma after print(...) is the Python 2 idiom for suppressing
    # the newline. NOTE(review): under Python 3 it only builds a discarded
    # tuple and the newline is still printed - confirm the target version.
    print("Creating indexes..."),
    # Arguments are presumably (namespace, set, bin, index name) - confirm
    # against the Aerospike client in use.
    session.aerospike.index_string_create(settings['aerospike']['namespace'],
                                          'user', 'name', 'idx_name')
    session.aerospike.index_string_create(settings['aerospike']['namespace'],
                                          'user', 'private_token',
                                          'idx_private_token')
    session.aerospike.index_string_create(settings['aerospike']['namespace'],
                                          'auth', 'token', 'idx_token')
    session.aerospike.index_string_create(settings['aerospike']['namespace'],
                                          'project', 'full_name',
                                          'idx_full_name')
    session.aerospike.index_string_create(settings['aerospike']['namespace'],
                                          'project', 'user', 'idx_projectuser')
    session.aerospike.index_string_create(settings['aerospike']['namespace'],
                                          'example', 'p_interval',
コード例 #13
0
class SimpleStats(GenericTask):
    """Task that saves per-feature summary statistics for a project interval."""

    allowed_kwargs = ['project', 'interval', 'logger']

    def _before(self, *args, **kwargs):
        """Record the current time and create the session from settings."""
        self.init_t = time.time()
        aerospike_options = settings['aerospike']
        self.session = AerospikeSession(**aerospike_options)

    def execute(self, *args, **kwargs):
        """ Compute simple statistics over all examples of a project/interval.

            max, min, mean, std, var and median are computed in one pass over
            all feature columns; histogram and percentiles are computed per
            feature. Each feature is then saved as one record whose entries
            are named "<stat><interval>".
        """
        project_interval = "{0}_{1}".format(self.project.full_name,
                                            self.interval)
        resultset = self.session.query(Example).filter_by(
            p_interval=project_interval)
        labels, X = self.create_ndarray(resultset)

        # The first three columns are period, interval and ip_address.
        feature_columns = X[:, 3:]
        reducers = (
            ('max', np.amax),
            ('min', np.amin),
            ('mean', np.mean),
            ('std', np.std),
            ('var', np.var),
            ('median', np.median),
        )
        tmp_stats = dict(
            (stat_name, reducer(feature_columns, 0))
            for stat_name, reducer in reducers
        )

        # reshape by feature
        for xkey, feature_name in enumerate(labels[3:], 3):
            if feature_name == 'grouptime':
                continue

            feature_index = xkey - 3
            feature = Feature({
                'project': self.project.full_name,
                'name': feature_name,
            })
            column = X[:, xkey]
            hist, bin_hedges = np.histogram(column, bins=10)

            # cast to list are needed by persistence layer (array not supported)
            custom_data = {}
            custom_data['histogram{0}'.format(self.interval)] = [
                list(hist), list(bin_hedges)]
            custom_data['percentile{0}'.format(self.interval)] = list(
                np.percentile(column, [25, 50, 75]))

            for stat_name, stat_value in tmp_stats.items():
                key = '{0}{1}'.format(stat_name, self.interval)
                custom_data[key] = float(stat_value[feature_index])  # cast needed

            feature.add_custom_data(custom_data)
            self.session.add(feature)

    def create_ndarray(self, resultset):
        """ Build the label list and a 2-D uint32 array from *resultset*.

            Labels come from the first example; the flat stream produced by
            data_generator is reshaped to len(labels) columns.
        """
        labels = self.labels(resultset.first())
        flat = np.fromiter(self.data_generator(resultset), np.uint32)
        X = flat.reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """ Yield, example by example, the values that fill the ndarray:
            period, interval, the IP address as an integer, then every int
            custom value. Must stay in step with labels().
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for value in example._custom_data.values():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """ Return the column names for *example*, in the same order as the
            values emitted by data_generator(). Must stay in step with it.
        """
        labels = ['period', 'interval', 'ip_address']
        labels.extend(
            name for name, value in example._custom_data.items()
            if isinstance(value, int)
        )
        return labels

    def ip2int(self, s):
        "Convert dotted IPv4 address to integer."
        number = 0
        for octet in s.split("."):
            number = number << 8 | int(octet)
        return number

    def int2ip(self, ip):
        "Convert 32-bit integer to dotted IPv4 address."
        return ".".join(str(ip >> shift & 0xFF) for shift in (24, 16, 8, 0))
コード例 #14
0
 def __init__(self, settings):
     """Create the session from the 'aerospike' entry of *settings*."""
     aerospike_options = settings['aerospike']
     self.session = AerospikeSession(**aerospike_options)