class AuthMiddleware(object):
    """Falcon middleware that authenticates requests via an auth token.

    The auth endpoint itself and CORS preflight (OPTIONS) requests pass
    through unauthenticated; every other request must carry a token in
    the ``Authorization`` header, which is looked up in Aerospike.
    """

    def __init__(self, settings):
        # Single Aerospike session shared by every request this
        # middleware handles.
        self.session = AerospikeSession(**settings['aerospike'])

    def process_request(self, req, resp):
        token = req.get_header('Authorization')
        # The token-issuing endpoint needs no token of its own.
        if req.path == '/v1/auth':
            return
        # CORS preflight requests carry no credentials.
        if req.method == 'OPTIONS':
            return
        if token is None:
            description = ('Please provide an auth token '
                           'as part of the request.')
            raise falcon.HTTPUnauthorized('Auth token required',
                                          description,
                                          href='http://docs.example.com/auth')
        try:
            user = self.session.query(Auth).filter_by(token=token).first()
            self.session.add(user, {'ttl': 3600})  # renew ttl
            req.context['user'] = user
        except ValueError:
            description = ('The provided auth token is not valid. '
                           'Please request a new token and try again.')
            raise falcon.HTTPUnauthorized('Authentication required',
                                          description,
                                          href='http://docs.example.com/auth',
                                          scheme='Token; UUID')
class FileToDb(GenericTask):
    """Task that loads a tab-separated chunk file into the database.

    The first line of the chunk is a header. On each data line, columns
    0-3 are the four octets of an IPv4 address and the remaining columns
    are per-interval values; every parsed line becomes an Example merged
    into each of the project's intervals. The chunk file is removed once
    it has been fully processed.
    """

    allowed_kwargs = ['file', 'interval', 'project', 'logger']

    def _before(self, *args, **kwargs):
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def _get_file(self):
        # Binary mode: lines are decoded explicitly as UTF-8 in execute().
        return open(self.file, 'rb')

    def _fix_type(self, value):
        """Best-effort coercion: int first, then float, else the raw string.

        NOTE: parameter renamed (was ``str``) so it no longer shadows the
        builtin.
        """
        try:
            return int(value)
        except ValueError:
            pass
        try:
            return float(value)
        except ValueError:
            pass
        return value

    def execute(self, *args, **kwargs):
        header = None
        dt_epoch = datetime(1970, 1, 1)
        common_data = {}
        # ``with`` guarantees the file is closed even if reading fails
        # (the original manual close() leaked the handle on I/O errors).
        with self._get_file() as datafile:
            for line in datafile:
                try:
                    line = line.decode('utf-8')
                    if header is None:
                        # First line of the chunk is the column header.
                        header = line.strip().split('\t')
                        continue
                    raw_data = line.strip().split('\t')
                    # Columns 0-3 are the four octets of the IP address.
                    ip_address = '.'.join(raw_data[0:4])
                    raw_data = [self._fix_type(value) for value in raw_data]
                    data = dict(zip(header[4:], raw_data[4:]))
                    grouptime = datetime.strptime(str(data['grouptime']),
                                                  '%Y%m%d%H%M')
                    grouptime_interval = grouptime - dt_epoch
                    common_data = {
                        'ip_address': ip_address,
                        'interval': self.interval,
                        'project': self.project.full_name,
                        # Seconds since epoch of the example's time bucket.
                        'period': int(grouptime_interval.total_seconds()),
                    }
                    common_data.update(data)
                    example = Example(common_data)
                    self.process_example(example)
                except Exception as e:
                    # Best-effort loader: a bad line is logged, not fatal.
                    self.logger.info(str(e))
                    if 'ip_address' in common_data:
                        self.logger.info(
                            'unknown error with example {0}'.format(
                                common_data['ip_address']))
        self.logger.info('Chunk {0} processed'.format(self.file))
        # clean
        os.remove(self.file)

    def process_example(self, example):
        """Merge one example into every interval configured on the project."""
        try:
            for interval in self.project.interval:
                current = copy.copy(example)
                current.reduce_interval(interval)
                key, meta = self.session.exists(current)
                if meta is None:
                    # First sighting for this interval: plain insert.
                    self.session.add(current)
                else:
                    # Merge with the stored example before re-saving.
                    db_example = self.session.query(Example).get(
                        current.__key__)
                    current = current + db_example
                    self.session.add(current)
        except Exception as e:
            self.logger.error("Exception in process_record {0}".format(e))
session = AerospikeSession(**settings['aerospike']) print("Creating indexes..."), session.aerospike.index_string_create(settings['aerospike']['namespace'], 'user', 'name', 'idx_name') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'user', 'private_token', 'idx_private_token') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'auth', 'token', 'idx_token') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'project', 'full_name', 'idx_full_name') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'project', 'user', 'idx_projectuser') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'example', 'p_interval', 'idx_interval') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'example', 'ip_address', 'idx_ip') print("ok") print("Creating user..."), user = User({'name': 'jmpeso', 'email': '*****@*****.**'}) user.password = user.make_password('1234') session.add(user) print('ok (auth token for {0} is {1})'.format(user.name, user.private_token)) assert user.check_password('1234') print("Creating project..."), project = Project({ 'user': '******', 'name': 'my_project', 'logformat': 'ApacheCombinedFormat', 'logdir': '/opt/data/', 'logpattern': 'myproject-files-*', 'hosts': [ 'host1.domain.com', 'host2.domain.com', 'host3.domain.com',
class OutlierDetector(GenericTask):
    """Task that flags anomalous examples with a multivariate Gaussian.

    Examples for one project/interval are loaded into a numpy matrix, a
    Gaussian density is fitted over selected columns, and every row whose
    probability falls below the project's current epsilon is stored as a
    Prediction.
    """

    page_size = 1000
    limit = None
    allowed_kwargs = ["project", "logger", "interval"]

    def _before(self, *args, **kwargs):
        self.init_t = time.time()
        self.session = AerospikeSession(**settings["aerospike"])

    def density(self, X):
        """Density of each row of X under a diagonal Gaussian fitted to X
        itself (features treated as independent)."""
        mean = np.mean(X, 0)
        sigma2 = np.var(X, 0)
        cov = np.diag(sigma2)
        return multivariate_normal.pdf(X, mean=mean, cov=cov)

    def find_outliers(self, X, p, epsilons):
        """Count, for every candidate epsilon, how many rows fall below it."""
        counts = [X[p < e, :].shape[0] for e in epsilons]
        return (counts, epsilons)

    def execute(self, *args, **kwargs):
        """Run outlier detection for the task's project and interval."""
        project_interval = "{0}_{1}".format(self.project.full_name,
                                            self.interval)
        t1 = time.time()
        labels, X = self.create_ndarray(
            self.session.query(Example).filter_by(p_interval=project_interval)
        )
        t2 = time.time()
        np.random.shuffle(X)

        # multivariate gauss
        t3 = time.time()
        columns = ["status_200", "as_bot", "as_badbot"]
        columns_idx = [labels.index(name) for name in columns]
        p = self.density(X[:, columns_idx])
        t4 = time.time()
        outliers, epsilon = self.find_outliers(X, p,
                                               self.project.list_epsilons)
        t5 = time.time()

        self.logger.info("\tDimensions are {0}".format(X.shape))
        self.logger.info("\t{0}MB used in data".format(X.nbytes / 1024 / 1024))
        self.logger.info("\tshuffle done ({0:.3f}s):".format(t3 - t2))
        self.logger.info("\tmultivariate done ({0:.3f}s):".format(t4 - t3))
        self.logger.info("\tfind outliers done ({0:.3f}s):".format(t5 - t4))

        # TAKE CARE, loop function not vectorized internally
        Xp = np.c_[X, p]
        for x in Xp:
            if x[-1] < self.project.cur_epsilon:
                prediction = Prediction({
                    "interval": self.interval,
                    "ip_address": self.int2ip(
                        int(x[labels.index("ip_address")])),
                    "period": x[labels.index("period")],
                    "model": "gauss_multivariate",
                    "project": self.project.full_name,
                    "epsilon": float(self.project.cur_epsilon),
                    "pvalue": float(x[-1]),
                })
                self.session.add(prediction)

    def create_ndarray(self, resultset):
        """Build (labels, X): column names plus a uint32 matrix holding one
        row per example from the aerospike resultset."""
        labels = self.labels(resultset.first())
        X = np.fromiter(self.data_generator(resultset),
                        np.uint32).reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """Yield every numeric field of every example, in labels() order.

        All yielded values must share one dtype because the target numpy
        array is homogeneous; only int-valued custom fields are emitted.
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for name, value in example._custom_data.items():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """Column names matching data_generator()'s order — any change here
        must be mirrored there, and vice versa."""
        labels = ["period", "interval", "ip_address"]
        for name, value in example._custom_data.items():
            if isinstance(value, int):
                labels.append(name)
        return labels

    def ip2int(self, s):
        """Convert a dotted IPv4 address to a 32-bit integer."""
        return reduce(lambda a, b: a << 8 | b, map(int, s.split(".")))

    def int2ip(self, ip):
        """Convert a 32-bit integer to a dotted IPv4 address."""
        return ".".join(str(ip >> n & 0xFF) for n in (24, 16, 8, 0))
class OutlierDetector(GenericTask):
    """Task that flags anomalous examples with a multivariate Gaussian.

    Examples for one project/interval are loaded into a numpy matrix, a
    Gaussian density is fitted over selected columns, and every row whose
    probability falls below the project's current epsilon is stored as a
    Prediction.
    """

    page_size = 1000
    limit = None
    allowed_kwargs = [
        'project',
        'logger',
        'interval',
    ]

    def _before(self, *args, **kwargs):
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def density(self, X):
        """Density of each row of X under a diagonal Gaussian fitted to X
        itself (features treated as independent)."""
        mean = np.mean(X, 0)
        sigma2 = np.var(X, 0)
        cov = np.diag(sigma2)
        return multivariate_normal.pdf(X, mean=mean, cov=cov)

    def find_outliers(self, X, p, epsilons):
        """Count, for every candidate epsilon, how many rows fall below it."""
        counts = [X[p < e, :].shape[0] for e in epsilons]
        return (counts, epsilons)

    def execute(self, *args, **kwargs):
        """Run outlier detection for the task's project and interval."""
        project_interval = "{0}_{1}".format(self.project.full_name,
                                            self.interval)
        t1 = time.time()
        labels, X = self.create_ndarray(
            self.session.query(Example).filter_by(p_interval=project_interval)
        )
        t2 = time.time()
        np.random.shuffle(X)

        # multivariate gauss
        t3 = time.time()
        columns = ['status_200', 'as_bot', 'as_badbot']
        columns_idx = [labels.index(name) for name in columns]
        p = self.density(X[:, columns_idx])
        t4 = time.time()
        outliers, epsilon = self.find_outliers(X, p,
                                               self.project.list_epsilons)
        t5 = time.time()

        self.logger.info("\tDimensions are {0}".format(X.shape))
        self.logger.info("\t{0}MB used in data".format(X.nbytes / 1024 / 1024))
        self.logger.info("\tshuffle done ({0:.3f}s):".format(t3 - t2))
        self.logger.info("\tmultivariate done ({0:.3f}s):".format(t4 - t3))
        self.logger.info("\tfind outliers done ({0:.3f}s):".format(t5 - t4))

        # TAKE CARE, loop function not vectorized internally
        Xp = np.c_[X, p]
        for x in Xp:
            if x[-1] < self.project.cur_epsilon:
                prediction = Prediction({
                    'interval': self.interval,
                    'ip_address': self.int2ip(
                        int(x[labels.index('ip_address')])),
                    'period': x[labels.index('period')],
                    'model': 'gauss_multivariate',
                    'project': self.project.full_name,
                    'epsilon': float(self.project.cur_epsilon),
                    'pvalue': float(x[-1])
                })
                self.session.add(prediction)

    def create_ndarray(self, resultset):
        """Build (labels, X): column names plus a uint32 matrix holding one
        row per example from the aerospike resultset."""
        labels = self.labels(resultset.first())
        X = np.fromiter(self.data_generator(resultset),
                        np.uint32).reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """Yield every numeric field of every example, in labels() order.

        All yielded values must share one dtype because the target numpy
        array is homogeneous; only int-valued custom fields are emitted.
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for name, value in example._custom_data.items():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """Column names matching data_generator()'s order — any change here
        must be mirrored there, and vice versa."""
        labels = ['period', 'interval', 'ip_address']
        for name, value in example._custom_data.items():
            if isinstance(value, int):
                labels.append(name)
        return labels

    def ip2int(self, s):
        """Convert a dotted IPv4 address to a 32-bit integer."""
        return reduce(lambda a, b: a << 8 | b, map(int, s.split(".")))

    def int2ip(self, ip):
        """Convert a 32-bit integer to a dotted IPv4 address."""
        return ".".join(str(ip >> n & 0xFF) for n in (24, 16, 8, 0))
class SimpleStats(GenericTask):
    """Task that computes summary statistics for a project's examples.

    For every numeric feature column the task stores min/max/mean/std/
    var/median plus a histogram and quartiles — one Feature record per
    feature/project, one bin per stat/interval.
    """

    allowed_kwargs = [
        'project',
        'interval',
        'logger'
    ]

    def _before(self, *args, **kwargs):
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def execute(self, *args, **kwargs):
        """Compute simple statistics over all examples of project/interval.

        All operations run over the entire matrix across columns
        (features) except percentile and histogram, which are taken per
        feature column. Results are saved in the feature set: one record
        per feature/project, one bin per stat/interval.
        """
        project_interval = "{0}_{1}".format(
            self.project.full_name, self.interval)
        labels, X = self.create_ndarray(
            self.session.query(Example).filter_by(p_interval=project_interval)
        )
        # Columns 0-2 are identifiers (period/interval/ip); stats start at 3.
        tmp_stats = {
            'max': np.amax(X[:, 3:], 0),
            'min': np.amin(X[:, 3:], 0),
            'mean': np.mean(X[:, 3:], 0),
            'std': np.std(X[:, 3:], 0),
            'var': np.var(X[:, 3:], 0),
            'median': np.median(X[:, 3:], 0),
        }

        # Reshape by feature: one Feature record per column.
        for feature_index, feature_name in enumerate(labels[3:]):
            if feature_name == 'grouptime':
                continue
            custom_data = {}
            xkey = feature_index + 3
            feature = Feature({'project': self.project.full_name,
                               'name': feature_name})
            hist, bin_edges = np.histogram(X[:, xkey], bins=10)
            # Casts to list are needed by the persistence layer
            # (arrays are not supported).
            custom_data['histogram{0}'.format(self.interval)] = [
                list(hist), list(bin_edges)]
            custom_data['percentile{0}'.format(self.interval)] = list(
                np.percentile(X[:, xkey], [25, 50, 75]))
            for stat_name, stat_value in tmp_stats.items():
                name = '{0}{1}'.format(stat_name, self.interval)
                custom_data[name] = float(stat_value[feature_index])  # cast needed
            feature.add_custom_data(custom_data)
            self.session.add(feature)

    def create_ndarray(self, resultset):
        """Build (labels, X): column names plus a uint32 matrix holding one
        row per example from the aerospike resultset."""
        labels = self.labels(resultset.first())
        X = np.fromiter(
            self.data_generator(resultset),
            np.uint32
        ).reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """Yield every numeric field of every example, in labels() order.

        All yielded values must share one dtype because the target numpy
        array is homogeneous; only int-valued custom fields are emitted.
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for name, value in example._custom_data.items():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """Column names matching data_generator()'s order — any change here
        must be mirrored there, and vice versa."""
        labels = ['period', 'interval', 'ip_address']
        for name, value in example._custom_data.items():
            if isinstance(value, int):
                labels.append(name)
        return labels

    def ip2int(self, s):
        """Convert a dotted IPv4 address to a 32-bit integer."""
        return reduce(lambda a, b: a << 8 | b, map(int, s.split(".")))

    def int2ip(self, ip):
        """Convert a 32-bit integer to a dotted IPv4 address."""
        return ".".join(str(ip >> n & 0xFF) for n in (24, 16, 8, 0))
class FileToDb(GenericTask):
    """Task that loads a tab-separated chunk file into the database.

    The first line of the chunk is a header. On each data line, columns
    0-3 are the four octets of an IPv4 address and the remaining columns
    are per-interval values; every parsed line becomes an Example merged
    into each of the project's intervals. The chunk file is removed once
    it has been fully processed.
    """

    allowed_kwargs = [
        'file',
        'interval',
        'project',
        'logger'
    ]

    def _before(self, *args, **kwargs):
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def _get_file(self):
        # Binary mode: lines are decoded explicitly as UTF-8 in execute().
        return open(self.file, 'rb')

    def _fix_type(self, value):
        """Best-effort coercion: int first, then float, else the raw string.

        NOTE: parameter renamed (was ``str``) so it no longer shadows the
        builtin.
        """
        try:
            return int(value)
        except ValueError:
            pass
        try:
            return float(value)
        except ValueError:
            pass
        return value

    def execute(self, *args, **kwargs):
        header = None
        dt_epoch = datetime(1970, 1, 1)
        common_data = {}
        # ``with`` guarantees the file is closed even if reading fails
        # (the original manual close() leaked the handle on I/O errors).
        with self._get_file() as datafile:
            for line in datafile:
                try:
                    line = line.decode('utf-8')
                    if header is None:
                        # First line of the chunk is the column header.
                        header = line.strip().split('\t')
                        continue
                    raw_data = line.strip().split('\t')
                    # Columns 0-3 are the four octets of the IP address.
                    ip_address = '.'.join(raw_data[0:4])
                    raw_data = [self._fix_type(value) for value in raw_data]
                    data = dict(zip(header[4:], raw_data[4:]))
                    grouptime = datetime.strptime(str(data['grouptime']),
                                                  '%Y%m%d%H%M')
                    grouptime_interval = grouptime - dt_epoch
                    common_data = {
                        'ip_address': ip_address,
                        'interval': self.interval,
                        'project': self.project.full_name,
                        # Seconds since epoch of the example's time bucket.
                        'period': int(grouptime_interval.total_seconds()),
                    }
                    common_data.update(data)
                    example = Example(common_data)
                    self.process_example(example)
                except Exception as e:
                    # Best-effort loader: a bad line is logged, not fatal.
                    self.logger.info(str(e))
                    if 'ip_address' in common_data:
                        self.logger.info(
                            'unknown error with example {0}'
                            .format(common_data['ip_address']))
        self.logger.info('Chunk {0} processed'.format(self.file))
        # clean
        os.remove(self.file)

    def process_example(self, example):
        """Merge one example into every interval configured on the project."""
        try:
            for interval in self.project.interval:
                current = copy.copy(example)
                current.reduce_interval(interval)
                key, meta = self.session.exists(current)
                if meta is None:
                    # First sighting for this interval: plain insert.
                    self.session.add(current)
                else:
                    # Merge with the stored example before re-saving.
                    db_example = self.session.query(Example).get(
                        current.__key__)
                    current = current + db_example
                    self.session.add(current)
        except Exception as e:
            self.logger.error(
                "Exception in process_record {0}"
                .format(e))
session.aerospike.index_string_create(settings['aerospike']['namespace'], 'project', 'full_name', 'idx_full_name') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'project', 'user', 'idx_projectuser') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'example', 'p_interval', 'idx_interval') session.aerospike.index_string_create(settings['aerospike']['namespace'], 'example', 'ip_address', 'idx_ip') print("ok") print("Creating user..."), user = User({'name': 'jmpeso', 'email': '*****@*****.**'}) user.password = user.make_password('1234') session.add(user) print('ok (auth token for {0} is {1})'.format(user.name, user.private_token)) assert user.check_password('1234') print("Creating project..."), project = Project({ 'user': '******', 'name': 'my_project', 'logformat': 'ApacheCombinedFormat', 'logdir': '/opt/data/',
class SimpleStats(GenericTask):
    """Task that computes summary statistics for a project's examples.

    For every numeric feature column the task stores min/max/mean/std/
    var/median plus a histogram and quartiles — one Feature record per
    feature/project, one bin per stat/interval.
    """

    allowed_kwargs = ['project', 'interval', 'logger']

    def _before(self, *args, **kwargs):
        self.init_t = time.time()
        self.session = AerospikeSession(**settings['aerospike'])

    def execute(self, *args, **kwargs):
        """Compute simple statistics over all examples of project/interval.

        All operations run over the entire matrix across columns
        (features) except percentile and histogram, which are taken per
        feature column. Results are saved in the feature set: one record
        per feature/project, one bin per stat/interval.
        """
        project_interval = "{0}_{1}".format(self.project.full_name,
                                            self.interval)
        labels, X = self.create_ndarray(
            self.session.query(Example).filter_by(p_interval=project_interval))
        # Columns 0-2 are identifiers (period/interval/ip); stats start at 3.
        tmp_stats = {
            'max': np.amax(X[:, 3:], 0),
            'min': np.amin(X[:, 3:], 0),
            'mean': np.mean(X[:, 3:], 0),
            'std': np.std(X[:, 3:], 0),
            'var': np.var(X[:, 3:], 0),
            'median': np.median(X[:, 3:], 0),
        }

        # Reshape by feature: one Feature record per column.
        for feature_index, feature_name in enumerate(labels[3:]):
            if feature_name == 'grouptime':
                continue
            custom_data = {}
            xkey = feature_index + 3
            feature = Feature({
                'project': self.project.full_name,
                'name': feature_name
            })
            hist, bin_edges = np.histogram(X[:, xkey], bins=10)
            # Casts to list are needed by the persistence layer
            # (arrays are not supported).
            custom_data['histogram{0}'.format(self.interval)] = [
                list(hist), list(bin_edges)]
            custom_data['percentile{0}'.format(self.interval)] = list(
                np.percentile(X[:, xkey], [25, 50, 75]))
            for stat_name, stat_value in tmp_stats.items():
                name = '{0}{1}'.format(stat_name, self.interval)
                custom_data[name] = float(stat_value[feature_index])  # cast needed
            feature.add_custom_data(custom_data)
            self.session.add(feature)

    def create_ndarray(self, resultset):
        """Build (labels, X): column names plus a uint32 matrix holding one
        row per example from the aerospike resultset."""
        labels = self.labels(resultset.first())
        X = np.fromiter(self.data_generator(resultset),
                        np.uint32).reshape([-1, len(labels)])
        return (labels, X)

    def data_generator(self, examples):
        """Yield every numeric field of every example, in labels() order.

        All yielded values must share one dtype because the target numpy
        array is homogeneous; only int-valued custom fields are emitted.
        """
        for example in examples:
            yield example.period
            yield example.interval
            yield self.ip2int(example.ip_address)
            for name, value in example._custom_data.items():
                if isinstance(value, int):
                    yield value

    def labels(self, example):
        """Column names matching data_generator()'s order — any change here
        must be mirrored there, and vice versa."""
        labels = ['period', 'interval', 'ip_address']
        for name, value in example._custom_data.items():
            if isinstance(value, int):
                labels.append(name)
        return labels

    def ip2int(self, s):
        """Convert a dotted IPv4 address to a 32-bit integer."""
        return reduce(lambda a, b: a << 8 | b, map(int, s.split(".")))

    def int2ip(self, ip):
        """Convert a 32-bit integer to a dotted IPv4 address."""
        return ".".join(str(ip >> n & 0xFF) for n in (24, 16, 8, 0))