def connect_to_db(self):
    """
    This connection to the db will live for the lifetime of the
    bothound_tools instance and will be used to save data back to the db
    """
    self.session, self.engine = set_up_db(self.conf.__dict__)
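
# A minimal usage sketch for connect_to_db (hypothetical: assumes an
# instance, e.g. of the bothound_tools class mentioned in the docstring,
# whose `conf` attribute holds the database settings):
#
#   tools.connect_to_db()     # opens tools.session / tools.engine
#   ...                       # use tools.session to query and save data
#   tools.session.close()     # close when the instance is done
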
def delete_by(past_to_current_ids, db_conf):
    """
    Delete the old request_sets (those that refer to the same target-ip
    pairs as the current request_sets)
    :param list[tuple[int, int]] past_to_current_ids: pairs of
    (past_request_set_id, current_request_set_id)
    :param str db_conf: the serialized db_conf
    :return: True if everything went well, False otherwise
    :rtype: bool
    """
    import json
    from baskerville.db import set_up_db
    from baskerville.db.models import RequestSet

    db_conf = json.loads(db_conf)
    session, engine = set_up_db(db_conf, False)

    try:
        session.query(RequestSet).filter(
            RequestSet.id.in_([past_id for past_id, _ in past_to_current_ids])
        ).delete(synchronize_session=False)
        session.commit()
    except Exception:
        import traceback
        traceback.print_exc()
        session.rollback()  # do we want to rollback here?
        session.close()
        engine.dispose()
        return False
    session.close()
    engine.dispose()
    return True
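
# Usage sketch for delete_by (hypothetical ids and credentials; db_conf is
# passed as a JSON string so the whole call can be shipped to workers):
def _example_delete_by():
    import json
    db_conf = json.dumps({
        'name': 'baskerville', 'user': 'user', 'password': 'secret',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    })
    # pairs of (past_request_set_id, current_request_set_id)
    ok = delete_by([(101, 501), (102, 502)], db_conf)
    print('deleted' if ok else 'delete failed')
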
def add_model_to_database(database_config):
    """
    Load the test model and save it in the database
    :param dict[str, T] database_config:
    :return:
    """
    global logger
    path = os.path.join(
        get_default_data_path(), 'samples', 'models', 'AnomalyModel'
    )
    logger.info(f'Loading test model from: {path}')
    model = AnomalyModelSklearn()
    model.load(path=path)

    db_cfg = DatabaseConfig(database_config).validate()
    session, _ = set_up_db(db_cfg.__dict__, partition=False)

    db_model = Model()
    db_model.algorithm = (
        'baskerville.models.anomaly_model_sklearn.AnomalyModelSklearn'
    )
    db_model.created_at = datetime.now(tz=tzutc())
    db_model.parameters = json.dumps(model.get_params())
    db_model.classifier = bytearray(path.encode('utf8'))

    # save to db
    session.add(db_model)
    session.commit()
    session.close()
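
# Usage sketch for add_model_to_database (hypothetical connection details;
# assumes the sample AnomalyModel files exist under the default data path):
def _example_add_model_to_database():
    add_model_to_database({
        'name': 'baskerville', 'user': 'user', 'password': 'secret',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    })
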
def update_request_set_by_id(
        id, stop, features, prediction, subset_count, num_requests, db_conf
):
    """
    Updates a RequestSet by id
    :param int id: the request_set id
    :param datetime.datetime stop: current subset's stop datetime
    :param dict features: latest features
    :param float prediction: the current prediction
    :param int subset_count: the subset count
    :param int num_requests: the current subset's num_requests
    :param str db_conf: the database configuration serialized
    :return: True if everything went well, False otherwise
    :rtype: bool
    """
    import json
    import traceback
    from baskerville.db import set_up_db
    from baskerville.db.models import RequestSet

    db_conf = json.loads(db_conf)
    session, engine = set_up_db(db_conf, False)
    stop = stop.replace(tzinfo=tzutc())
    values = {
        'stop': stop,
        'features': features,
        'prediction': prediction,
        'subset_count': subset_count,
        'num_requests': num_requests,
    }
    try:
        # alternative considered, kept for reference:
        # with engine.connect() as conn:
        #     stmt = RequestSet.__table__.update(). \
        #         values(values).where(RequestSet.id == id)
        #     conn.execute(stmt)
        # session.bulk_update_mappings(RequestSet, [values])
        session.query(RequestSet).filter(RequestSet.id == id).update(values)
        session.flush()
        session.commit()
    except Exception:
        traceback.print_exc()
        session.rollback()
        session.close()
        engine.dispose()
        return False
    session.close()
    engine.dispose()
    return True
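
# Usage sketch for update_request_set_by_id (hypothetical values; note that
# `stop` is re-tagged as UTC inside the function before the update):
def _example_update_request_set_by_id():
    import json
    from datetime import datetime
    db_conf = json.dumps({
        'name': 'baskerville', 'user': 'user', 'password': 'secret',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    })
    ok = update_request_set_by_id(
        42, datetime.utcnow(), {'request_rate': 0.3}, 0.87, 3, 120, db_conf
    )
    print('updated' if ok else 'update failed')
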
def bulk_update_request_sets(id, stop, features, prediction, subset_count,
                             num_requests, total_seconds, db_conf):
    """
    Updates the RequestSets with the given ids. Each positional argument is
    a list and the i-th elements of all lists describe the same RequestSet.
    :param str db_conf: the database configuration serialized
    :return: True if everything went well, False otherwise
    :rtype: bool
    """
    import json
    import traceback
    from itertools import zip_longest
    from baskerville.db import set_up_db
    from baskerville.db.models import RequestSet

    db_conf = json.loads(db_conf)
    # https://stackoverflow.com/questions/13125236/sqlalchemy-psycopg2-and-postgresql-copy
    # https://www.endpoint.com/blog/2014/04/11/speeding-up-saving-millions-of-orm
    session, engine = set_up_db(db_conf, False)
    req_per_row = list(zip_longest(
        id, stop, features, prediction, subset_count, num_requests,
        total_seconds
    ))
    try:
        for request_set in req_per_row:
            values = {
                'stop': request_set[1],
                'features': request_set[2],
                'prediction': request_set[3],
                'subset_count': request_set[4],
                'num_requests': request_set[5],
                'total_seconds': request_set[6],
            }
            session.query(RequestSet).filter(
                RequestSet.id == request_set[0]
            ).update(values)
            session.flush()
        session.commit()
    except Exception:
        traceback.print_exc()
        session.rollback()
        session.close()
        engine.dispose()
        return False
    session.close()
    engine.dispose()
    return True
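
# Usage sketch for bulk_update_request_sets (hypothetical values; each
# argument is a column of equal length, zipped back into rows internally):
def _example_bulk_update_request_sets():
    import json
    from datetime import datetime
    db_conf = json.dumps({
        'name': 'baskerville', 'user': 'user', 'password': 'secret',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    })
    now = datetime.utcnow()
    ok = bulk_update_request_sets(
        [42, 43],                                         # request_set ids
        [now, now],                                       # stop datetimes
        [{'request_rate': 0.3}, {'request_rate': 0.9}],   # features
        [0.1, 0.8],                                       # predictions
        [3, 5],                                           # subset counts
        [120, 640],                                       # num_requests
        [180.0, 300.0],                                   # total_seconds
        db_conf,
    )
    print('bulk update ok' if ok else 'bulk update failed')
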
def cross_reference_misp(ip, db_conf):
    """
    Checks whether the given ip is a known (MISP) Attribute in the database
    and, if so, returns the malicious label and the attribute id.
    :param str ip: the ip to look up
    :param str db_conf: the database configuration serialized
    :return: a (label, id_attribute) tuple, both None if the ip is unknown
    """
    import json
    from baskerville.db import set_up_db
    from baskerville.db.models import Attribute
    # note: unlike the other local dependencies, LabelEnum was not imported
    # here; assuming it lives in baskerville.util.enums, import it locally
    # so the function is self-contained when shipped to workers
    from baskerville.util.enums import LabelEnum

    db_conf = json.loads(db_conf)
    session, engine = set_up_db(db_conf, False)

    attribute = session.query(Attribute).filter(Attribute.value == ip).first()
    label = None
    id_attribute = None
    if attribute:
        label = LabelEnum.malicious.value
        id_attribute = attribute.id

    session.close()
    engine.dispose()
    return label, id_attribute
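
# Usage sketch for cross_reference_misp (hypothetical ip and credentials;
# returns (None, None) when the ip is not a known MISP attribute):
def _example_cross_reference_misp():
    import json
    db_conf = json.dumps({
        'name': 'baskerville', 'user': 'user', 'password': 'secret',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    })
    label, id_attribute = cross_reference_misp('203.0.113.7', db_conf)
    if label is not None:
        print(f'ip flagged malicious, attribute id: {id_attribute}')
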
def maintain_db():
    """
    Runs the partitioning and archive scripts
    :return:
    """
    # todo: this can fail silently
    baskerville_root = os.environ.get(
        'BASKERVILLE_ROOT', '../../../../baskerville'
    )
    # we need the current config for the database details
    config = parse_config(path=f'{baskerville_root}/conf/baskerville.yaml')
    logger = get_logger(
        __name__,
        logging_level=config['engine']['log_level'],
        output_file=config['engine']['logpath']
    )
    db_config = DatabaseConfig(config['database']).validate()

    if db_config.maintenance.partition_by != 'week':
        raise NotImplementedError(
            f'Partition by {db_config.maintenance.partition_by} '
            f'is not yet implemented'
        )

    # maintenance runs every Sunday, so now should be Sunday night;
    # move to the start of Monday
    now = datetime.utcnow()
    y, w, _ = now.isocalendar()
    partition_start_week = isoweek.Week(y, w + 1)
    start = datetime.combine(
        partition_start_week.monday(), datetime.min.time()
    )
    end = datetime.combine(
        partition_start_week.sunday(), datetime.max.time()
    )
    logger.info(f'Data Partition Start: {start}')

    diy = get_days_in_year(end.year)
    latest_archive_date = end - timedelta(days=diy)
    latest_archive_year, latest_archive_week, _ = \
        latest_archive_date.isocalendar()
    logger.debug(f'{latest_archive_week}, {latest_archive_year}')
    if latest_archive_week > 1:
        latest_archive_week = latest_archive_week - 1
    else:
        latest_archive_week = isoweek.Week.last_week_of_year(
            latest_archive_year - 1
        ).week
        latest_archive_year = latest_archive_year - 1

    week = isoweek.Week(latest_archive_year, latest_archive_week)
    logger.debug(week)

    db_config.maintenance.data_partition.since = start
    db_config.maintenance.data_partition.until = (
        start + timedelta(days=6)
    ).replace(hour=23, minute=59, second=59)
    db_config.maintenance.data_archive.since = datetime.combine(
        week.monday(), datetime.min.time()
    )
    db_config.maintenance.data_archive.until = datetime.combine(
        week.sunday(), datetime.max.time()
    )
    logger.debug(db_config.maintenance.data_partition)
    logger.debug(db_config.maintenance.data_archive)

    # get sql scripts
    partition_sql = get_temporal_partitions(db_config.maintenance)
    archive_sql = get_archive_script(
        latest_archive_date - timedelta(weeks=1), latest_archive_date
    )
    logger.debug(partition_sql)
    logger.debug(archive_sql)

    session, engine = set_up_db(db_config.__dict__, create=False)
    try:
        # create partitions
        session.execute(partition_sql)
        session.commit()
        logger.info('Partitioning done')
        # detach partitions over a year old and attach them to the
        # archive table
        session.execute(archive_sql)
        session.commit()
        logger.info('Archive done')
    except SQLAlchemyError as e:
        traceback.print_exc()
        session.rollback()
        logger.error(f'Error executing maintenance: {e}')
    finally:
        session.close()
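
# Usage sketch for maintain_db (assumes BASKERVILLE_ROOT points at a checkout
# containing conf/baskerville.yaml; typically scheduled for Sunday night,
# e.g. from cron, since the partition window starts the following Monday):
def _example_maintain_db():
    import os
    os.environ['BASKERVILLE_ROOT'] = '/opt/baskerville'  # hypothetical path
    maintain_db()
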
def model_transfer(input_db_cfg, output_db_cfg, model_id=None,
                   truncate_out_table=False):
    """
    Transfers models between dbs. The databases can be located on the same
    server or on different servers; use ssh tunneling for better performance
    in the latter case.
    :param dict[str, T] input_db_cfg: the database configuration for the db
    to get data from, e.g.
    input_db_cfg = {
        'name': 'baskerville_training',
        'user': '******',
        'password': '******',
        'host': '127.0.0.1',
        'port': 5432,
        'type': 'postgres',
    }
    :param dict[str, T] output_db_cfg: the database configuration for the db
    to store data to, e.g.
    output_db_cfg = {
        'name': 'baskerville_production',
        'user': '******',
        'password': '******',
        'host': '127.0.0.1',
        'port': 5432,
        'type': 'postgres',
    }
    :param int model_id: specify an id to get only one model
    :param bool truncate_out_table: truncate the models table on the target
    database and restart its identity (True) or leave it as is (False)
    :return: None
    """
    from baskerville.models.config import DatabaseConfig
    from baskerville.db import set_up_db
    from baskerville.db.models import Model

    in_db_cfg = DatabaseConfig(input_db_cfg)
    out_db_cfg = DatabaseConfig(output_db_cfg)
    in_session, _ = set_up_db(in_db_cfg.__dict__)
    out_session, _ = set_up_db(out_db_cfg.__dict__)

    if truncate_out_table:
        if input('Are you sure you want to truncate table data? (y/n)') in [
            'Y', 'y'
        ]:
            print('Truncating data...')
            out_session.execute(
                f'TRUNCATE TABLE {Model.__tablename__} RESTART IDENTITY'
            )
            out_session.commit()
        else:
            print('Skipping data truncation...')

    models_in = in_session.query(Model)
    if model_id:
        models_in = models_in.filter_by(id=model_id)
    models_in = models_in.order_by(asc(Model.id)).all()
    print(f'{len(models_in)} models in in_db: {in_db_cfg.name}')

    try:
        for model in models_in:
            print(f'Getting model with id: {model.id}')
            model_out = Model()
            model_out.scaler = model.scaler
            model_out.classifier = model.classifier
            model_out.features = model.features
            model_out.algorithm = model.algorithm
            model_out.analysis_notebook = model.analysis_notebook
            model_out.created_at = model.created_at
            model_out.f1_score = model.f1_score
            model_out.n_training = model.n_training
            model_out.n_testing = model.n_testing
            model_out.notes = model.notes
            model_out.parameters = model.parameters
            model_out.precision = model.precision
            model_out.recall = model.recall
            model_out.request_sets = model.request_sets
            out_session.add(model_out)
            out_session.commit()
    except Exception:
        traceback.print_exc()
        out_session.rollback()
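
# Usage sketch for model_transfer (hypothetical credentials; copies model 7
# from the training db into the production db without truncating the target):
def _example_model_transfer():
    base = {
        'user': 'user', 'password': 'secret',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    }
    model_transfer(
        dict(base, name='baskerville_training'),
        dict(base, name='baskerville_production'),
        model_id=7,
        truncate_out_table=False,
    )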