def get_top_N_connections_for_entity(interval_label, last_used_ts, max_ts, time_between, direction, entity_id, entity_ip):
    """Return the top-N (src, dst) address pairs by traffic for one entity.

    Traffic (in_bytes) is summed over flows in the half-open interval
    (last_used_ts, max_ts] for the given direction, and emitted as
    bytes-per-second values under per-connection output paths.
    """
    with get_db_cursor() as c:
        c.execute(f"""
            SELECT f.ipv4_src_addr, f.ipv4_dst_addr, sum(f.in_bytes) "traffic"
            FROM {DB_PREFIX}flows "f"
            WHERE f.client_ip = %s AND f.ts > %s AND f.ts <= %s AND f.direction = %s
            GROUP BY f.ipv4_src_addr, f.ipv4_dst_addr
            ORDER BY traffic desc
            LIMIT {TOP_N_MAX};
        """, (entity_ip, last_used_ts, max_ts, direction,))

        path_prefix = NetFlowBot.construct_output_path_prefix(interval_label, direction, entity_id, interface=None)
        results = []
        for src_addr, dst_addr, traffic_bytes in c.fetchall():
            results.append({
                'p': f"{path_prefix}.topconn.{path_part_encode(src_addr)}.{path_part_encode(dst_addr)}",
                'v': traffic_bytes / time_between,  # Bps
            })
        return results
def get_top_N_protocols_for_entity(interval_label, last_used_ts, max_ts, time_between, direction, entity_id, entity_ip):
    """Return the top-N protocols by traffic for one entity.

    Traffic (in_bytes) is summed over flows in the half-open interval
    (last_used_ts, max_ts] for the given direction, and emitted as
    bytes-per-second values under per-protocol output paths.
    """
    with get_db_cursor() as c:
        values = []
        c.execute(f"""
            SELECT f.protocol, sum(f.in_bytes) "traffic"
            FROM {DB_PREFIX}flows "f"
            WHERE f.client_ip = %s AND f.ts > %s AND f.ts <= %s AND f.direction = %s
            GROUP BY f.protocol
            ORDER BY traffic desc
            LIMIT {TOP_N_MAX};
        """, (entity_ip, last_used_ts, max_ts, direction,))

        output_path_entity = NetFlowBot.construct_output_path_prefix(interval_label, direction, entity_id, interface=None)
        for protocol, traffic_bytes in c.fetchall():
            # single dict lookup instead of `x in d` followed by `d[x]`;
            # unknown protocol numbers fall back to a synthetic label
            protocol_label = PROTOCOLS.get(protocol, f"proto{protocol}")
            output_path = f"{output_path_entity}.topprotocol.{path_part_encode(protocol_label)}"
            values.append({
                'p': output_path,
                'v': traffic_bytes / time_between,  # Bps
            })
        return values
def _get_current_max_ts():
    """Return the newest flow timestamp in the flows2 table, or None if empty."""
    with get_db_cursor() as c:
        c.execute(f"SELECT MAX(ts) FROM {DB_PREFIX}flows2;")
        rec = c.fetchone()
        # MAX() over an empty table yields a row containing NULL, which
        # unpacks to None below; a missing row is handled explicitly too.
        if rec is None:
            return None
        (max_ts,) = rec
        return max_ts
def _get_last_used_ts(job_id):
    """Return the last processed timestamp (int) for `job_id`, or None if unknown.

    Returns None both when no row exists for the job and when the stored
    value is SQL NULL — previously a NULL value would crash `int(None)`
    with a TypeError.
    """
    with get_db_cursor() as c:
        c.execute(f'SELECT j.last_used_ts FROM {DB_PREFIX}bot_jobs j WHERE j.job_id = %s;', (job_id,))
        rec = c.fetchone()
        if rec is None:
            return None
        last_used_ts, = rec
        if last_used_ts is None:
            # row exists but the bookmark column is NULL (e.g. freshly created job)
            return None
        return int(last_used_ts)
def get_top_N_IPs_for_entity_interfaces(interval_label, last_used_ts, max_ts, time_between, direction, entity_id, entity_ip):
    """Return the top-N IPs by traffic, per interface, for one entity.

    For every interface seen in the interval (last_used_ts, max_ts] the
    top-N source IPs (ingress) or destination IPs (egress) are collected,
    emitted as bytes-per-second values under per-interface output paths.
    """
    # direction determines both which SNMP interface column and which
    # address column we aggregate on — compute them once instead of
    # repeating the conditional inside every f-string:
    is_ingress = direction == DIRECTION_INGRESS
    if_column = 'input_snmp' if is_ingress else 'output_snmp'
    ip_column = 'ipv4_src_addr' if is_ingress else 'ipv4_dst_addr'

    with get_db_cursor() as c, get_db_cursor() as c2:
        values = []
        c.execute(f"""
            SELECT distinct(f.{if_column}) "interface_index"
            FROM {DB_PREFIX}flows "f"
            WHERE f.client_ip = %s AND f.ts > %s AND f.ts <= %s AND f.direction = %s
        """, (entity_ip, last_used_ts, max_ts, direction,))

        for interface_index, in c.fetchall():
            c2.execute(f"""
                SELECT f.{ip_column}, sum(f.in_bytes) "traffic"
                FROM {DB_PREFIX}flows "f"
                WHERE f.client_ip = %s AND f.ts > %s AND f.ts <= %s AND f.direction = %s AND f.{if_column} = %s
                GROUP BY f.{ip_column}
                ORDER BY traffic desc
                LIMIT {TOP_N_MAX};
            """, (entity_ip, last_used_ts, max_ts, direction, interface_index,))

            path_prefix = NetFlowBot.construct_output_path_prefix(interval_label, direction, entity_id, interface=interface_index)
            for top_ip, traffic_bytes in c2.fetchall():
                values.append({
                    'p': f"{path_prefix}.topip.{path_part_encode(top_ip)}",
                    'v': traffic_bytes / time_between,  # Bps
                })
        return values
def _pgwriter_finish(pgwriter):
    """Finalize a buffered binary-COPY stream and load it into the flows2 table.

    Best-effort: on a DB connection failure the buffered records are
    dropped and an error is logged (the caller is not interrupted).
    """
    try:
        with get_db_cursor() as c:
            # -1 as int16 is the file trailer that terminates PostgreSQL's
            # binary COPY format
            pgwriter.write(struct.pack('!h', -1))
            pgwriter.seek(0)
            c.copy_expert(f"COPY {DB_PREFIX}flows2 FROM STDIN WITH BINARY", pgwriter)
    except DBConnectionError:
        log.error("Error writing to DB, records lost!")
def job_maint_remove_old_data(*args, **kwargs):
    """Maintenance job: drop TimescaleDB chunks older than LEAVE_N_PAST_DAYS days.

    DB connection problems are logged and swallowed so the scheduler keeps
    running.
    """
    log.info("MAINT: Maintenance started - removing old data")
    try:
        with get_db_cursor() as c:
            sql = f"SELECT drop_chunks(INTERVAL '{LEAVE_N_PAST_DAYS} days', '{DB_PREFIX}flows2');"
            c.execute(sql)
        log.info("MAINT: Maintenance finished (removing old data).")
    except DBConnectionError:
        log.error("MAINT: Maintenance job (removing old data) failed due to DB connection issues.")
def ensure_flow_table_partition_exists(partition_no):
    """Create (if missing) the flows partition covering `partition_no` and return `partition_no`.

    Each partition covers S_PER_PARTITION seconds; the range is
    [partition_no * S_PER_PARTITION, (partition_no + 1) * S_PER_PARTITION).
    """
    ts_start = partition_no * S_PER_PARTITION
    ts_end = ts_start + S_PER_PARTITION
    with get_db_cursor() as c:
        # "When creating a range partition, the lower bound specified with FROM is an inclusive bound, whereas
        # the upper bound specified with TO is an exclusive bound."
        # PARTITION OF: "Any indexes, constraints and user-defined row-level triggers that exist in the parent
        # table are cloned on the new partition."
        # https://www.postgresql.org/docs/12/sql-createtable.html
        sql = (
            f"CREATE UNLOGGED TABLE IF NOT EXISTS {DB_PREFIX}flows_{partition_no} "
            f"PARTITION OF {DB_PREFIX}flows FOR VALUES FROM ({ts_start}) TO ({ts_end})"
        )
        c.execute(sql)
    return partition_no
def get_traffic_for_entity(interval_label, last_used_ts, max_ts, time_between, direction, entity_id, entity_ip):
    """Return (values, sum_traffic): per-interface traffic values plus an entity-wide total.

    `values` holds one {'p': path, 'v': Bps} record per interface and a final
    record for the whole entity; `sum_traffic` is the raw byte total.
    Negative per-interface sums (should be impossible) are logged and skipped.
    """
    with get_db_cursor() as c:
        snmp_column = 'input_snmp' if direction == DIRECTION_INGRESS else 'output_snmp'
        sql = f"""
            SELECT f.{snmp_column}, sum(f.in_bytes)
            FROM {DB_PREFIX}flows2 "f"
            WHERE f.client_ip = %s AND f.ts > %s AND f.ts <= %s AND f.direction = %s
            GROUP BY f.{snmp_column}
        """
        c.execute(sql, (entity_ip, last_used_ts, max_ts, direction))

        values = []
        sum_traffic = 0
        for if_index, traffic_bytes in c.fetchall():
            rate = traffic_bytes / time_between
            if rate < 0:
                # invalid condition - to find the cause we need some additional logging:
                log.error(
                    "Sum of positive numbers should never be negative! "
                    + f"{rate} {traffic_bytes} {time_between} {sql} {entity_ip} {last_used_ts} {max_ts} {direction}"
                )
                continue  # this is never ok - skip this value
            per_if_path = NetFlowBot.construct_output_path_prefix(
                interval_label, direction, entity_id, interface=if_index)
            values.append({'p': per_if_path, 'v': rate})
            sum_traffic += traffic_bytes

        entity_path = NetFlowBot.construct_output_path_prefix(
            interval_label, direction, entity_id, interface=None)
        values.append({'p': entity_path, 'v': sum_traffic / time_between})
        return values, sum_traffic
def job_maint_suggest_entities(*args, **job_params):
    """Maintenance job: register any NetFlow exporters unknown to the backend.

    For every account the bot token can access, compares the exporters
    recorded in the DB against the account's "device" entities and POSTs a
    new entity for each exporter IP the account doesn't know yet.
    Raises Exception on any failed backend request.
    """
    log.info("MAINT: Maintenance started - making suggestions for device entities")
    backend_url = job_params['backend_url']
    bot_token = job_params['bot_token']
    requests_session = requests.Session()

    # for each account, add any new netflow exporters (entities) that might not exist yet:
    # find all the accounts we have access to:
    r = requests_session.get(f'{backend_url}/accounts/?b={bot_token}')
    if r.status_code != 200:
        raise Exception(f"Invalid bot token or network error, got status {r.status_code} while retrieving {backend_url}/accounts")
    accounts_ids = [account["id"] for account in r.json()["list"]]

    # find all entities for each of the accounts:
    for account_id in accounts_ids:
        r = requests_session.get(f'{backend_url}/accounts/{account_id}/entities/?b={bot_token}')
        if r.status_code != 200:
            raise Exception(f"Network error, got status {r.status_code} while retrieving {backend_url}/accounts/{account_id}/entities")
        entities_ips = [
            entity["details"]["ipv4"]
            for entity in r.json()["list"]
            if entity["entity_type"] == "device"
        ]

        with get_db_cursor() as c:
            # Ideally, we would just run "select distinct(client_ip) from netflow_flows;", but unfortunately
            # I was unable to find a performant way to run this query. So we are using netflow_exporters:
            c.execute(f"SELECT ip FROM {DB_PREFIX}exporters;")
            for client_ip, in c.fetchall():
                if client_ip in entities_ips:
                    log.info(f"MAINT: We already know exporter [{client_ip}]")
                    continue
                log.info(f"MAINT: Unknown exporter found, inserting [{client_ip}] to account [{account_id}]")
                url = f'{backend_url}/accounts/{account_id}/entities/?b={bot_token}'
                params = {
                    "name": f'{client_ip} (NetFlow exporter)',
                    "entity_type": "device",
                    "details": {
                        "ipv4": client_ip,
                    },
                }
                r = requests_session.post(url, json=params)
                if r.status_code > 299:
                    raise Exception(f"Network error, got status {r.status_code} while posting to {backend_url}/accounts/{account_id}/entities: {r.content}")

    log.info("MAINT: Maintenance finished (device entities suggestions).")
def job_maint_remove_old_partitions(*args, **kwargs):
    """Maintenance job: drop flows partitions older than LEAVE_N_PAST_PARTITIONS.

    Partition tables are named {DB_PREFIX}flows_<seq>; anything whose seq is
    older than today's minus LEAVE_N_PAST_PARTITIONS is dropped. Future-dated
    or oddly-named tables are skipped with a warning.
    """
    with get_db_cursor() as c:
        log.info("MAINT: Maintenance started - removing old partitions")
        today_seq = int(time.time() // S_PER_PARTITION)
        # NOTE: '_' is a single-char wildcard in LIKE, so this pattern can
        # over-match; the regex check below filters out any false positives.
        c.execute(f"SELECT tablename FROM pg_tables WHERE schemaname = 'public' AND tablename LIKE '{DB_PREFIX}flows_%';")
        for tablename, in c.fetchall():
            m = re.match(f'^{DB_PREFIX}flows_([0-9]+)$', tablename)
            if not m:
                log.warning(f"MAINT: Table {tablename} does not match regex, skipping")
                continue

            day_seq = int(m.group(1))
            if day_seq > today_seq:
                log.warning(f"MAINT: CAREFUL! Table {tablename} marks a future day (today is {today_seq}); this should never happen! Skipping.")
            elif day_seq < today_seq - LEAVE_N_PAST_PARTITIONS:
                log.info(f"MAINT: Removing old data: {tablename} (today is {today_seq})")
                c.execute(f"DROP TABLE {tablename};")
            else:
                log.info(f"MAINT: Leaving {tablename} (today is {today_seq})")
        log.info("MAINT: Maintenance finished (removing old partitions).")
def _save_current_max_ts(job_id, max_ts):
    """Upsert the bookmark timestamp for `job_id` into the bot_jobs2 table."""
    sql = (
        f"INSERT INTO {DB_PREFIX}bot_jobs2 (job_id, last_used_ts) VALUES (%s, %s) "
        f"ON CONFLICT (job_id) DO UPDATE SET last_used_ts = %s;"
    )
    with get_db_cursor() as c:
        c.execute(sql, (job_id, max_ts, max_ts))
def _pgwriter_finish(pgwriter):
    """Finalize a buffered binary-COPY stream and load it into the flows table.

    Best-effort, consistent with the flows2 variant of this function: a DB
    connection failure is logged (records are lost) instead of propagating
    out of what is a background flush.
    NOTE(review): this duplicates the name of the flows2 variant above —
    presumably these belong to different modules/versions; confirm.
    """
    try:
        with get_db_cursor() as c:
            # -1 as int16 is the file trailer that terminates PostgreSQL's
            # binary COPY format
            pgwriter.write(struct.pack('!h', -1))
            pgwriter.seek(0)
            c.copy_expert(f"COPY {DB_PREFIX}flows FROM STDIN WITH BINARY", pgwriter)
    except DBConnectionError:
        log.error("Error writing to DB, records lost!")
def ensure_exporter(client_ip):
    """Record `client_ip` in the exporters table, ignoring duplicates."""
    sql = f"INSERT INTO {DB_PREFIX}exporters (ip) VALUES (%s) ON CONFLICT DO NOTHING;"
    with get_db_cursor() as c:
        c.execute(sql, (client_ip,))