def test_get_whois_df(self):
    """Check get_whois_df output with default and with custom column names."""
    expected_rows = len(self.input_df)

    # Default call: one output row per input row, ASN in "AsnDescription".
    default_df = get_whois_df(data=self.input_df, ip_column="AllExtIPs")
    self.assertEqual(len(default_df), expected_rows)
    self.assertIn("AsnDescription", default_df.columns)

    # Caller-supplied names for the ASN and WhoIs output columns.
    custom_cols = ("asn", "whois")
    custom_df = get_whois_df(
        data=self.input_df,
        ip_column="AllExtIPs",
        asn_col="asn",
        whois_col="whois",
    )
    self.assertEqual(len(custom_df), expected_rows)
    for col_name in custom_cols:
        self.assertIn(col_name, custom_df.columns)

    # Every row is expected to have a non-null value in both custom columns.
    for col_name in custom_cols:
        populated = custom_df[custom_df[col_name].notna()]
        self.assertEqual(len(populated), expected_rows)
def get_geoip_whois(geo_lookup, data: pd.DataFrame, ip_col: str):
    """
    Add GeoIP and WhoIs information to a dataframe of IP addresses.

    The input is first passed through `_normalize_ip4`, then looked up
    with the GeoIP provider, left-joined back onto the input rows and
    finally handed to `get_whois_df`.

    Parameters
    ----------
    geo_lookup : GeoIpLookup
        GeoIP provider; must expose `lookup_ips(data, column=...)`
    data : pd.DataFrame
        Input data frame
    ip_col : str
        Name of the IP address column in `data`

    Returns
    -------
    pd.DataFrame
        Results dataframe with GeoIP and WhoIs data
        (WhoIs output is requested in the "Whois_data" column)

    """
    data = _normalize_ip4(data, ip_col)

    # Geolocation: join provider results onto the input rows by IP.
    nb_markdown(f"Querying geolocation for {len(data)} ip addresses...")
    geo_results = geo_lookup.lookup_ips(data, column=ip_col)
    merged_df = data.merge(
        geo_results,
        left_on=ip_col,
        right_on="IpAddress",
        how="left",
    )

    # WhoIs: keyed on the "IpAddress" column from the GeoIP results.
    # NOTE(review): for rows without a geo match this column may be null
    # after the left join - confirm get_whois_df tolerates that.
    nb_markdown(f"Querying WhoIs for {len(data)} ip addresses...")
    return get_whois_df(
        data=merged_df,
        ip_column="IpAddress",
        whois_col="Whois_data",
    )
def _get_flow_summary(flow_index):
    """
    Summarize allowed network flows per source/destination ASN pair.

    Parameters
    ----------
    flow_index : pd.DataFrame
        Flow data containing at least the columns "source", "dest",
        "L7Protocol", "FlowDirection" and "TotalAllowedFlows".

    Returns
    -------
    pd.DataFrame
        One row per ("DestASN", "SourceASN") pair with the columns
        "TotalAllowedFlows" (sum), "L7Protocols", "source_ips" and
        "dest_ips" (lists of the distinct values in each group).

    """

    def _unique_values(series):
        """Return the distinct values of a series as a plain list."""
        return series.unique().tolist()

    key_cols = ["source", "dest", "L7Protocol", "FlowDirection"]
    flows_df = (
        flow_index[[*key_cols, "TotalAllowedFlows"]]
        .groupby(key_cols)
        .sum()
        .reset_index()
    )

    # Count each address once, even when it occurs as both a source and a
    # destination (summing the two per-column counts would double-count it).
    num_ips = len(set(flows_df["source"]).union(flows_df["dest"]))
    nb_markdown(f"Found {num_ips} unique IP Addresses.")
    nb_data_wait("Whois")

    # Look up WhoIs/ASN details for the destination, then the source column.
    flows_df = get_whois_df(
        flows_df,
        ip_column="dest",
        asn_col="DestASN",
        whois_col="DestASNFull",
        show_progress=True,
    )
    flows_df = get_whois_df(
        flows_df,
        ip_column="source",
        asn_col="SourceASN",
        whois_col="SourceASNFull",
        show_progress=True,
    )

    return (
        flows_df.groupby(["DestASN", "SourceASN"])
        .agg(
            TotalAllowedFlows=pd.NamedAgg(column="TotalAllowedFlows", aggfunc="sum"),
            L7Protocols=pd.NamedAgg(column="L7Protocol", aggfunc=_unique_values),
            source_ips=pd.NamedAgg(column="source", aggfunc=_unique_values),
            dest_ips=pd.NamedAgg(column="dest", aggfunc=_unique_values),
        )
        .reset_index()
    )