def map_tcp_stream(rawdf: pd.DataFrame, main: TcpConnection) -> List[TcpMapping]:
    """
    Returns:
        A list of mappings (tcpconnection, score) sorted by decreasing score,
        the first one being the most probable match.
    """
    results = []
    for tcpstream in rawdf["tcpstream"].unique():
        other = TcpConnection.build_from_dataframe(rawdf, tcpstream)
        score = main.score(other)
        if score > float('-inf'):
            mapping = TcpMapping(other, score)
            results.append(mapping)

    # sort by decreasing score
    results.sort(key=lambda x: x[1], reverse=True)
    return results
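# For reference, a minimal sketch of how TcpMapping and map_tcp_stream could
# fit together. The namedtuple fields and the usage below are assumptions for
# illustration, not necessarily the project's actual definitions.
from typing import NamedTuple

class TcpMappingSketch(NamedTuple):   # hypothetical stand-in for TcpMapping
    mapped: "TcpConnection"           # candidate connection in the other pcap
    score: float                      # similarity score, higher is better

# usage sketch: find the best counterpart of stream 0 from pcap1 inside pcap2
# main_con = TcpConnection.build_from_dataframe(df1, 0)
# mappings = map_tcp_stream(df2, main_con)
# best = mappings[0] if mappings else None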
def do_list_tcp_connections(self, *args):
    """
    List tcp connections via their ids (tcp.stream)
    """
    streams = self.data.groupby("tcpstream")
    self.poutput('%d tcp connection(s)' % len(streams))
    for tcpstream, group in streams:
        con = TcpConnection.build_from_dataframe(self.data, tcpstream)
        self.poutput(con)
        self.poutput("\n")
def tcpdest_from_connections(df, con: TcpConnection) -> pd.DataFrame:
    for dest in ConnectionRoles:
        log.debug("Looking at destination %s", dest)
        q = con.generate_direction_query(dest)
        df_dest = df.query(q)
        df.loc[df_dest.index, 'tcpdest'] = dest

    # TODO assert that every packet of the connection has tcpdest set
    return df
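# Usage sketch, assuming generate_direction_query() returns a pandas query()
# string that selects the packets flowing towards the given role:
#
#   con = TcpConnection.build_from_dataframe(df, 0)
#   df = tcpdest_from_connections(df, con)
#   # every packet of the stream should now carry a 'tcpdest' role
#   assert df.loc[df.tcpstream == 0, 'tcpdest'].notnull().all()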
def do_tocsv(self, args):
    """
    Selects tcp/mptcp/udp connection and exports it to csv
    """
    df = self.data
    # TODO let the parser handle tcpstream/mptcpstream/destination filtering

    # need to compute the destinations before dropping syn packets from the dataframe
    for streamid, subdf in df.groupby("tcpstream"):
        con = TcpConnection.build_from_dataframe(df, streamid)
        df = mpdata.tcpdest_from_connections(df, con)

        if args.drop_syn:
            # TODO use subdf? dropping the 3 first packets of each connection
            # should rather be expressed as a flag-based filter
            self.poutput("drop-syn unsupported yet")
            df.drop(subdf.head(3).index, inplace=True)

    # this should be a filter
    syns = df[df.tcpflags == mp.TcpFlags.SYN]

    # TODO we should also filter on destination / tcpstream
    self.poutput("Writing to %s" % args.output)
    pandas_to_csv(df, args.output)
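# Since drop-syn is still unsupported above, here is a minimal self-contained
# sketch of one way to drop the three handshake packets of every connection in
# a single pass. This is an illustration, not the project's implementation.
import pandas as pd

def drop_handshake(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the first 3 packets (SYN, SYN/ACK, ACK) of each tcp stream."""
    handshake_idx = df.groupby("tcpstream").head(3).index
    return df.drop(handshake_idx)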
def do_map_tcp_connection(self, args):
    df1 = load_into_pandas(args.pcap1, self.tshark_config)
    df2 = load_into_pandas(args.pcap2, self.tshark_config)

    main_connection = TcpConnection.build_from_dataframe(df1, args.tcpstreamid)
    mappings = map_tcp_stream(df2, main_connection)

    self.poutput("Trying to map %s" % (main_connection,))
    self.poutput("%d mapping(s) found" % len(mappings))

    for match in mappings:
        self.poutput(str(match))
def connection(self, streamid) -> TcpConnection:
    return TcpConnection.build_from_dataframe(self._obj, streamid)
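# The self._obj attribute suggests this method lives in a registered pandas
# dataframe accessor. A minimal sketch of that pattern (the class name below
# is hypothetical):
import pandas as pd

@pd.api.extensions.register_dataframe_accessor("tcp")
class TcpAccessorSketch:
    def __init__(self, pandas_obj: pd.DataFrame):
        self._obj = pandas_obj

    def connection(self, streamid) -> "TcpConnection":
        return TcpConnection.build_from_dataframe(self._obj, streamid)

# usage: con = df.tcp.connection(0)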
def filter_dataframe(
    self,
    rawdf,
    # TODO choose a prefix
    merged_one,
    tcpstream=None,
    mptcpstream=None,
    skipped_subflows=None,
    destinations: list = None,
    extra_query: str = None,
    **kwargs
):
    """
    Filters a single dataframe; call it several times to filter several
    dataframes. Feel free to inherit/override this method.

    Args:
        rawdf: Raw dataframe
        kwargs: expanded arguments returned by the parser
        destinations: Filters packets depending on their :enum:`.ConnectionRoles`
        tcpstream/mptcpstream: keep only the packets matching the stream id
        skipped_subflows: list of skipped subflows
        extra_query: Add some more filters to the pandas query

    This baseclass can filter on:

    - mptcpstream
    - destination (mptcpstream required)
    - skipped_subflows

    Returns:
        Filtered dataframe
    """
    log.debug("Preprocessing dataframe with extra args %s" % kwargs)
    queries = []
    log.debug("tcp.stream %s mptcp: %s" % (tcpstream, mptcpstream))
    stream = tcpstream if tcpstream is not None else mptcpstream
    dataframe = rawdf

    for skipped_subflow in (skipped_subflows or []):
        log.debug("Skipping subflow %d" % skipped_subflow)
        queries.append(" tcpstream!=%d " % skipped_subflow)

    if stream is not None:
        protocol = "mptcp" if mptcpstream is not None else "tcp"
        log.debug("Filtering %s stream #%d." % (protocol, stream))
        queries.append(protocol + "stream==%d" % stream)

        if protocol == "tcp":
            # generates the "tcpdest" column of the dataframe
            con2 = TcpConnection.build_from_dataframe(dataframe, stream)
            dataframe = tcpdest_from_connections(dataframe, con2)
            # trust plots to do the filtering
        else:
            # TODO do the same for mptcp destinations
            con = MpTcpConnection.build_from_dataframe(dataframe, stream)
            df = mptcpdest_from_connections(dataframe, con)
            # TODO generate mptcpdest
            if destinations is not None:
                raise Exception(
                    "destination filtering is not ready yet for mptcp")

    if extra_query:
        log.debug("Appending extra_query=%s" % extra_query)
        queries.append(extra_query)

    query = " and ".join(queries)

    # query() throws when given an empty query
    if len(query) > 0:
        log.info("Running query:\n%s\n" % query)
        dataframe.query(query, inplace=True)

    return dataframe
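# The query-composition pattern used above is worth isolating: fragments are
# collected in a list, joined with " and ", and applied once. A self-contained
# sketch with made-up data:
import pandas as pd

df = pd.DataFrame({
    "tcpstream": [0, 0, 1, 2],
    "tcplen":    [0, 100, 50, 0],
})

queries = []
queries.append("tcpstream!=2")   # e.g. a skipped subflow
queries.append("tcplen > 0")     # e.g. an extra_query

query = " and ".join(queries)
if query:                        # query() throws on an empty string
    df = df.query(query)

print(df)   # keeps only the stream 0/1 rows that carry payload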
def connection(self, streamid) -> TcpConnection:
    return TcpConnection.build_from_dataframe(self._obj, streamid)
def load_merged_streams_into_pandas(
    pcap1: str,
    pcap2: str,
    streamid1: int,  # Union[MpTcpStreamId, TcpStreamId]
    streamid2: int,
    mptcp: bool,
    tshark_config: TsharkConfig,
    clock_offset1: int = 0,
    clock_offset2: int = 0,
    mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
    **extra
):
    """
    Arguments:
        mptcp: True to merge mptcp streams, False for plain tcp
        mapping_mode: Only HASH works for now

    Returns:
        a dataframe with, among others, an 'owd' column
    """
    log.debug(
        "Asked to load merged tcp streams %d and %d from pcaps %s and %s"
        % (streamid1, streamid2, pcap1, pcap2))

    cache = mp.get_cache()
    protocolStr = "mptcp" if mptcp else "tcp"
    cacheid = cache.cacheuid(
        "merged",
        [getrealpath(pcap1), getrealpath(pcap2)],
        protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1, tshark_config, clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2, tshark_config, clock_offset=clock_offset2)

            main_connection = None   # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(df1, streamid1)
                other_connection = MpTcpConnection.build_from_dataframe(df2, streamid2)

                # TODO generate map_mptcp_connection()
                # for now we use known streams exclusively;
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))
            else:
                main_connection = TcpConnection.build_from_dataframe(df1, streamid1)
                other_connection = TcpConnection.build_from_dataframe(df2, streamid2)

                # for now we use known streams exclusively;
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            logging.info("Saving into %s" % cachename)

            # TODO export lists (e.g. reinjected_in) as proper strings
            merged_df.to_csv(
                cachename,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )
            # tcpdest is lost here: it becomes a plain 'object' dtype on save
            log.debug("saving with dtypes=%s", dict(merged_df.dtypes))
            # if log level >= DEBUG then save an xls copy too!
            # if True:
            #     filename = cachename + ".xls"
            #     logging.debug("Saved a debug excel copy at %s" % filename)
            #     merged_df.to_excel(filename)
        else:
            logging.info("Loading from cache %s" % cachename)

            def _gen_dtypes(fields) -> Dict[str, Any]:
                dtypes = {}  # type: ignore
                for _name in [_first, _second]:
                    # TODO this could be simplified
                    for k, v in fields.items():
                        if v is not None or k not in ["tcpflags"]:
                            dtypes.setdefault(_name(k), v)

                    # add generated field dtypes
                    dtypes.update({
                        _name(f.fullname): f.type
                        for f in per_pcap_artificial_fields.values()
                    })

                # these are overrides of the generated dtypes
                dtypes.update({
                    # during the merge, we join even unmapped packets so some
                    # entries may be empty => float64
                    _first("packetid"): np.float64,
                    _second("packetid"): np.float64,
                })
                return dtypes

            def _gen_converters() -> Dict[str, Callable]:
                fields = dict(tshark_config.fields)
                fields.update(per_pcap_artificial_fields)
                converters = {}
                # no need to convert tcpflags
                default_converters = {
                    name: f.converter
                    for name, f in fields.items()
                    if f.converter and name != "tcpflags"
                }
                for name, converter in default_converters.items():
                    converters.update({
                        _first(name): converter,
                        _second(name): converter
                    })
                return converters

            with open(cachename) as fd:
                dtypes = _gen_dtypes({
                    name: field.type
                    for name, field in tshark_config.fields.items()
                })
                converters = _gen_converters()

                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    # we don't need 'header' when metadata is in comments
                    comment='#',
                    sep=tshark_config.delimiter,
                    dtype=dtypes,
                    converters=converters,
                )
                # TODO: handle an empty cache file ("No columns to parse from file")
                # log.debug("Column names after loading from cache: %s", merged_df.columns)

            # we fix the clocks a posteriori so that the cache is still usable
            logging.debug("Postprocessing clock if needed")
            merged_df[_first('abstime')] += clock_offset1
            merged_df[_second('abstime')] += clock_offset2

        logging.debug("Converting dataframes to be sender/receiver based...")
        # in both cases
        # TODO here we should attribute the definite mptcp role
        # compute owd
        if mptcp:
            logging.error("We should correct the clocks if the argument is passed!")
            # raise mp.MpTcpException("Implement mptcp merge")
            res = convert_to_sender_receiver(merged_df)
        else:
            # tcp: this is where the clocks get corrected and the host1/host2
            # columns get renamed to _sender/_receiver
            res = convert_to_sender_receiver(merged_df)

        # don't apply the offset here or we might apply it twice
        # data["abstime"] += clock_offset

        logging.debug("Computing owds")
        log.debug("Column names: %s", res.columns)
        log.debug("Dtypes after load:%s\n" % dict(res.dtypes))

        # TODO we don't necessarily need to generate the OWDs here, could be moved out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]

        with pd.option_context('float_format', '{:f}'.format):
            print(res[_sender(["ipsrc", "ipdst", "abstime"])
                  + _receiver(["abstime", "packetid"])
                  + TCP_DEBUG_FIELDS + ["owd"]])

    except Exception:
        logging.exception("exception happened while merging")

    log.info("Finished loading. Merged dataframe size: %d" % len(res))

    return res
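# The _first/_second and _sender/_receiver helpers appear to suffix column
# names per side and to accept either a single name or a list of names.
# A minimal sketch under that assumption (the actual suffixes may differ):
from typing import List, Union

def _suffixer(suffix: str):
    def rename(col: Union[str, List[str]]):
        if isinstance(col, list):
            return [c + suffix for c in col]
        return col + suffix
    return rename

_sender_sketch = _suffixer("_sender")     # e.g. "abstime" -> "abstime_sender"
_receiver_sketch = _suffixer("_receiver")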
def load_merged_streams_into_pandas(
    pcap1: str,
    pcap2: str,
    streamid1: int,
    streamid2: int,
    # TODO change to a protocol enum
    mptcp: bool,
    tshark_config: TsharkConfig,
    clock_offset1: int = 0,
    clock_offset2: int = 0,
    mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
    **extra
):
    """
    Arguments:
        mptcp: True to merge mptcp streams, False for plain tcp
        mapping_mode: Only HASH works for now
        clock_offset1/clock_offset2: untested

    Returns:
        a dataframe with, among others, an 'owd' column
    """
    protocolStr = "mptcp" if mptcp else "tcp"
    log.debug(f"Asked to load {protocolStr} merged streams {streamid1} and "
              f"{streamid2} from pcaps {pcap1} and {pcap2}")

    cache = mp.get_cache()
    cacheid = cache.cacheuid(
        "merged",
        [getrealpath(pcap1), getrealpath(pcap2)],
        protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1, tshark_config, clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2, tshark_config, clock_offset=clock_offset2)

            main_connection = None   # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, MpTcpStreamId(streamid1))
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, MpTcpStreamId(streamid2))

                # for now we use known streams exclusively;
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))
            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, TcpStreamId(streamid1))
                other_connection = TcpConnection.build_from_dataframe(
                    df2, TcpStreamId(streamid2))

                # for now we use known streams exclusively;
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            log.info("Saving into %s" % cachename)

            # TODO export lists (e.g. reinjected_in) as proper strings
            # TODO re-encode the flags as hex()
            merged_df.to_csv(
                cachename,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )
            # tcpdest becomes a plain 'object' dtype instead of a CategoricalDtype,
            # see https://github.com/pandas-dev/pandas/issues/22361
            log.log(mp.TRACE, "saving with dtypes=%s", dict(merged_df.dtypes))
        else:
            log.info("Loading from cache %s", cachename)

            date_cols = get_date_cols(tshark_config.fields)

            with open(cachename) as fd:
                # generate the merged field list
                def _gen_fields(fields):
                    gfields = {}  # type: ignore
                    for _name in [_first, _second]:
                        gfields.update({_name(k): v for k, v in fields.items()})
                    return gfields

                # reltime is discarded on save
tshark_config.fields.pop("reltime") gfields = _gen_fields(tshark_config.fields) merge_dtypes = get_dtypes(gfields) # log.log(mp.TRACE, "Using gfields %s" % pp.pformat(gfields)) # we don't need any converters converters = {} date_cols = get_date_cols(gfields) log.log(mp.TRACE, "Using date_cols %s" % pp.pformat(date_cols)) log.log(mp.TRACE, "Using dtypes %s" % pp.pformat(merge_dtypes)) # log.log(mp.TRACE, "Using converters %s" % (pp.pformat(converters))) merged_df = pd.read_csv( fd, skip_blank_lines=True, comment='#', # we don't need 'header' when metadata is with comment sep=tshark_config.delimiter, # memory_map=True, # could speed up processing dtype=merge_dtypes, # poping still generates converters=converters, # date_parser=date_converter, parse_dates=date_cols, ) # at this stage, destinatiosn are nan debug_fields = ["abstime", "tcpstream", "tcpdest", "mptcpdest"] mptcpanalyzer.debug.debug_dataframe( merged_df, "Merged dataframe", usecols=(_first(debug_fields) + _second(debug_fields))) # workaround bug https://github.com/pandas-dev/pandas/issues/25448 def _convert_to_enums(): # per_pcap_artificial_fields for col in [ _first("tcpdest"), _first("mptcpdest"), _second("tcpdest"), _second("mptcpdest") ]: merged_df[col] = merged_df[col].apply( _convert_role, convert_dtype=False) # we fix the clocks a posteriori so that the cache is still usable log.debug("Postprocessing clock if needed") # merged_df[_first('abstime')] += clock_offset1 # merged_df[_second('abstime')] += clock_offset2 log.debug("Converting dataframes to be sender/receiver based...") # in both cases # TODO here we should attribute the definite mptcprole if mptcp: log.error( "We should correct the clocks if the argument is passed !") # raise mp.MpTcpException("Implement mptcp merge") res = convert_to_sender_receiver(merged_df) # fill MPTCP dest ? else: # tcp res = convert_to_sender_receiver(merged_df) # log.debug("Sorting by sender abstime") # merged_df.sort_values(by=_sender("abstime"), ascending=True, inplace=True) # debug_dataframe(res, "checking merge", usecols=["merge_status"]) # print("%d nan values" % len(res[res.merge_status == np.nan])) log.debug("Computing owds") debug_dataframe(res, "before owds") # TODO we don't necessarely need to generate the OWDs here, might be put out res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')] debug_dataframe( res, "owd", usecols=["owd", _sender('abstime'), _receiver('abstime')]) # with pd.option_context('float_format', '{:f}'.format): # print( # res[_sender(["ipsrc", "ipdst", "abstime"]) # + _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS + ["owd"] ] # ) except Exception as e: log.exception("exception happened while merging") # pd.set_option('display.max_rows', 200) # pd.set_option('display.max_colwidth', -1) # print("dtypes=", dict(dtypes)) log.log(mp.TRACE, "Dtypes after load:%s\n", pp.pformat(res.dtypes)) log.info("Finished loading. merged dataframe size: %d", len(res)) return res