def _convert_to_enums():
    # per_pcap_artificial_fields
    for col in [
            _first("tcpdest"), _first("mptcpdest"),
            _second("tcpdest"), _second("mptcpdest")
    ]:
        merged_df[col] = merged_df[col].apply(
            _convert_role, convert_dtype=False)
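# Hedged sketch (illustrative only, not part of the original module): the kind
# of string -> Enum conversion _convert_to_enums performs after a csv round
# trip. `Role` and `to_role` are made up for the example; the real code uses
# ConnectionRoles and _convert_role on the merged dataframe.
def _demo_role_conversion():
    from enum import Enum
    import pandas as pd

    class Role(Enum):
        Client = 0
        Server = 1

    def to_role(value):
        # values read back from csv are plain strings such as "Client"
        return Role[value] if isinstance(value, str) else value

    df = pd.DataFrame({"tcpdest_h1": ["Client", "Server", None]})
    df["tcpdest_h1"] = df["tcpdest_h1"].apply(to_role)
    return df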
def _gen_dtypes(fields) -> Dict[str, Any]:
    dtypes = {}  # type: ignore
    for _name in [_first, _second]:
        # TODO this could be simplified
        for k, v in fields.items():
            if v is not None or k not in ["tcpflags"]:
                dtypes.setdefault(_name(k), v)

        # add generated field dtypes
        dtypes.update({
            _name(f.fullname): f.type
            for f in per_pcap_artificial_fields.values()
        })

    # these are overrides from the generated dtypes
    dtypes.update({
        # during the merge, we join even unmapped packets so some entries
        # may be empty => float64
        _first("packetid"): np.float64,
        _second("packetid"): np.float64,
    })

    return dtypes
def map_tcp_packets_via_hash(
        # TODO rename, these are not host1/host2 anymore
        host1_df, host2_df, *kargs, **kwargs):
    """
    Merge on a hash of several fields.
    Resulting dataframe has H1_SUFFIX / H2_SUFFIX columns.
    """
    log.info("Merging dataframes via hash")
    debug_cols = ["packetid", "hash", "abstime"]
    # debug_dataframe(total, "concatenated df",
    #     usecols=_first(["abstime", "tcpdest"]) + _second(["abstime", "tcpdest"]))
    debug_dataframe(host1_df, "host1_df")
    debug_dataframe(host2_df, "host2 df")

    # TODO we could now use merge_asof
    # TODO here we should be able to drop some duplicated columns
    try:
        # check that hashes are unique on each side before merging
        host1_df = deal_with_duplicated_hash(host1_df)
        host2_df = deal_with_duplicated_hash(host2_df)

        res = pd.merge(
            host1_df, host2_df,
            on="hash",
            suffixes=(HOST1_SUFFIX, HOST2_SUFFIX),  # column suffixes
            how="outer",  # we want to keep packets from both hosts
            # we want to know how many packets were not mapped correctly; the
            # indicator column can take the values "left_only", "right_only" or "both"
            indicator="merge_status",
            # run additional checks against duplicate hashes
            validate="one_to_one",  # can slow down the process
        )
    except pd.errors.MergeError as e:
        # TODO we don't want to print here
        print("An error happened during the merge of the 2 pcaps")
        print(e)
        raise e

    # TCP_DEBUG_FIELDS
    TCP_DEBUG_FIELDS = ['packetid', "abstime"]
    debug_cols = _first(TCP_DEBUG_FIELDS) + _second(TCP_DEBUG_FIELDS)
    debug_dataframe(res, "Result of merging by hash", usecols=debug_cols)
    return res
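# Hedged sketch (toy dataframes, invented suffixes): how the outer merge on
# "hash" used above behaves, including the indicator column and the
# one_to_one validation that rejects duplicated hashes.
def _demo_hash_merge():
    import pandas as pd

    h1 = pd.DataFrame({"hash": [1, 2, 3], "packetid": [10, 11, 12]})
    h2 = pd.DataFrame({"hash": [2, 3, 4], "packetid": [20, 21, 22]})
    res = pd.merge(
        h1, h2,
        on="hash",
        suffixes=("_h1", "_h2"),   # the real code uses HOST1_SUFFIX/HOST2_SUFFIX
        how="outer",               # keep packets seen by only one host
        indicator="merge_status",  # "left_only" / "right_only" / "both"
        validate="one_to_one",     # raises MergeError on duplicated hashes
    )
    # packets missing on one side get NaN packetids, hence the float64 dtype override
    return res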
def _gen_converters() -> Dict[str, Callable]:
    # converters = {}  # type: Dict[str, Any]
    fields = dict(tshark_config.fields)
    fields.update(per_pcap_artificial_fields)
    converters = {}
    # no need to convert tcpflags
    default_converters = {
        name: f.converter
        for name, f in fields.items()
        if f.converter and name != "tcpflags"
    }
    # converters.update({name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
    for name, converter in default_converters.items():
        converters.update({
            _first(name): converter,
            _second(name): converter
        })

    return converters
def merge_tcp_dataframes_known_streams(
        con1: Tuple[pd.DataFrame, TcpConnection],
        con2: Tuple[pd.DataFrame, TcpConnection]
        # , dest: ConnectionRoles
) -> pd.DataFrame:
    """
    Generates an intermediate file with the owds.

    1/ clean up the dataframes to keep only the current stream's packets
    2/ identify which dataframe is the server's and which is the client's

    Args:
        con1: tuple (dataframe, tcp connection)
        con2: same

    Returns:
        res

    To ease debugging we want to see packets in chronological order
    """
    h1_df, main_connection = con1
    h2_df, mapped_connection = con2

    logging.info(
        "Trying to merge connection {} to {} of respective sizes {} and {}".
        format(mapped_connection, main_connection, len(h1_df), len(h2_df)))

    # print(h1_df[["packetid", "hash", "reltime"]].head(5))
    # print(h2_df[["packetid", "hash", "reltime"]].head(5))

    # cleanup the dataframes to contain only the current stream packets
    h1_df = h1_df[h1_df.tcpstream == main_connection.tcpstreamid]
    h2_df = h2_df[h2_df.tcpstream == mapped_connection.tcpstreamid]

    # TODO reorder columns to have packet ids first !
    total = pd.DataFrame()

    for tcpdest in ConnectionRoles:

        log.debug("Looking at tcpdestination %s" % tcpdest)

        q = main_connection.generate_direction_query(tcpdest)
        h1_unidirectional_df = h1_df.query(q)
        q = mapped_connection.generate_direction_query(tcpdest)
        h2_unidirectional_df = h2_df.query(q)

        res = map_tcp_packets(h1_unidirectional_df, h2_unidirectional_df)

        # pandas trick to avoid losing dtype
        # see https://github.com/pandas-dev/pandas/issues/22361#issuecomment-413147667
        # no need to set _second (as they are just opposite)
        # TODO this should be done somewhere else
        # else summary won't work
        res[_first('tcpdest')][:] = tcpdest
        res[_second('tcpdest')][:] = tcpdest

        # generate_mptcp_direction_query
        if isinstance(main_connection, MpTcpSubflow):
            print("THIS IS A SUBFLOW")
            mptcpdest = main_connection.mptcp_dest_from_tcpdest(tcpdest)
            res[_first('mptcpdest')][:] = mptcpdest
            res[_second('mptcpdest')][:] = mptcpdest
            print("Setting mptcpdest to %s" % mptcpdest)

        # if tcpdest == main_connection.mptcpdest
        # TODO here we should
        total = pd.concat([res, total])

    # TODO move elsewhere, to the outer function
    log.info(
        "Resulting merged tcp dataframe of size {} ({} mapped packets vs {} unmapped) "
        "with input dataframes of size {} and {}.".format(
            len(total), len(total[total._merge == "both"]),
            len(total[total._merge != "both"]), len(h1_df), len(h2_df)))

    # print("unmapped packets:")
    # print(total.loc[total._merge != "both", _sender(TCP_DEBUG_FIELDS) + _receiver(TCP_DEBUG_FIELDS)])

    return total
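# Hedged sketch (illustrative only, not part of the original module): the
# "res[col][:] = value" assignments above rely on in-place value assignment to
# keep an existing categorical dtype; a plain "res[col] = value" would replace
# the column and lose it (see pandas-dev/pandas#22361). This mirrors the trick
# on a bare Series; with recent copy-on-write pandas the in-place form may no
# longer behave the same way.
def _demo_keep_categorical_dtype():
    import pandas as pd

    ser = pd.Series(pd.Categorical([None, None], categories=["Client", "Server"]))
    ser[:] = "Client"   # values change, dtype stays 'category'
    return ser.dtype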
def convert_to_sender_receiver(
    df
    # def tcp_compute_owd(
    # already merged df
    # con1: Tuple[pd.DataFrame, TcpConnection],
    # con2: Tuple[pd.DataFrame, TcpConnection]
    # tcp_sender_df,
    # tcp_receiver_df
):
    """
    Each packet has a destination marker.
    Assume clocks are fine here !
    """
    logging.debug("Converting to sender/receiver format")

    total = pd.DataFrame()
    # min_h1 = df.iloc[0, subdf.columns.get_loc(_first('abstime'))]
    # min_h2 = df.iloc[0, subdf.columns.get_loc(_second('abstime'))]

    for tcpstream, subdf in df.groupby(_first("tcpstream")):
        # assume packets are in chronological order, else we would have to use min
        # min_h1 = h1_df['abstime'].min()
        # min_h2 = h2_df['abstime'].min()
        # min_h1 = subdf.loc[0, _first('abstime')]
        min_h1 = subdf.iloc[0, subdf.columns.get_loc(_first('abstime'))]
        min_h2 = subdf.iloc[0, subdf.columns.get_loc(_second('abstime'))]
        # min_h2 = subdf[_second('abstime')][0]
        print("min_h1 = %r" % min_h1)
        print("min_h1 float = %f" % min_h1)

        # def _rename_columns(h1_role: ConnectionRoles):
        #     """
        #     client_suffix, server_suffix
        #     Params:
        #         client_suffix must be one of HOST1_SUFFIX or HOST2_SUFFIX
        #         server_suffix can be deduced
        #     """
        def _rename_column(col_name, suffixes) -> str:
            for suffix_to_replace, new_suffix in suffixes.items():
                if col_name.endswith(suffix_to_replace):
                    return col_name.replace(suffix_to_replace, new_suffix)
            return col_name

        # total = pd.concat([total, subdf], ignore_index=True)
        # min_h1 = h1_df['abstime'].min()
        # min_h2 = h2_df['abstime'].min()
        logging.debug("Comparing %f (h1) with %f (h2)" % (min_h1, min_h2))
        if min_h1 < min_h2:
            logging.debug("Looks like h1 is the tcp client")
            # suffixes = {HOST1_SUFFIX: SENDER_SUFFIX, HOST2_SUFFIX: RECEIVER_SUFFIX}
            h1_role = ConnectionRoles.Client
        else:
            logging.debug("Looks like h2 is the tcp client")
            # suffixes = {HOST2_SUFFIX: SENDER_SUFFIX, HOST1_SUFFIX: RECEIVER_SUFFIX}
            h1_role = ConnectionRoles.Server

        print("renaming")
        # _rename_columns(role)
        for tcpdest, tdf in subdf.groupby(_first("tcpdest"), sort=False):
            if tcpdest == h1_role:
                suffixes = {
                    HOST2_SUFFIX: SENDER_SUFFIX,
                    HOST1_SUFFIX: RECEIVER_SUFFIX
                }
            else:
                suffixes = {
                    HOST1_SUFFIX: SENDER_SUFFIX,
                    HOST2_SUFFIX: RECEIVER_SUFFIX
                }

            rename_func = functools.partial(_rename_column, suffixes=suffixes)
            print("renaming inplace")
            tdf.rename(columns=rename_func, inplace=True)
            total = pd.concat([total, tdf], ignore_index=True)

        # subdf[_first("tcpdest") == ConnectionRole.Client].rename(columns=_rename_cols, inplace=True)
        print(subdf.columns)
        print(total.columns)

    logging.debug("Converted to sender/receiver format")
    return total
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,  # Union[MpTcpStreamId, TcpStreamId],
        streamid2: int,
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp
        mapping_mode: Only HASH works for now

    Returns a dataframe with columns... owd ?
    """
    log.debug(
        "Asked to load merged tcp streams %d and %d from pcaps %s and %s" %
        (streamid1, streamid2, pcap1, pcap2))

    cache = mp.get_cache()

    protocolStr = "mptcp" if mptcp else "tcp"
    cacheid = cache.cacheuid(
        "merged", [
            getrealpath(pcap1),
            getrealpath(pcap2),
        ], protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))
        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1, tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2, tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, streamid2)

                # TODO generate
                # map_mptcp_connection()

                # for now we use known streams exclusively
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))
            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = TcpConnection.build_from_dataframe(
                    df2, streamid2)

                # for now we use known streams exclusively
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            logging.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # here we lost the dtype: tcpdest has become an object
            print("saving with dtypes=", dict(merged_df.dtypes))
            # print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))

            # if log level >= DEBUG then save to xls too !
            # if True:
            #     filename = cachename + ".xls"
            #     logging.debug("Saved a debug excel copy at %s" % filename)
            #     merged_df.to_excel(filename)
        else:
            logging.info("Loading from cache %s" % cachename)

            # dtypes = {k: v for k, v in temp.items() if v is not None or k not in ["tcpflags"]}
            def _gen_dtypes(fields) -> Dict[str, Any]:
                dtypes = {}  # type: ignore
                for _name in [_first, _second]:
                    # TODO this could be simplified
                    for k, v in fields.items():
                        if v is not None or k not in ["tcpflags"]:
                            dtypes.setdefault(_name(k), v)

                    # add generated field dtypes
                    dtypes.update({
                        _name(f.fullname): f.type
                        for f in per_pcap_artificial_fields.values()
                    })

                # these are overrides from the generated dtypes
                dtypes.update({
                    # during the merge, we join even unmapped packets so some entries
                    # may be empty => float64
                    _first("packetid"): np.float64,
                    _second("packetid"): np.float64,
                })

                return dtypes

            def _gen_converters() -> Dict[str, Callable]:
                # converters = {}  # type: Dict[str, Any]
                fields = dict(tshark_config.fields)
                fields.update(per_pcap_artificial_fields)
                converters = {}
                # no need to convert tcpflags
                default_converters = {
                    name: f.converter
                    for name, f in fields.items()
                    if f.converter and name != "tcpflags"
                }
                # converters.update({name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
                for name, converter in default_converters.items():
                    converters.update({
                        _first(name): converter,
                        _second(name): converter
                    })

                return converters

            with open(cachename) as fd:
                dtypes = _gen_dtypes({
                    name: field.type
                    for name, field in tshark_config.fields.items()
                })
                converters = _gen_converters()
                # more recent versions can do without it
                # pd.set_option('display.max_rows', 200)
                # pd.set_option('display.max_colwidth', -1)
                # print("converters=", converters)
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True,  # could speed up processing
                    dtype=dtypes,
                    converters=converters,
                )
                # log.debug("Column names after loading from cache: %s", merged_df.columns)

        # TODO:
        # No columns to parse from file

        # we fix the clocks a posteriori so that the cache is still usable
        logging.debug("Postprocessing clock if needed")
        merged_df[_first('abstime')] += clock_offset1
        merged_df[_second('abstime')] += clock_offset2

        logging.debug("Converting dataframes to be sender/receiver based...")
        # in both cases
        # TODO here we should attribute the definite mptcp role
        # compute owd
        if mptcp:
            print("Should be merging OWDs")
            logging.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")
            res = convert_to_sender_receiver(merged_df)
        else:
            # tcp
            # this is where we fix the timestamps
            # we rename the host1/host2 columns to _sender / _receiver ?!
            res = convert_to_sender_receiver(merged_df)
            # don't do it here else we might repeat it
            # data["abstime"] += clock_offset

        logging.debug("Computing owds")
        log.debug("Column names: %s", res.columns)
        log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
        print("res=")
        # TODO we don't necessarily need to generate the OWDs here, might be moved out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]
        # .head(40))
        with pd.option_context('float_format', '{:f}'.format):
            print(res[_sender(["ipsrc", "ipdst", "abstime"]) +
                      _receiver(["abstime", "packetid"]) +
                      TCP_DEBUG_FIELDS + ["owd"]])

    except Exception:
        logging.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    # log.debug("Dtypes after load:%s\n" % pp.pformat(merged_df.dtypes))
    log.info("Finished loading. merged dataframe size: %d" % len(merged_df))

    return res
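# Hedged usage sketch (not part of the original module): how the loader above
# might be invoked once a TsharkConfig has been built elsewhere. The pcap
# paths and stream ids are placeholders.
def _demo_load_merged(config):
    # `config` is assumed to be an already initialized TsharkConfig
    df = load_merged_streams_into_pandas(
        "client.pcapng", "server.pcapng",   # hypothetical capture files
        streamid1=0, streamid2=0,           # tcp.stream ids in each capture
        mptcp=False,
        tshark_config=config,
    )
    # the merged frame carries per-host columns plus the computed one-way delays
    return df["owd"].describe()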
def merge_tcp_dataframes_known_streams(
        con1: Tuple[pd.DataFrame, TcpConnection],
        con2: Tuple[pd.DataFrame, TcpConnection]) -> pd.DataFrame:
    """
    Generates an intermediate file with the owds.

    1/ clean up the dataframes to keep only the current stream's packets
    2/ identify which dataframe is the server's and which is the client's

    Args:
        con1: tuple (dataframe, tcp connection)
        con2: same

    Returns:
        A dataframe with a "merge_status" column and valid tcp/mptcp destinations

    To ease debugging we want to see packets in chronological order
    """
    h1_df, main_connection = con1
    h2_df, mapped_connection = con2

    log.info(
        "Trying to merge connection {} to {} of respective sizes {} and {}".
        format(mapped_connection, main_connection, len(h1_df), len(h2_df)))

    # cleanup the dataframes to contain only the current stream packets
    h1_df = h1_df[h1_df.tcpstream == main_connection.tcpstreamid]
    h2_df = h2_df[h2_df.tcpstream == mapped_connection.tcpstreamid]

    # TODO reorder columns to have packet ids first !
    total = pd.DataFrame()

    for tcpdest in ConnectionRoles:

        log.debug("Merging tcp destination %s" % tcpdest)

        q = main_connection.generate_direction_query(tcpdest)
        h1_unidirectional_df = h1_df.query(q, engine="python")
        q = mapped_connection.generate_direction_query(tcpdest)
        h2_unidirectional_df = h2_df.query(q, engine="python")

        res = map_tcp_packets(h1_unidirectional_df, h2_unidirectional_df)

        # pandas trick to avoid losing dtype
        # see https://github.com/pandas-dev/pandas/issues/22361#issuecomment-413147667
        # no need to set _second (as they are just opposite)
        # TODO this should be done somewhere else
        # else summary won't work
        res[_first('tcpdest')][:] = tcpdest
        res[_second('tcpdest')][:] = tcpdest

        # generate_mptcp_direction_query
        # TODO this is not always reached ?
        log.info("con of TYPE %r", main_connection)
        if isinstance(main_connection, MpTcpSubflow):
            log.debug("This is a subflow, setting mptcp destinations...")
            mptcpdest = main_connection.mptcp_dest_from_tcpdest(tcpdest)
            res[_first('mptcpdest')][:] = mptcpdest
            res[_second('mptcpdest')][:] = mptcpdest
            log.debug("Setting mptcpdest to %s" % mptcpdest)

        total = pd.concat([res, total])

    debugcols = _first(["abstime", "tcpdest", "mptcpdest"]) + \
        _second(["abstime", "tcpdest", "mptcpdest"])
    debug_dataframe(total, "concatenated df", usecols=debugcols)

    log.info(
        "Resulting merged tcp dataframe of size {} ({} mapped packets vs {} unmapped) "
        "with input dataframes of size {} and {}.".format(
            len(total), len(total[total.merge_status == "both"]),
            len(total[total.merge_status != "both"]), len(h1_df), len(h2_df)))

    # print("unmapped packets:")
    # print(total.loc[total._merge != "both", _sender(TCP_DEBUG_FIELDS) + _receiver(TCP_DEBUG_FIELDS)])

    return total
def convert_to_sender_receiver(df) -> pd.DataFrame:
    """
    Convert dataframe from X_HOST1 | X_HOST2 to X_SENDER | X_RECEIVER

    Each packet has a destination marker.
    Assume clocks are fine here !
    """
    log.debug("Converting from host_1/host_2 to sender/receiver format")

    # fill up afterwards
    total = pd.DataFrame()

    for tcpstream, subdf in df.groupby(_first("tcpstream")):
        min_h1 = subdf.iloc[0, subdf.columns.get_loc(_first('abstime'))]
        min_h2 = subdf.iloc[0, subdf.columns.get_loc(_second('abstime'))]

        # def _rename_columns(h1_role: ConnectionRoles):
        #     """
        #     client_suffix, server_suffix
        #     Params:
        #         client_suffix must be one of HOST1_SUFFIX or HOST2_SUFFIX
        #         server_suffix can be deduced
        #     """
        def _rename_column(col_name, suffixes) -> str:
            for suffix_to_replace, new_suffix in suffixes.items():
                if col_name.endswith(suffix_to_replace):
                    return col_name.replace(suffix_to_replace, new_suffix)
            return col_name

        # total = pd.concat([total, subdf], ignore_index=True)
        log.debug(f"Comparing {min_h1} (h1) with {min_h2} (h2)")
        assert min_h1 != min_h2, (
            f"Identical first timestamps on both sides ({min_h1} and {min_h2}). "
            "Either the clock is not precise enough or it's a bug"
            " (more likely)")

        if min_h1 < min_h2:
            log.debug("Looks like h1 is the tcp client")
            # suffixes = {HOST1_SUFFIX: SENDER_SUFFIX, HOST2_SUFFIX: RECEIVER_SUFFIX}
            h1_role = ConnectionRoles.Client
        else:
            if min_h1 == min_h2:
                log.warning("there is an issue")
            log.debug("Looks like h2 is the tcp client")
            h1_role = ConnectionRoles.Server

        # _rename_columns(role)
        for tcpdest, tdf in subdf.groupby(_first("tcpdest"), sort=False):
            if tcpdest == h1_role:
                suffixes = {
                    HOST2_SUFFIX: SENDER_SUFFIX,
                    HOST1_SUFFIX: RECEIVER_SUFFIX
                }
            else:
                suffixes = {
                    HOST1_SUFFIX: SENDER_SUFFIX,
                    HOST2_SUFFIX: RECEIVER_SUFFIX
                }

            log.debug("suffixes: %s" % suffixes)
            rename_func = functools.partial(_rename_column, suffixes=suffixes)
            log.log(mp.TRACE, "renaming inplace")
            log.debug("total df size = %d" % len(total))

            with pd.option_context('precision', 20):
                debug_cols = _first(["abstime", "tcpdest"]) + _second(
                    ["abstime", "tcpdest"])
                log.log(mp.TRACE, "before rename \n%s", tdf[debug_cols])
                tdf = tdf.rename(columns=rename_func, copy=True, inplace=False)
                debug_cols = _sender(["abstime", "tcpdest"]) + _receiver(
                    ["abstime", "tcpdest"])
                log.log(mp.TRACE, "After rename \n%s" % tdf[debug_cols])
                # print(tdf[debug_cols])
                # debug_dataframe(tdf, "temporary dataframe")

            total = pd.concat(
                [total, tdf],
                ignore_index=True,
                sort=False,
            )
            # print("total df size = %d" % len(total))

        # subdf[_first("tcpdest") == ConnectionRole.Client].rename(columns=_rename_cols, inplace=True)
        # print(subdf.columns)
        # print(total.columns)

    # debug_dataframe(total, "total")
    log.debug("Converted to sender/receiver format")
    log.log(mp.TRACE, "Comparing #unique entries %d vs #all %d",
            total[_sender("abstime")].count(), len(total[_sender("abstime")]))
    # assert total[_sender("abstime")].count() == len(total[_sender("abstime")])

    return total
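# Hedged sketch (column and suffix names invented): the suffix swap performed
# by _rename_column above, mapping host1/host2 columns to sender/receiver
# depending on which host sends in the current direction.
def _demo_suffix_rename():
    import pandas as pd

    df = pd.DataFrame(columns=["abstime_h1", "abstime_h2", "tcpdest_h1", "tcpdest_h2"])
    suffixes = {"_h1": "_snd", "_h2": "_rcv"}   # here h1 is the sender

    def rename(col_name):
        for old, new in suffixes.items():
            if col_name.endswith(old):
                return col_name.replace(old, new)
        return col_name

    renamed = df.rename(columns=rename)
    return renamed.columns.tolist()
    # -> ['abstime_snd', 'abstime_rcv', 'tcpdest_snd', 'tcpdest_rcv']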
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,
        streamid2: int,
        # TODO change this to a protocol
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp
        mapping_mode: Only HASH works for now
        clock_offset: untested

    Returns a dataframe with columns... owd ?
    """
    protocolStr = "mptcp" if mptcp else "tcp"
    log.debug(f"Asked to load {protocolStr} merged streams {streamid1} and "
              f"{streamid2} from pcaps {pcap1} and {pcap2}")

    cache = mp.get_cache()
    cacheid = cache.cacheuid(
        "merged", [getrealpath(pcap1), getrealpath(pcap2)],
        protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))
        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1, tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2, tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, MpTcpStreamId(streamid1))
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, MpTcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))
            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, TcpStreamId(streamid1))
                other_connection = TcpConnection.build_from_dataframe(
                    df2, TcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interesting to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            log.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            # or abstime ?
            # TODO rechange the flags hex()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )
            # tcpdest has become an object instead of a CategoricalDtype
            # see https://github.com/pandas-dev/pandas/issues/22361
            log.log(mp.TRACE, "saving with dtypes=%s", dict(merged_df.dtypes))
        else:
            log.info("Loading from cache %s", cachename)

            date_cols = get_date_cols(tshark_config.fields)

            with open(cachename) as fd:
                # generate the field list
                def _gen_fields(fields):
                    gfields = {}  # type: ignore
                    for _name in [_first, _second]:
                        gfields.update({_name(k): v for k, v in fields.items()})
                    return gfields

                # reltime discarded on save ?
                tshark_config.fields.pop("reltime")
                gfields = _gen_fields(tshark_config.fields)
                merge_dtypes = get_dtypes(gfields)
                # log.log(mp.TRACE, "Using gfields %s" % pp.pformat(gfields))

                # we don't need any converters
                converters = {}
                date_cols = get_date_cols(gfields)

                log.log(mp.TRACE, "Using date_cols %s" % pp.pformat(date_cols))
                log.log(mp.TRACE, "Using dtypes %s" % pp.pformat(merge_dtypes))
                # log.log(mp.TRACE, "Using converters %s" % pp.pformat(converters))
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True,  # could speed up processing
                    dtype=merge_dtypes,
                    converters=converters,
                    # date_parser=date_converter,
                    parse_dates=date_cols,
                )
                # at this stage, destinations are nan

                debug_fields = ["abstime", "tcpstream", "tcpdest", "mptcpdest"]
                mptcpanalyzer.debug.debug_dataframe(
                    merged_df, "Merged dataframe",
                    usecols=(_first(debug_fields) + _second(debug_fields)))

                # workaround bug https://github.com/pandas-dev/pandas/issues/25448
                def _convert_to_enums():
                    # per_pcap_artificial_fields
                    for col in [
                            _first("tcpdest"), _first("mptcpdest"),
                            _second("tcpdest"), _second("mptcpdest")
                    ]:
                        merged_df[col] = merged_df[col].apply(
                            _convert_role, convert_dtype=False)

        # we fix the clocks a posteriori so that the cache is still usable
        log.debug("Postprocessing clock if needed")
        # merged_df[_first('abstime')] += clock_offset1
        # merged_df[_second('abstime')] += clock_offset2

        log.debug("Converting dataframes to be sender/receiver based...")
        # in both cases
        # TODO here we should attribute the definite mptcp role
        if mptcp:
            log.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")
            res = convert_to_sender_receiver(merged_df)
            # fill MPTCP dest ?
        else:
            # tcp
            res = convert_to_sender_receiver(merged_df)

        # log.debug("Sorting by sender abstime")
        # merged_df.sort_values(by=_sender("abstime"), ascending=True, inplace=True)
        # debug_dataframe(res, "checking merge", usecols=["merge_status"])
        # print("%d nan values" % len(res[res.merge_status == np.nan]))

        log.debug("Computing owds")
        debug_dataframe(res, "before owds")
        # TODO we don't necessarily need to generate the OWDs here, might be moved out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]

        debug_dataframe(
            res, "owd",
            usecols=["owd", _sender('abstime'), _receiver('abstime')])
        # with pd.option_context('float_format', '{:f}'.format):
        #     print(res[_sender(["ipsrc", "ipdst", "abstime"])
        #           + _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS + ["owd"]])

    except Exception:
        log.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    log.log(mp.TRACE, "Dtypes after load:%s\n", pp.pformat(res.dtypes))
    log.info("Finished loading. merged dataframe size: %d", len(res))

    return res
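# Hedged sketch (toy data, invented column names): the csv cache round trip
# used above, with float64 for possibly-unmapped packet ids and parse_dates
# for the absolute timestamps.
def _demo_cache_roundtrip():
    import io
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "packetid_h1": [1.0, np.nan],   # unmapped packets become NaN, hence float64
        "abstime_h1": pd.to_datetime(["2019-01-01 00:00:00.1",
                                      "2019-01-01 00:00:00.2"]),
    })
    buf = io.StringIO()
    df.to_csv(buf, index=False, sep="|")
    buf.seek(0)
    reloaded = pd.read_csv(
        buf,
        sep="|",
        dtype={"packetid_h1": np.float64},
        parse_dates=["abstime_h1"],
    )
    return reloaded.dtypes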