Example #1
    def _map_subflows(main: MpTcpConnection, mapped: MpTcpConnection):
        """
        """
        mapped_subflows = []
        for sf in main.subflows():

            # generates a list (subflow, score)
            scores = list(
                map(lambda x: TcpMapping(x, sf.score(x)), mapped.subflows()))
            scores.sort(key=lambda x: x[1], reverse=True)
            # print("sorted scores when mapping %s:\n %r" % (sf, scores))
            mapped_subflows.append((sf, scores[0]))
            # TODO might want to remove the selected subflow from the pool of candidates
        return mapped_subflows
Example #2
    def _map_subflows(main: MpTcpConnection, mapped: MpTcpConnection):
        """
        """
        mapped_subflows = []
        for sf in main.subflows():

            # generates a list (subflow, score)
            scores = list(
                map(lambda x: TcpMapping(x, sf.score(x)), mapped.subflows()))
            scores.sort(key=lambda x: x.score, reverse=True)
            log.log(mp.TRACE,
                    "sorted scores when mapping %s:\n %r" % (sf, scores))
            mapped_subflows.append((sf, scores[0]))
        return mapped_subflows
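
The two variants above differ only in the sort key: Example #1 sorts by index (x[1]) while Example #2 sorts by attribute (x.score). Both work if TcpMapping is a namedtuple whose second field is the score; a minimal sketch (the field names are an assumption inferred from the surrounding code):

    from collections import namedtuple

    # hypothetical definition; only the (mapped, score) ordering is implied
    # by the examples above
    TcpMapping = namedtuple("TcpMapping", ["mapped", "score"])

    m = TcpMapping("subflow-0", 3.0)
    assert m[1] == m.score == 3.0   # index and attribute access agree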
Example #3
def merge_mptcp_dataframes(df1: pd.DataFrame, df2: pd.DataFrame,
                           df1_mptcpstream: int) -> Tuple[pd.DataFrame, str]:
    """
    First looks in df2 for a stream matching df1_mptcpstream

    See:
        merge_mptcp_dataframes_known_streams
    """
    main_connection = MpTcpConnection.build_from_dataframe(
        df1, df1_mptcpstream)

    # we map over df2
    mappings = map_mptcp_connection(df2, main_connection)

    if len(mappings) <= 0:
        # TODO raise an exception instead of returning an error string
        return None, "Could not find a match in the second pcap for mptcpstream %d" % df1_mptcpstream

    print("len(df1)=", len(df1), " len(rawdf2)=", len(df2))
    mapped_connection = mappings[0].mapped
    print("Found mappings %s" % mappings)
    for mapping in mappings:
        print("Con: %s" % (mapping.mapped))

    return merge_mptcp_dataframes_known_streams((df1, main_connection),
                                                (df2, mapped_connection)), None
Example #4
def map_mptcp_connection_from_known_streams(
        # rawdf2: pd.DataFrame,
        main: MpTcpConnection,
        other: MpTcpConnection) -> MpTcpMapping:
    """
    Attempts to map subflows only if score is high enough
    """

    # other = MpTcpConnection.build_from_dataframe(rawdf2, mptcpstream2)
    def _map_subflows(main: MpTcpConnection, mapped: MpTcpConnection):
        """
        """
        mapped_subflows = []
        for sf in main.subflows():

            # generates a list (subflow, score)
            scores = list(
                map(lambda x: TcpMapping(x, sf.score(x)), mapped.subflows()))
            scores.sort(key=lambda x: x[1], reverse=True)
            # print("sorted scores when mapping %s:\n %r" % (sf, scores))
            mapped_subflows.append((sf, scores[0]))
            # TODO might want to remove the selected subflow from the pool of candidates
        return mapped_subflows

    mptcpscore = main.score(other)
    mapped_subflows = None
    if mptcpscore > float('-inf'):
        # (other, score)
        mapped_subflows = _map_subflows(main, other)

    mapping = MpTcpMapping(mapped=other,
                           score=mptcpscore,
                           subflow_mappings=mapped_subflows)
    # print("mptcp mapping %s" % (mapping,))
    return mapping
Example #5
def map_mptcp_connection_from_known_streams(
        main: MpTcpConnection, other: MpTcpConnection) -> MpTcpMapping:
    """
    Attempts to map subflows only if score is high enough
    """
    def _map_subflows(main: MpTcpConnection, mapped: MpTcpConnection):
        """
        """
        mapped_subflows = []
        for sf in main.subflows():

            # generates a list (subflow, score)
            scores = list(
                map(lambda x: TcpMapping(x, sf.score(x)), mapped.subflows()))
            scores.sort(key=lambda x: x.score, reverse=True)
            log.log(mp.TRACE,
                    "sorted scores when mapping %s:\n %r" % (sf, scores))
            mapped_subflows.append((sf, scores[0]))
        return mapped_subflows

    mptcpscore = main.score(other)
    mapped_subflows = None
    if mptcpscore > float('-inf'):
        # (other, score)
        mapped_subflows = _map_subflows(main, other)

    mapping = MpTcpMapping(mapped=other,
                           score=mptcpscore,
                           subflow_mappings=mapped_subflows)
    log.log(mp.TRACE, "mptcp mapping %s", mapping)
    return mapping
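
MpTcpMapping is constructed above with the keywords mapped=, score= and subflow_mappings=, which is consistent with a namedtuple. A hedged sketch of how callers such as map_mptcp_connection (Example #12) can then rank the results, assuming that shape:

    from collections import namedtuple

    # hypothetical definition matching the keyword arguments used above
    MpTcpMapping = namedtuple("MpTcpMapping",
                              ["mapped", "score", "subflow_mappings"])

    mappings = [
        MpTcpMapping(mapped="stream-7", score=2.0, subflow_mappings=[]),
        MpTcpMapping(mapped="stream-3", score=float("inf"), subflow_mappings=[]),
    ]
    mappings.sort(key=lambda x: x.score, reverse=True)  # best match first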
Example #6
    def list_subflows(self, mptcpstreamid: int):
        """List the subflows of the given mptcp.stream."""
        try:
            con = MpTcpConnection.build_from_dataframe(self.data, mptcpstreamid)
            self.poutput("mptcp.stream %d has %d subflow(s) (client/server): " % (mptcpstreamid, len(con.subflows())))
            for sf in con.subflows():
                self.poutput("\t%s" % sf)
        except mp.MpTcpException as e:
            self.perror(e)
Example #7
def mptcpdest_from_connections(df, con: MpTcpConnection) -> pd.DataFrame:
    """Tag each packet of df with its MPTCP-level destination."""
    for dest in ConnectionRoles:

        log.debug("Looking at mptcp destination %s" % dest)
        q = con.generate_direction_query(dest)
        df_dest = df.query(q)
        df.loc[df_dest.index, 'mptcpdest'] = dest

    return df
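
The pattern above is plain pandas: run a boolean query to select one direction, then write a label back onto the matching rows through their index. A standalone sketch with made-up data:

    import pandas as pd

    df = pd.DataFrame({"ipsrc": ["10.0.0.1", "10.0.0.2"],
                       "mptcpdest": [None, None]})
    # select one direction, as generate_direction_query would
    df_dest = df.query("ipsrc == '10.0.0.1'")
    # tag only the matching rows
    df.loc[df_dest.index, "mptcpdest"] = "Server"
    print(df)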
Example #8
    def plot(df, mptcpstream, field, **kwargs):
        """
        We get min/max
        Need a direction !
         idxmin() and idxmax()
        """
        # need to look for min/max DSN on each subflow
        con = MpTcpConnection.build_from_dataframe(df, mptcpstream)

        # streams = df.groupby("tcpstream")
        for subflow in con.subflows():
            ds = df.query(subflow.generate_direction_query())
            min_dsn, max_dsn = ds["dsn"].min(), ds["dsn"].max()
            print("Transferred bytes on subflow %s =" %
                  (subflow, max_dsn - min_dsn))
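
The same per-subflow byte count can be obtained without an explicit loop; a sketch using a groupby over tcpstream (column names as above, data made up):

    import pandas as pd

    df = pd.DataFrame({"tcpstream": [0, 0, 1, 1],
                       "dsn": [100, 500, 200, 260]})
    per_subflow = df.groupby("tcpstream")["dsn"].agg(["min", "max"])
    per_subflow["transferred"] = per_subflow["max"] - per_subflow["min"]
    print(per_subflow)   # 400 bytes on subflow 0, 60 on subflow 1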
Example #9
    def do_map_mptcp_connection(self, args):
        """
        Tries to map mptcp.streams from different pcaps.
        Score-based mechanism.

        Todo:
            - Limit number of displayed matches
        """

        df1 = load_into_pandas(args.pcap1, self.tshark_config)
        df2 = load_into_pandas(args.pcap2, self.tshark_config)


        main_connection = MpTcpConnection.build_from_dataframe(df1, args.mptcpstreamid)
        mappings = map_mptcp_connection(df2, main_connection)


        self.poutput("%d mapping(s) found" % len(mappings))
        mappings.sort(key=lambda x: x.score, reverse=True)

        for rank, match in enumerate(mappings):

            if rank >= args.limit:
                self.pfeedback("ignoring mappings left")
                break

            winner_like = match.score == float('inf')

            output = "{c1.mptcpstreamid} <-> {c2.mptcpstreamid} with score={score} {extra}"
            formatted_output = output.format(
                c1=main_connection,
                c2=match.mapped,
                score=FG_COLORS['red'] + str(match.score) + color_off,
                extra= " <-- should be a correct match" if winner_like else ""
            )

            if match.score < args.trim:
                continue

            # match = MpTcpMapping(match.mapped, match.score, mapped_subflows)
            def _print_subflow(x):
                return "\n-" + x[0].format_mapping(x[1])

            formatted_output += ''.join(
                [_print_subflow(x) for x in match.subflow_mappings])

            self.poutput(formatted_output)
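
The output template relies on str.format's attribute lookup: "{c1.mptcpstreamid}" reads the attribute off the object passed as c1. A tiny self-contained demo (Con is a hypothetical stand-in for MpTcpConnection):

    class Con:
        def __init__(self, mptcpstreamid):
            self.mptcpstreamid = mptcpstreamid

    out = "{c1.mptcpstreamid} <-> {c2.mptcpstreamid}".format(c1=Con(0), c2=Con(4))
    print(out)   # 0 <-> 4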
Example #10
    def preprocess(self, dataframe, mptcpstream=None, skipped_subflows=None, **opt):
        """
        Can filter a dataframe beforehand

        :param opt: Should be the expanded result of argparse
        :param mptcpstream: Filters the dataframe so as to keep only the packets
        related to mptcp.stream == mptcpstream
        :type mptcpstream: int
        :param skipped_subflows: tcpstream ids to filter out

        This baseclass can filter on:

        - mptcpstream
        - destination (mptcpstream required)
        - skipped_subflows

        Returns updated dataframe
        """
        queries = []
        if mptcpstream is not None:
            queries.append("mptcpstream == %d" % mptcpstream)
            if opt.get("destination", False):
                # Generate a filter for the connection
                con = MpTcpConnection.build_from_dataframe(dataframe, mptcpstream)
                q = con.generate_direction_query(opt.get("destination"))
                queries.append(q)

        # for skipped_subflow in opt.get("skipped_subflows", []):
        for skipped_subflow in skipped_subflows or []:
            queries.append(" tcpstream != %d " % skipped_subflow)

        query = " and ".join(queries)

        # throws when querying with an empty query
        if len(query) > 0:
            log.info("Running query:\n%s" % query)
            dataframe = dataframe.query(query)

        if not len(dataframe.index):
            raise Exception("Empty dataframe after running query [%s]" % query)
            # print("no packet matching mptcp.stream %d"
            #     "(use 'lc' command to list connections)" % args.mptcpstream)
            # return

        return dataframe
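
The filter composition above is a small, reusable pandas idiom: collect individual conditions as strings, join them with " and ", and feed the result to DataFrame.query. A standalone sketch:

    import pandas as pd

    df = pd.DataFrame({"mptcpstream": [0, 0, 1], "tcpstream": [2, 3, 4]})
    queries = ["mptcpstream == 0", "tcpstream != 3"]
    query = " and ".join(queries)
    print(df.query(query))   # keeps only the row with tcpstream == 2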
Example #11
def mptcp_compute_throughput(
        rawdf, mptcpstreamid, destination: ConnectionRoles
    # mptcpstreamid2=None
) -> Tuple[bool, Any]:
    """
    Very raw computation: subtract the lowest DSN from the highest one

    Returns:
        a tuple (True/False, dict)
    """

    df = rawdf[rawdf.mptcpstream == mptcpstreamid]
    if df.empty:
        return False, "No packet with mptcp.stream == %d" % mptcpstreamid

    con = MpTcpConnection.build_from_dataframe(df, mptcpstreamid)
    q = con.generate_direction_query(destination)
    df = df.query(q)

    dsn_min = df.dss_dsn.min()
    dsn_max = df.dss_dsn.max()
    total_transferred = dsn_max - dsn_min
    d = df.groupby(_sender('tcpstream'))
    subflow_stats: List[Any] = []
    for tcpstream, group in d:
        # TODO drop retransmitted
        subflow_load = group.drop_duplicates(subset="dss_dsn").dss_length.sum()
        subflow_load = subflow_load if not math.isnan(subflow_load) else 0
        subflow_stats.append({
            'tcpstreamid': tcpstream,
            'throughput_bytes': int(subflow_load)
        })

    return True, {
        'mptcpstreamid': mptcpstreamid,
        # TODO append bytes
        'mptcp_goodput_bytes': total_transferred,
        'mptcp_throughput_bytes': sum(map(lambda x: x['throughput_bytes'], subflow_stats)),
        'subflow_stats': subflow_stats,
    }
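
The per-subflow aggregation drops duplicated DSS mappings (retransmissions carry the same dss_dsn) before summing the payload lengths. A standalone sketch of that step with made-up data:

    import math
    import pandas as pd

    group = pd.DataFrame({
        "dss_dsn": [1000, 1000, 2000],    # the first mapping was retransmitted
        "dss_length": [100, 100, 50],
    })
    subflow_load = group.drop_duplicates(subset="dss_dsn").dss_length.sum()
    subflow_load = subflow_load if not math.isnan(subflow_load) else 0
    print(subflow_load)   # 150: the retransmitted 100 bytes count only once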
Example #12
def map_mptcp_connection(rawdf2: pd.DataFrame,
                         main: MpTcpConnection) -> List[MpTcpMapping]:
    """
    Warning: work in progress, do not trust the results yet!

    Returns:
        List of (connection, score) with the best mapping first

    This function tries to map an mptcp.stream from one dataframe (aka pcap)
    to an mptcp.stream in another dataframe. For now it only looks at
    IP-level information, without considering the subflow mapping score.
    """
    log.warning("mapping between datasets is not considered trustable yet")
    results: List[MpTcpMapping] = []

    for mptcpstream2 in rawdf2[_sender("mptcpstream")].dropna().unique():
        other = MpTcpConnection.build_from_dataframe(rawdf2, mptcpstream2)
        mapping = map_mptcp_connection_from_known_streams(main, other)
        results.append(mapping)

    results.sort(key=lambda x: x.score, reverse=True)

    return results
Example #13
def map_mptcp_connection(rawdf2: pd.DataFrame,
                         main: MpTcpConnection) -> List[MpTcpMapping]:
    """
    Warning: work in progress, do not trust the results yet!

    Returns:
        List of (connection, score) with the best mapping first

    This function tries to map an mptcp.stream from one dataframe (aka pcap)
    to an mptcp.stream in another dataframe. For now it only looks at
    IP-level information, without considering the subflow mapping score.
    """
    log.warning("mapping between datasets is not considered trustable yet")
    results = []  # type: List[MpTcpMapping]

    for mptcpstream2 in rawdf2[_sender("mptcpstream")].dropna().unique():
        other = MpTcpConnection.build_from_dataframe(rawdf2, mptcpstream2)
        mapping = map_mptcp_connection_from_known_streams(main, other)
        results.append(mapping)

    # sort by score, best mapping first
    results.sort(key=lambda x: x.score, reverse=True)

    return results
Example #14
    def connection(self, streamid) -> MpTcpConnection:
        return MpTcpConnection.build_from_dataframe(self._obj, streamid)
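
The self._obj attribute suggests this method lives on a pandas extension accessor. A minimal sketch of that pattern, assuming an accessor named "mptcp" (the registration is not shown in the source):

    import pandas as pd

    @pd.api.extensions.register_dataframe_accessor("mptcp")
    class MpTcpAccessor:
        def __init__(self, pandas_obj):
            self._obj = pandas_obj

        def streams(self):
            return self._obj["mptcpstream"].dropna().unique()

    df = pd.DataFrame({"mptcpstream": [0, 0, 1]})
    print(df.mptcp.streams())   # [0 1]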
Example #15
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,
        streamid2: int,
        # TODO changed to protocol
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp
        mapping_mode: Only HASH works for now
        clock_offset: untested

    Returns
        a dataframe with columns... owd ?
    """
    protocolStr = "mptcp" if mptcp else "tcp"
    log.debug(f"Asked to load {protocolStr} merged streams {streamid1} and "
              "{streamid2} from pcaps {pcap1} and {pcap2}")

    cache = mp.get_cache()

    cacheid = cache.cacheuid(
        "merged", [getrealpath(pcap1), getrealpath(pcap2)],
        protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, MpTcpStreamId(streamid1))
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, MpTcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, TcpStreamId(streamid1))
                other_connection = TcpConnection.build_from_dataframe(
                    df2, TcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            log.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            # or abstime ?

            # TODO rechange the flags hex()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # tcpdest had become an objected instead of a CategoricalDtype
            # see https://github.com/pandas-dev/pandas/issues/22361
            log.log(mp.TRACE, "saving with dtypes=", dict(merged_df.dtypes))

        else:
            log.info("Loading from cache %s", cachename)

            date_cols = get_date_cols(tshark_config.fields)

            with open(cachename) as fd:
                # generate fieldlist
                def _gen_fields(fields):
                    gfields = {}  # type: ignore
                    for _name in [_first, _second]:
                        gfields.update(
                            {_name(k): v
                             for k, v in fields.items()})
                    return gfields

                # reltime discarded on save ?
                tshark_config.fields.pop("reltime")
                gfields = _gen_fields(tshark_config.fields)
                merge_dtypes = get_dtypes(gfields)
                # log.log(mp.TRACE, "Using gfields %s" % pp.pformat(gfields))

                # we don't need any converters
                converters = {}
                date_cols = get_date_cols(gfields)

                log.log(mp.TRACE, "Using date_cols %s" % pp.pformat(date_cols))
                log.log(mp.TRACE, "Using dtypes %s" % pp.pformat(merge_dtypes))
                # log.log(mp.TRACE, "Using converters %s" % (pp.pformat(converters)))
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=merge_dtypes,  # poping still generates
                    converters=converters,
                    # date_parser=date_converter,
                    parse_dates=date_cols,
                )
                # at this stage, destinations are nan

                debug_fields = ["abstime", "tcpstream", "tcpdest", "mptcpdest"]
                mptcpanalyzer.debug.debug_dataframe(
                    merged_df,
                    "Merged dataframe",
                    usecols=(_first(debug_fields) + _second(debug_fields)))

                # workaround bug https://github.com/pandas-dev/pandas/issues/25448
                def _convert_to_enums():
                    # per_pcap_artificial_fields
                    for col in [
                            _first("tcpdest"),
                            _first("mptcpdest"),
                            _second("tcpdest"),
                            _second("mptcpdest")
                    ]:
                        merged_df[col] = merged_df[col].apply(
                            _convert_role, convert_dtype=False)

        # we fix the clocks a posteriori so that the cache is still usable
        log.debug("Postprocessing clock if needed")
        # merged_df[_first('abstime')] += clock_offset1
        # merged_df[_second('abstime')] += clock_offset2

        log.debug("Converting dataframes to be sender/receiver based...")

        # in both cases
        # TODO here we should attribute the definite mptcprole
        if mptcp:
            log.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
            # fill MPTCP dest ?
        else:
            # tcp
            res = convert_to_sender_receiver(merged_df)

        # log.debug("Sorting by sender abstime")
        # merged_df.sort_values(by=_sender("abstime"), ascending=True, inplace=True)
        # debug_dataframe(res, "checking merge", usecols=["merge_status"])
        # print("%d nan values" % len(res[res.merge_status == np.nan]))

        log.debug("Computing owds")

        debug_dataframe(res, "before owds")
        # TODO we don't necessarily need to generate the OWDs here, might be put out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]

        debug_dataframe(
            res,
            "owd",
            usecols=["owd", _sender('abstime'),
                     _receiver('abstime')])
        # with pd.option_context('float_format', '{:f}'.format):
        #     print(
        #         res[_sender(["ipsrc", "ipdst", "abstime"])
        #          + _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS + ["owd"] ]
        #     )

    except Exception:
        log.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    log.log(mp.TRACE, "Dtypes after load:%s\n", pp.pformat(res.dtypes))
    log.info("Finished loading. merged dataframe size: %d", len(res))

    return res
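
The cache reload boils down to pd.read_csv with explicit dtypes, a comment marker for the metadata line, and date parsing. A standalone sketch with invented column names:

    import io
    import pandas as pd

    csv = io.StringIO("# metadata\npacketid|abstime\n1|2020-01-01 00:00:00.5\n")
    df = pd.read_csv(
        csv,
        comment="#",                     # skip the metadata line, as above
        sep="|",
        dtype={"packetid": "float64"},   # unmapped packets may be NaN
        parse_dates=["abstime"],
    )
    print(df.dtypes)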
Example #16
def filter_dataframe(
        self,
        rawdf,
        # TODO choose prefix
        merged_one,
        tcpstream=None,
        mptcpstream=None,
        skipped_subflows=None,
        destinations: list = None,
        extra_query: str = None,
        **kwargs):
    """
    Can filter a single dataframe beforehand
    (hence call it several times for several dataframes).

    Feel free to inherit/override this class.

    Args:
        rawdf: Raw dataframe
        kwargs: expanded arguments returned by the parser
        destinations: Filters packets depending on their :enum:`.ConnectionRoles`
        tcpstream/mptcpstream: keep only the packets related to the given stream
        skipped_subflows: list of skipped subflows
        extra_query: Add some more filters to the pandas query

    This baseclass can filter on:

    - mptcpstream
    - destination (mptcpstream required)
    - skipped_subflows

    Returns:
        Filtered dataframe
    """
    log.debug("Preprocessing dataframe with extra args %s" % kwargs)
    queries = []
    log.debug("tcp.stream %d mptcp: %d" % (tcpstream, mptcpstream))
    stream = tcpstream if tcpstream is not None else mptcpstream
    dataframe = rawdf

    for skipped_subflow in skipped_subflows or []:
        log.debug("Skipping subflow %d" % skipped_subflow)
        queries.append(" tcpstream!=%d " % skipped_subflow)

    if stream is not None:
        protocol = "mptcp" if mptcpstream is not None else "tcp"
        log.debug("Filtering %s stream #%d." % (protocol, stream))
        queries.append(protocol + "stream==%d" % stream)

        if protocol == "tcp":
            # generates the "tcpdest" component of the dataframe
            con2 = TcpConnection.build_from_dataframe(dataframe, stream)
            dataframe = tcpdest_from_connections(dataframe, con2)
            # trust plots to do the filtering
            # if destinations is not []:
            #     queries.append(protocol + "dest==%d" % stream)
        else:
            # todo shall do the same for mptcp destinations
            con = MpTcpConnection.build_from_dataframe(dataframe, stream)
            # mptcpdest = main_connection.mptcp_dest_from_tcpdest(tcpdest)
            dataframe = mptcpdest_from_connections(dataframe, con)
            # TODO generate mptcpdest
            # if protocol == "mptcp":
            if destinations is not None:
                # destination filtering is not ready yet for mptcp; it should
                # eventually generate a direction query along these lines:
                # con = MpTcpConnection.build_from_dataframe(dataframe, stream)
                # q = con.generate_direction_query(destination)
                # queries.append(q)
                raise Exception(
                    "destination filtering is not ready yet for mptcp")
    if extra_query:
        log.debug("Appending extra_query=%s" % extra_query)
        queries.append(extra_query)

    query = " and ".join(queries)

    # throws when querying with an empty query
    if len(query) > 0:
        log.info("Running query:\n%s\n" % query)
        dataframe.query(query, inplace=True)

    return dataframe
Example #17
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,  # Union[MpTcpStreamId, TcpStreamId],
        streamid2: int,
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp

        mapping_mode: Only HASH works for now

    Returns
        a dataframe with columns... owd ?
    """
    log.debug(
        "Asked to load merged streams %d and %d from pcaps %s and %s" %
        (streamid1, streamid2, pcap1, pcap2))

    cache = mp.get_cache()
    protocolStr = "mptcp" if mptcp else "tcp"

    cacheid = cache.cacheuid(
        "merged", [
            getrealpath(pcap1),
            getrealpath(pcap2),
        ], protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, streamid2)

                # TODO generate
                # map_mptcp_connection()

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = TcpConnection.build_from_dataframe(
                    df2, streamid2)

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            logging.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # here we lost the dtype: tcpdest became an object
            log.debug("saving with dtypes=%s", dict(merged_df.dtypes))
            # print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))

            # if log level >= DEBUG then save to xls too !
            # if True:
            #     filename = cachename + ".xls"
            #     logging.debug("Saved a debug excel copy at %s" % filename)
            #     merged_df.to_excel(filename)

        else:
            logging.info("Loading from cache %s" % cachename)

            # dtypes = {k: v for k, v in temp.items() if v is not None or k not in ["tcpflags"]}

            def _gen_dtypes(fields) -> Dict[str, Any]:
                dtypes = {}  # type: ignore
                for _name in [_first, _second]:

                    # TODO this could be simplified
                    for k, v in fields.items():
                        if v is not None or k not in ["tcpflags"]:
                            dtypes.setdefault(_name(k), v)

                    # add generated field dtypes
                    dtypes.update({
                        _name(f.fullname): f.type
                        for f in per_pcap_artificial_fields.values()
                    })

                # these are overrides from the generated dtypes
                dtypes.update({
                    # during the merge, we join even unmapped packets so some entries
                    # may be empty => float64
                    _first("packetid"):
                    np.float64,
                    _second("packetid"):
                    np.float64,
                })

                return dtypes

            def _gen_converters() -> Dict[str, Callable]:

                # converters = {}   # type: Dict[str, Any]
                fields = dict(tshark_config.fields)
                fields.update(per_pcap_artificial_fields)
                converters = {}
                # no need to convert tcpflags
                default_converters = {
                    name: f.converter
                    for name, f in fields.items()
                    if f.converter and name != "tcpflags"
                }
                # converters.update({ name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
                for name, converter in default_converters.items():
                    converters.update({
                        _first(name): converter,
                        _second(name): converter
                    })

                return converters

            with open(cachename) as fd:
                dtypes = _gen_dtypes({
                    name: field.type
                    for name, field in tshark_config.fields.items()
                })
                converters = _gen_converters()
                # more recent versions can do without it
                # pd.set_option('display.max_rows', 200)
                # pd.set_option('display.max_colwidth', -1)
                # print("converters=", converters)
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=dtypes,  # poping still generates
                    converters=converters,
                )

                # log.debug("Column names after loading from cache: %s", merged_df.columns)

                # TODO:
                # No columns to parse from file

        # we fix the clocks a posteriori so that the cache is still usable

        logging.debug("Postprocessing clock if needed")
        merged_df[_first('abstime')] += clock_offset1
        merged_df[_second('abstime')] += clock_offset2

        logging.debug("Converting dataframes to be sender/receiver based...")
        # in both cases
        # TODO here we should attribute the definite mptcprole
        # compute owd
        if mptcp:
            log.debug("Should be merging OWDs")
            log.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
        else:
            # tcp: this is where we correct the times and rename the
            # host1/host2 columns to _sender/_receiver
            res = convert_to_sender_receiver(merged_df)

            # don't do it here else we might repeat it
            # data["abstime"] += clock_offset

        logging.debug("Computing owds")
        log.debug("Column names: %s", res.columns)
        log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
        print("res=")
        # TODO we don't necessarily need to generate the OWDs here, might be put out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]
        with pd.option_context('float_format', '{:f}'.format):
            print(res[_sender(["ipsrc", "ipdst", "abstime"]) +
                      _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS +
                      ["owd"]])

    except Exception:
        logging.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    # log.debug("Dtypes after load:%s\n" % pp.pformat(merged_df.dtypes))
    log.info("Finished loading. merged dataframe size: %d" % len(merged_df))

    return res