Example #1
        def _print_reinjection_comparison(original_packet, reinj, ):
            """
            Expects tuples of original and reinjection packets
            """
            # original_packet  = sender_df.loc[ sender_df.packetid == initial_packetid, ].iloc[0]
            row = reinj

            reinjection_packetid = getattr(row, _sender("packetid"))
            reinjection_start    = getattr(row, _sender("abstime"))
            reinjection_arrival  = getattr(row, _receiver("abstime"))
            original_start       = original_packet[_sender("abstime")]
            original_arrival     = original_packet[_receiver("abstime")]

            if not reinj.redundant:
                # print(original_packet["packetid"])
                msg = ("packet {pktid} is a successful reinjection of {initial_packetid}."
                        " It arrived at {reinjection_arrival} to compare with {original_arrival}"
                        " while being transmitted at {reinjection_start} to compare with "
                        "{original_start}, i.e., {reinj_delta} before")
                # TODO use assert instead
                if getattr(row, _receiver("abstime")) > original_packet[ _receiver("abstime") ]:
                    print("BUG: this is not a valid reinjection after all ?")

            elif args.failed:
                # only report failed reinjections when --failed is passed
                msg = "packet {pktid} is a failed reinjection of {initial_packetid}."
            else:
                return

            msg = msg.format(
                pktid               = reinjection_packetid,
                initial_packetid    = initial_packetid,

                reinjection_start   = reinjection_start,
                reinjection_arrival = reinjection_arrival,
                original_start      = original_start,
                original_arrival    = original_arrival,
                reinj_delta         = reinj.reinj_delta,
            )
            self.poutput(msg)
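
The helper reads the reinjection row with getattr() (itertuples yields namedtuples) and the original packet with [] (it is a pandas Series). Below is a minimal, self-contained sketch of that access pattern; the _sender/_receiver helpers are simplified stand-ins for mptcpanalyzer's own, which build the suffixed column names:

import pandas as pd

# simplified stand-ins for mptcpanalyzer's _sender()/_receiver() helpers
def _sender(col):
    return col + "_sender"

def _receiver(col):
    return col + "_receiver"

df = pd.DataFrame({
    _sender("packetid"): [1, 4],
    _sender("abstime"): [0.10, 0.35],
    _receiver("abstime"): [0.20, 0.45],
})

original = df.iloc[0]                      # a Series: indexed with []
for row in df.itertuples(index=False):     # a namedtuple: read with getattr()
    print(getattr(row, _sender("packetid")),
          getattr(row, _receiver("abstime")) - original[_receiver("abstime")])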
Example #2
    def plot(self, pcap, protocol, **kwargs):
        """
        Ideally it should be mapped automatically
        For now plots only one direction but there could be a wrapper to plot forward owd, then backward OWDs
        Disclaimer: Keep in mind this assumes a perfect synchronization between nodes, i.e.,
        it relies on the pcap absolute time field.
        While this is true in discrete time simulators such as ns3

        """
        fig = plt.figure()
        axes = fig.gca()
        res = pcap
        res[_sender("abstime")] = pd.to_datetime(res[_sender("abstime")],
                                                 unit="s")

        # TODO here we should rewrite
        debug_fields = _sender(TCP_DEBUG_FIELDS) + _receiver(
            TCP_DEBUG_FIELDS) + ["owd"]

        print("columns", pcap)
        print("columns", res.columns)
        print("info", res.info())
        print(res.loc[res._merge == "both", debug_fields])

        df = res

        print("STARTING LOOP")
        print("DESTINATION=%r" % kwargs.get("pcapdestinations", []))
        # df= df[df.owd > 0.010]

        fields = [
            "tcpdest",
            "tcpstream",
        ]
        # if True:
        if protocol == "mptcp":
            self.plot_mptcp(df, fig, fields, **kwargs)
        else:
            self.plot_tcp(df, fig, fields, **kwargs)

        # TODO add units
        axes.set_xlabel("Time (s)")
        axes.set_ylabel("One Way Delay (s)")

        self.title = "One Way Delays for {} streams {} <-> {} {dest}".format(
            protocol,
            kwargs.get("pcap1stream"),
            kwargs.get("pcap2stream"),
            dest="")

        return fig
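
Before plotting, the sender abstime column is converted from epoch seconds to pandas datetimes. A tiny sketch of that conversion on made-up timestamps:

import pandas as pd

# hypothetical absolute timestamps in seconds, as found in a pcap "abstime" column
abstime = pd.Series([1500000000.0, 1500000000.25])
print(pd.to_datetime(abstime, unit="s"))   # datetime64[ns]: 2017-07-14 02:40:00, 02:40:00.250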
Example #3
    def plot(self, pcap, protocol, **kwargs):
        """
        Ideally it should be mapped automatically
        For now plots only one direction but there could be a wrapper to plot forward owd, then backward OWDs
        Disclaimer: Keep in mind this assumes a perfect synchronization between nodes, i.e.,
        it relies on the pcap absolute time field.
        While this is true in discrete time simulators such as ns3

        """
        fig = plt.figure()
        axes = fig.gca()
        res = pcap
        destinations = kwargs.get("pcap_destinations", [])
        # should already be done
        # res[_sender("abstime")] = pd.to_datetime(res[_sender("abstime")], unit="s")

        # TODO here we should rewrite
        debug_fields = _sender(TCP_DEBUG_FIELDS) + _receiver(TCP_DEBUG_FIELDS) + ["owd"]

        # print("columns", pcap)
        debug_dataframe(res, "owd dataframe")
        # print(res.loc[res.merge_status == "both", debug_fields])

        df = res

        # print("DESTINATION=%r" % destinations)
        # df= df[df.owd > 0.010]

        fields = ["tcpdest", "tcpstream", ]
        # if True:
        # TODO: use Protocol.MPTCP:
        if protocol == "mptcp":
            self.plot_mptcp(df, fig, fields, **kwargs)
        elif protocol == "tcp":
            self.plot_tcp(df, fig, fields, **kwargs)
        else:
            raise Exception("Unsupported protocol %r" % protocol)


        self.title_fmt = "One Way Delays for {protocol}"
        if len(destinations) == 1:
            self.title_fmt = self.title_fmt + " towards {dest}"

        self.title_fmt = self.title_fmt.format(
            protocol=protocol,
            # kwargs.get("pcap1stream"),
            # kwargs.get("pcap2stream"),
            # guard against an empty destination list; str.format ignores the
            # extra "dest" keyword when "{dest}" is absent from the template
            dest=destinations[0].to_string() if destinations else ""
        )

        return fig
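
Note that str.format() silently ignores extra keyword arguments, which is why dest= can be passed unconditionally whether or not " towards {dest}" was appended to the template:

title_fmt = "One Way Delays for {protocol}"
# "dest" is simply ignored because "{dest}" is not in the template
print(title_fmt.format(protocol="tcp", dest="Server"))   # One Way Delays for tcp

title_fmt += " towards {dest}"
print(title_fmt.format(protocol="tcp", dest="Server"))   # One Way Delays for tcp towards Server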
Example #4
    def do_qualify_reinjections(self, args, unknown):
        """
        test with:
            mp qualify_reinjections 0

        TODO move the code into a proper function
        """
        # TODO this should be done automatically right ?
        df_all = load_merged_streams_into_pandas(
            args.pcap1,
            args.pcap2,
            args.pcap1stream,
            args.pcap2stream,
            mptcp=True,
            tshark_config=self.tshark_config
        )

        # adds a redundant column
        df = classify_reinjections(df_all)

        # print(df_all[ pd.notnull(df_all[_sender("reinjection_of")])] [
        #     _sender(["reinjection_of", "reinjected_in", "packetid", "reltime"]) +
        #     _receiver(["packetid", "reltime"])
        # ])

        # to help debug
        # df.to_excel("temp.xls")

        def _print_reinjection_comparison(original_packet, reinj, ):
            """
            Expects tuples of original and reinjection packets
            """
            # original_packet  = sender_df.loc[ sender_df.packetid == initial_packetid, ].iloc[0]
            row = reinj

            reinjection_packetid = getattr(row, _sender("packetid"))
            reinjection_start    = getattr(row, _sender("abstime"))
            reinjection_arrival  = getattr(row, _receiver("abstime"))
            original_start       = original_packet[_sender("abstime")]
            original_arrival     = original_packet[_receiver("abstime")]

            if not reinj.redundant:
                # print(original_packet["packetid"])
                msg = ("packet {pktid} is a successful reinjection of {initial_packetid}."
                        " It arrived at {reinjection_arrival} to compare with {original_arrival}"
                        " while being transmitted at {reinjection_start} to compare with "
                        "{original_start}, i.e., {reinj_delta} before")
                # TODO use assert instead
                if getattr(row, _receiver("abstime")) > original_packet[ _receiver("abstime") ]:
                    print("BUG: this is not a valid reinjection after all ?")

            elif args.failed:
                # only report failed reinjections when --failed is passed
                msg = "packet {pktid} is a failed reinjection of {initial_packetid}."
            else:
                return

            msg = msg.format(
                pktid               = reinjection_packetid,
                initial_packetid    = initial_packetid,

                reinjection_start   = reinjection_start,
                reinjection_arrival = reinjection_arrival,
                original_start      = original_start,
                original_arrival    = original_arrival,
                reinj_delta         = reinj.reinj_delta,
            )
            self.poutput(msg)


        # with pd.option_context('display.max_rows', None, 'display.max_columns', 300):
        #     print(reinjected_packets[["packetid", "packetid_receiver", *_receiver(["reinjected_in", "reinjection_of"])]].head())
        # TODO filter depending on --failed and --destinations

        if args.csv:
            self.pfeedback("Exporting to csv")
            # keep redundant
            # only export a subset ?
            # for 
            # df1 = df[['a','d']]
            # smalldf = df.drop()
            columns = _sender(["abstime", "reinjection_of", "reinjected_in", "packetid", "tcpstream", "mptcpstream", "tcpdest", "mptcpdest"])
            columns += _receiver(["abstime", "packetid"])
            columns += ["redundant", "owd", "reinj_delta"]

            df[columns].to_csv(
                self.stdout,
                sep="|",
                index=False,
                header=True,
            )
            return

        for destination in ConnectionRoles:

            if args.destinations and destination not in args.destinations:
                log.debug("ignoring destination %s " % destination)
                continue

            self.poutput("looking for reinjections towards mptcp %s" % destination)
            sender_df = df[df.mptcpdest == destination]
            log.debug("%d reinjections in that direction" % (len(sender_df), ))

            # TODO we now need to display successful reinjections
            reinjections = sender_df[pd.notnull(sender_df[_sender("reinjection_of")])]

            successful_reinjections = reinjections[reinjections.redundant == False]

            self.poutput("%d successful reinjections" % len(successful_reinjections))
            # print(successful_reinjections[ _sender(["packetid", "reinjection_of"]) + _receiver(["packetid"]) ])

            for row in reinjections.itertuples(index=False):

                # loc ? this is an array, sort it and take the first one ?
                initial_packetid = row.reinjection_of[0]
                # print("initial_packetid = %r %s" % (initial_packetid, type(initial_packetid)))

                original_packet  = df_all.loc[df_all.packetid == initial_packetid].iloc[0]
                # print("original packet = %r %s" % (original_packet, type(original_packet)))

                # if row.redundant == True and args.failed:
                    # _print_failed_reinjection(original_packet, row, debug=args.debug)

                _print_reinjection_comparison(original_packet, row, )
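
The loop above narrows the packets down in two steps: rows whose reinjection_of is set, then those not flagged redundant. A minimal sketch of that filtering on a made-up frame (column contents are illustrative only):

import pandas as pd

# toy frame: reinjection_of holds a list of original packet ids, or None/NaN
sender_df = pd.DataFrame({
    "packetid": [1, 2, 3],
    "reinjection_of": [None, [1], [1]],
    "redundant": [False, False, True],
})

reinjections = sender_df[pd.notnull(sender_df["reinjection_of"])]
successful_reinjections = reinjections[reinjections.redundant == False]  # noqa: E712
print(len(reinjections), len(successful_reinjections))   # 2 1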
Example #5
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,  # Union[MpTcpStreamId, TcpStreamId],
        streamid2: int,
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp

        mapping_mode: Only HASH works for now

    Returns
        a dataframe with columns... owd ?
    """
    log.debug(
        "Asked to load merged tcp streams %d and %d from pcaps %s and %s" %
        (streamid1, streamid2, pcap1, pcap2))

    cache = mp.get_cache()
    protocolStr = "mptcp" if mptcp else "tcp"

    cacheid = cache.cacheuid(
        "merged", [
            getrealpath(pcap1),
            getrealpath(pcap2),
        ], protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, streamid2)

                # TODO generate
                # map_mptcp_connection()

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = TcpConnection.build_from_dataframe(
                    df2, streamid2)

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            logging.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # here we lose the dtype: tcpdest has become a plain object
            print("saving with dtypes=", dict(merged_df.dtypes))
            # print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))

            # if log level >= DEBUG then save to xls too !
            # if True:
            #     filename = cachename + ".xls"
            #     logging.debug("Saved a debug excel copy at %s" % filename)
            #     merged_df.to_excel(filename)

        else:
            logging.info("Loading from cache %s" % cachename)

            # dtypes = {k: v for k, v in temp.items() if v is not None or k not in ["tcpflags"]}

            def _gen_dtypes(fields) -> Dict[str, Any]:
                dtypes = {}  # type: ignore
                for _name in [_first, _second]:

                    # TODO this could be simplified
                    for k, v in fields.items():
                        if v is not None or k not in ["tcpflags"]:
                            dtypes.setdefault(_name(k), v)

                    # add generated field dtypes
                    dtypes.update({
                        _name(f.fullname): f.type
                        for f in per_pcap_artificial_fields.values()
                    })

                # these are overrides from the generated dtypes
                dtypes.update({
                    # during the merge, we join even unmapped packets so some entries
                    # may be empty => float64
                    _first("packetid"):
                    np.float64,
                    _second("packetid"):
                    np.float64,
                })

                return dtypes

            def _gen_converters() -> Dict[str, Callable]:

                # converters = {}   # type: Dict[str, Any]
                fields = dict(tshark_config.fields)
                fields.update(per_pcap_artificial_fields)
                converters = {}
                # no need to convert tcpflags
                default_converters = {
                    name: f.converter
                    for name, f in fields.items()
                    if f.converter and name != "tcpflags"
                }
                # converters.update({ name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
                for name, converter in default_converters.items():
                    converters.update({
                        _first(name): converter,
                        _second(name): converter
                    })

                return converters

            with open(cachename) as fd:
                dtypes = _gen_dtypes({
                    name: field.type
                    for name, field in tshark_config.fields.items()
                })
                converters = _gen_converters()
                # more recent versions can do without it
                # pd.set_option('display.max_rows', 200)
                # pd.set_option('display.max_colwidth', -1)
                # print("converters=", converters)
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=dtypes,  # popping still generates
                    converters=converters,
                )

                # log.debug("Column names after loading from cache: %s", merged_df.columns)

                # TODO:
                # No columns to parse from file

        # we fix the clocks a posteriori so that the cache is still usable

        logging.debug("Postprocessing clock if needed")
        merged_df[_first('abstime')] += clock_offset1
        merged_df[_second('abstime')] += clock_offset2

        logging.debug("Converting dataframes to be sender/receiver based...")
        # in both cases
        # TODO here we should attribute the definite mptcprole
        # compute owd
        if mptcp:
            print("Should be merging OWDs")
            logging.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
        else:
            # tcp
            # this is where we correct the times
            # we rename the host1/host2 columns to _sender/_receiver
            res = convert_to_sender_receiver(merged_df)

            # don't do it here else we might repeat it
            # data["abstime"] += clock_offset

        logging.debug("Computing owds")
        log.debug("Column names: %s", res.columns)
        log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
        print("res=")
        # TODO we don't necessarely need to generate the OWDs here, might be put out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]
        # .head(40))
        with pd.option_context('float_format', '{:f}'.format):
            print(res[_sender(["ipsrc", "ipdst", "abstime"]) +
                      _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS +
                      ["owd"]])

    except Exception:
        logging.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    # log.debug("Dtypes after load:%s\n" % pp.pformat(merged_df.dtypes))
    log.info("Finished loading. merged dataframe size: %d" % len(merged_df))

    return res
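
When reloading from the cache, dtypes and converters are passed explicitly so column types survive the CSV round trip (packet ids stay floats because unmapped packets leave holes). A self-contained sketch of the same idea with made-up columns:

import io
import pandas as pd

df = pd.DataFrame({"packetid": [1.0, 2.0], "tcpflags": ["0x18", "0x10"]})

buf = io.StringIO()
df.to_csv(buf, sep="|", index=False)
buf.seek(0)

reloaded = pd.read_csv(
    buf,
    sep="|",
    dtype={"packetid": "float64"},                  # NaN-able packet ids stay floats
    converters={"tcpflags": lambda s: int(s, 16)},  # parse hex flags back to ints
)
print(reloaded.dtypes.to_dict())   # {'packetid': float64, 'tcpflags': int64}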
Example #6
def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
    """
    here the idea is to look at reinjections on the receiver side, see which one is first
    packets with reinjected_in_receiver are (at least they should) be the first DSN arrived.

    Returns
        a new dataframe with an added column "redundant"
    """

    df_all["redundant"] = False
    df_all["reinj_delta"] = np.nan

    # rename to df_both ?
    df = df_all[df_all._merge == "both"]

    # print(df_all[ pd.notnull(df_all[_sender("reinjection_of")])] [
    #     _sender(["reinjection_of", "reinjected_in", "packetid", "reltime"]) +
    #     _receiver(["packetid", "reltime"])
    # ])

    for destination in ConnectionRoles:

        sender_df = df[df.mptcpdest == destination]

        # print(sender_df[ sender_df.reinjected_in.notna() ][["packetid", "reinjected_in"]])
        # print("successful reinjections" % len(reinjected_in))

        # select only packets that have been reinjected

        # print("%d sender_df packets" % len(sender_df))
        # print(sender_df["reinjection_of"])
        reinjected_packets = sender_df.dropna(
            axis='index', subset=[_sender("reinjection_of")])

        logging.debug("%d reinjected packets" % len(reinjected_packets))
        # with pd.option_context('display.max_rows', None, 'display.max_columns', 300):
        #     print(reinjected_packets[
        #         _sender(["packetid", "reinjected_in", "reinjection_of"]) + _receiver(["reinjected_in", "reinjection_of"])
        #         ].head())

        for reinjection in reinjected_packets.itertuples():
            # here we look at all the reinjected packets

            # print("full reinjection %r" % (reinjection,))

            # if there are packets in _receiver(reinjected_in), it means the reinjections
            # arrived before other similar segments and thus these segments are useless
            # it should work because
            # useless_reinjections = getattr(reinjection, _receiver("reinjected_in"), [])

            # if it was correctly mapped
            # note: itertuples() renames columns starting with an underscore to
            # positional names, which is why "_merge" shows up as "_1" here
            if reinjection._1 != "both":
                # TODO count missed classifications ?
                log.debug("reinjection %d could not be mapped, giving up..." %
                          (reinjection.packetid))
                continue

            # print("%r" % reinjection.reinjection_of)
            initial_packetid = reinjection.reinjection_of[0]
            # print("initial_packetid = %r %s" % (initial_packetid, type(initial_packetid)))

            original_packet = df_all.loc[df_all.packetid ==
                                         initial_packetid].iloc[0]

            if original_packet._merge != "both":
                # TODO count missed classifications ?
                logging.debug(
                    "Original packet %d could not be mapped, giving up..." %
                    (original_packet.packetid))
                continue

            orig_arrival = getattr(original_packet, _receiver("reltime"))
            reinj_arrival = getattr(reinjection, _receiver("reltime"))
            reinj_pktid = getattr(reinjection, _sender("packetid"))

            reinj_delta = orig_arrival - reinj_arrival
            df_all.loc[reinj_pktid, "reinj_delta"] = reinj_delta

            if reinj_delta < 0:
                # print("GOT A MATCH")
                df_all.loc[df_all[_sender("packetid")] == reinjection.packetid,
                           "redundant"] = True
                #TODO set reinj_delta for reinjection.packetid

    return df_all
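
The reinjection._1 access above comes from a pandas detail: itertuples() renames any column whose name starts with an underscore to a positional field name, so "_merge" is not reachable by name. A tiny demonstration:

import pandas as pd

df = pd.DataFrame({"packetid": [42], "_merge": ["both"]})
row = next(df.itertuples(index=False))
print(row._fields)   # ('packetid', '_1') -- "_merge" was renamed positionally
print(row._1)        # both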
Example #7
def convert_to_sender_receiver(df) -> pd.DataFrame:
    """
    Convert dataframe from  X_HOST1 | X_HOST2 to X_SENDER | X_RECEIVER

    each packet has a destination marker
    Assume clocks are fine here !
    """
    log.debug("Converting from host_1/host_2 to sender/receiver format")

    # fill up afterwards
    total = pd.DataFrame()

    for tcpstream, subdf in df.groupby(_first("tcpstream")):

        min_h1 = subdf.iloc[0, subdf.columns.get_loc(_first('abstime'))]
        min_h2 = subdf.iloc[0, subdf.columns.get_loc(_second('abstime'))]

        #         def _rename_columns(h1_role: ConnectionRoles):
        #             """
        # client_suffix, server_suffix
        #             Params:
        #                 client_suffix must be one of HOST1_SUFFIX or HOST2_SUFFIX
        #                 server_suffix can be deduced
        #             """
        def _rename_column(col_name, suffixes) -> str:

            for suffix_to_replace, new_suffix in suffixes.items():
                if col_name.endswith(suffix_to_replace):
                    return col_name.replace(suffix_to_replace, new_suffix)
            return col_name

            # total = pd.concat([total, subdf], ignore_index=True)

        log.debug(f"Comparing {min_h1} (h1) with {min_h2} (h2)")

        assert min_h1 != min_h2, (
            f"Same sending {min_h1} and receiving time {min_h2}."
            "Either the clock is not precise enough or it's a bug"
            " (more likely)")
        if min_h1 < min_h2:
            log.debug("Looks like h1 is the tcp client")
            # suffixes = { HOST1_SUFFIX: SENDER_SUFFIX, HOST2_SUFFIX: RECEIVER_SUFFIX }
            h1_role = ConnectionRoles.Client

        else:
            if min_h1 == min_h2:
                log.warning("there is an issue")

            log.debug("Looks like h2 is the tcp client")
            h1_role = ConnectionRoles.Server

        # _rename_columns(role)
        for tcpdest, tdf in subdf.groupby(_first("tcpdest"), sort=False):
            if tcpdest == h1_role:
                suffixes = {
                    HOST2_SUFFIX: SENDER_SUFFIX,
                    HOST1_SUFFIX: RECEIVER_SUFFIX
                }
            else:
                suffixes = {
                    HOST1_SUFFIX: SENDER_SUFFIX,
                    HOST2_SUFFIX: RECEIVER_SUFFIX
                }

            log.debug("suffixes: %s" % suffixes)
            rename_func = functools.partial(_rename_column, suffixes=suffixes)
            log.log(mp.TRACE, "renaming inplace")

            log.debug("total df size = %d" % len(total))
            with pd.option_context('precision', 20):
                debug_cols = _first(["abstime", "tcpdest"]) + _second(
                    ["abstime", "tcpdest"])
                log.log(mp.TRACE, "before rename \n%s", tdf[debug_cols])
                tdf = tdf.rename(columns=rename_func, copy=True, inplace=False)

                debug_cols = _sender(["abstime", "tcpdest"]) + _receiver(
                    ["abstime", "tcpdest"])
                log.log(mp.TRACE, "After rename \n%s" % tdf[debug_cols])
                # print(tdf[debug_cols])
                # debug_dataframe(tdf, "temporary dataframe")
                total = pd.concat(
                    [total, tdf],
                    ignore_index=True,
                    sort=False,
                )
                # print("total df size = %d" % len(total))

        # subdf[ _first("tcpdest") == ConnectionRole.Client] .rename(columns=_rename_cols, inplace=True)
        # print(subdf.columns)
        # print(total.columns)
    # debug_dataframe(total, "total")

    log.debug("Converted to sender/receiver format")
    log.log(mp.TRACE, "Comparing #unique entries %d vs #all %d",
            total[_sender("abstime")].count(), len(total[_sender("abstime")]))
    # assert total[_sender("abstime")].count() == len(total[_sender("abstime")])
    return total
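
The conversion boils down to a plain df.rename(columns=...) with a partial over the suffix mapping. A self-contained sketch with assumed suffix spellings (the real HOST1_SUFFIX/SENDER_SUFFIX constants come from mptcpanalyzer and may differ):

import functools
import pandas as pd

HOST1_SUFFIX, HOST2_SUFFIX = "_h1", "_h2"                 # assumed spellings
SENDER_SUFFIX, RECEIVER_SUFFIX = "_sender", "_receiver"   # assumed spellings

def _rename_column(col_name, suffixes):
    for old, new in suffixes.items():
        if col_name.endswith(old):
            return col_name.replace(old, new)
    return col_name

df = pd.DataFrame({"abstime_h1": [0.1], "abstime_h2": [0.2]})
suffixes = {HOST1_SUFFIX: SENDER_SUFFIX, HOST2_SUFFIX: RECEIVER_SUFFIX}
rename_func = functools.partial(_rename_column, suffixes=suffixes)
print(df.rename(columns=rename_func).columns.tolist())
# ['abstime_sender', 'abstime_receiver']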
Example #8
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,
        streamid2: int,
        # TODO changed to protocol
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp
        mapping_mode: Only HASH works for now
        clock_offset: untested

    Returns
        a dataframe with columns... owd ?
    """
    protocolStr = "mptcp" if mptcp else "tcp"
    log.debug(f"Asked to load {protocolStr} merged streams {streamid1} and "
              "{streamid2} from pcaps {pcap1} and {pcap2}")

    cache = mp.get_cache()

    cacheid = cache.cacheuid(
        "merged", [getrealpath(pcap1), getrealpath(pcap2)],
        protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, MpTcpStreamId(streamid1))
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, MpTcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, TcpStreamId(streamid1))
                other_connection = TcpConnection.build_from_dataframe(
                    df2, TcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            log.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            # or abstime ?

            # TODO rechange the flags hex()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # tcpdest had become an object dtype instead of a CategoricalDtype
            # see https://github.com/pandas-dev/pandas/issues/22361
            log.log(mp.TRACE, "saving with dtypes=%s", dict(merged_df.dtypes))

        else:
            log.info("Loading from cache %s", cachename)

            date_cols = get_date_cols(tshark_config.fields)

            with open(cachename) as fd:
                # generate fieldlist
                def _gen_fields(fields):
                    gfields = {}  # type: ignore
                    for _name in [_first, _second]:
                        gfields.update(
                            {_name(k): v
                             for k, v in fields.items()})
                    return gfields

                # reltime discarded on save ?
                tshark_config.fields.pop("reltime")
                gfields = _gen_fields(tshark_config.fields)
                merge_dtypes = get_dtypes(gfields)
                # log.log(mp.TRACE, "Using gfields %s" % pp.pformat(gfields))

                # we don't need any converters
                converters = {}
                date_cols = get_date_cols(gfields)

                log.log(mp.TRACE, "Using date_cols %s" % pp.pformat(date_cols))
                log.log(mp.TRACE, "Using dtypes %s" % pp.pformat(merge_dtypes))
                # log.log(mp.TRACE, "Using converters %s" % (pp.pformat(converters)))
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=merge_dtypes,  # popping still generates
                    converters=converters,
                    # date_parser=date_converter,
                    parse_dates=date_cols,
                )
                # at this stage, destinations are nan

                debug_fields = ["abstime", "tcpstream", "tcpdest", "mptcpdest"]
                mptcpanalyzer.debug.debug_dataframe(
                    merged_df,
                    "Merged dataframe",
                    usecols=(_first(debug_fields) + _second(debug_fields)))

                # workaround bug https://github.com/pandas-dev/pandas/issues/25448
                def _convert_to_enums():
                    # per_pcap_artificial_fields
                    for col in [
                            _first("tcpdest"),
                            _first("mptcpdest"),
                            _second("tcpdest"),
                            _second("mptcpdest")
                    ]:
                        merged_df[col] = merged_df[col].apply(
                            _convert_role, convert_dtype=False)

        # we fix the clocks a posteriori so that the cache is still usable
        log.debug("Postprocessing clock if needed")
        # merged_df[_first('abstime')] += clock_offset1
        # merged_df[_second('abstime')] += clock_offset2

        log.debug("Converting dataframes to be sender/receiver based...")

        # in both cases
        # TODO here we should attribute the definite mptcprole
        if mptcp:
            log.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
            # fill MPTCP dest ?
        else:
            # tcp
            res = convert_to_sender_receiver(merged_df)

        # log.debug("Sorting by sender abstime")
        # merged_df.sort_values(by=_sender("abstime"), ascending=True, inplace=True)
        # debug_dataframe(res, "checking merge", usecols=["merge_status"])
        # print("%d nan values" % len(res[res.merge_status == np.nan]))

        log.debug("Computing owds")

        debug_dataframe(res, "before owds")
        # TODO we don't necessarely need to generate the OWDs here, might be put out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]

        debug_dataframe(
            res,
            "owd",
            usecols=["owd", _sender('abstime'),
                     _receiver('abstime')])
        # with pd.option_context('float_format', '{:f}'.format):
        #     print(
        #         res[_sender(["ipsrc", "ipdst", "abstime"])
        #          + _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS + ["owd"] ]
        #     )

    except Exception:
        log.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    log.log(mp.TRACE, "Dtypes after load:%s\n", pp.pformat(res.dtypes))
    log.info("Finished loading. merged dataframe size: %d", len(res))

    return res
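
This version parses the date columns at load time via parse_dates (instead of converters), which is also why the OWD ends up as a Timedelta. A self-contained sketch with assumed column names:

import io
import pandas as pd

csv = "abstime_h1|abstime_h2\n2020-01-01 00:00:00.10|2020-01-01 00:00:00.25\n"
df = pd.read_csv(
    io.StringIO(csv),
    sep="|",
    parse_dates=["abstime_h1", "abstime_h2"],   # assumed date column names
)
print(df.dtypes.to_dict())                            # both datetime64[ns]
print((df["abstime_h2"] - df["abstime_h1"]).iloc[0])  # 0 days 00:00:00.150000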
Example #9
def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
    """
    look at reinjections on the receiver side, see which one is first
    packets with reinjected_in_receiver are (at least they should) be the first DSN arrived.

    Returns:
        a new dataframe with an added column "redundant" and "time_delta"
    """
    log.info("Classifying reinjections")

    if df_all.merged.already_classified():
        log.debug("Already classified, aborting")
        return df_all

    df_all = df_all.assign(redundant=False, reinj_delta=np.nan)

    df = df_all[df_all.merge_status == "both"]

    # print(df_all[ pd.notnull(df_all[_sender("reinjection_of")])] [
    #     _sender(["reinjection_of", "reinjected_in", "packetid", "reltime"]) +
    #     _receiver(["packetid", "reltime"])
    # ])

    for destination in ConnectionRoles:

        log.debug("Looking at mptcp destination %r", destination)
        sender_df = df[df.mptcpdest == destination]

        # print(sender_df[ sender_df.reinjected_in.notna() ][["packetid", "reinjected_in"]])
        # print("successful reinjections" % len(reinjected_in))

        # select only packets that have been reinjected

        # debug_dataframe(sender_df, "reinjections", usecols=["reinjection_of"])
        reinjected_packets = sender_df.dropna(
            axis='index', subset=[_sender("reinjection_of")])

        log.debug("%d reinjected packets", len(reinjected_packets))
        # with pd.option_context('display.max_rows', None, 'display.max_columns', 300):
        #     print(reinjected_packets[
        #         _sender(["packetid", "reinjected_in", "reinjection_of"])
        #          + _receiver(["reinjected_in", "reinjection_of"])
        #         ].head())

        for reinjection in reinjected_packets.itertuples():
            # here we look at all the reinjected packets

            # print("full reinjection %r" % (reinjection,))

            # if there are packets in _receiver(reinjected_in), it means the reinjections
            # arrived before other similar segments and thus these segments are useless
            # it should work because
            # useless_reinjections = getattr(reinjection, _receiver("reinjected_in"), [])

            # if it was correctly mapped
            if reinjection.merge_status != "both":
                log.log(mp.TRACE,
                        "reinjection %d could not be mapped, giving up...",
                        reinjection.packetid)
                continue

            # print("%r" % reinjection.reinjection_of)
            initial_packetid = reinjection.reinjection_of[0]
            # print("initial_packetid = %r %s" % (initial_packetid, type(initial_packetid)))

            original_packet = df_all.loc[df_all.packetid ==
                                         initial_packetid].iloc[0]

            if original_packet.merge_status != "both":
                # TODO count missed classifications ?
                log.log(
                    mp.TRACE,
                    "Original packet %d could not be mapped, giving up...",
                    original_packet.packetid)
                continue

            orig_arrival = getattr(original_packet, _receiver("reltime"))
            reinj_arrival = getattr(reinjection, _receiver("reltime"))
            reinj_pktid = getattr(reinjection, _sender("packetid"))

            reinj_delta = orig_arrival - reinj_arrival
            df_all.loc[reinj_pktid, "reinj_delta"] = reinj_delta

            if reinj_delta < pd.Timedelta(0):
                # print("GOT A failed reinjection")
                df_all.loc[df_all[_sender("packetid")] == reinjection.packetid,
                           "redundant"] = True
                #TODO set reinj_delta for reinjection.packetid
            else:
                # print("GOT a successful reinjection")
                pass

    return df_all
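
Because reltime is a datetime column in this version, reinj_delta is a Timedelta, hence the comparison against pd.Timedelta(0) rather than a bare 0. A tiny illustration:

import pandas as pd

orig_arrival = pd.Timestamp("2020-01-01 00:00:00.300")
reinj_arrival = pd.Timestamp("2020-01-01 00:00:00.200")

reinj_delta = orig_arrival - reinj_arrival   # a Timedelta, not a float
print(reinj_delta)                           # 0 days 00:00:00.100000
print(reinj_delta < pd.Timedelta(0))         # False -> reinjection arrived first, not redundant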