Example no. 1
def map_tcp_stream(rawdf: pd.DataFrame,
                   main: TcpConnection) -> List[TcpMapping]:
    """
    Returns:
        a sorted list of mappings (tcpconnection, score) with the first one being the most probable
    """

    results = []
    for tcpstream in rawdf["tcpstream"].unique():
        other = TcpConnection.build_from_dataframe(rawdf, tcpstream)
        score = main.score(other)
        if score > float('-inf'):
            mapping = TcpMapping(other, score)
            results.append(mapping)

    # sort by decreasing score so the best candidate comes first
    results.sort(key=lambda x: x[1], reverse=True)

    return results
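Note: the sort above indexes the mapping with x[1], so TcpMapping has to behave like a (connection, score) pair. A minimal sketch of such a type, assuming a NamedTuple-style definition (the real class in mptcpanalyzer may differ):

from typing import NamedTuple

class TcpMapping(NamedTuple):
    # hypothetical sketch of a (connection, score) pair; as a NamedTuple it
    # supports both attribute access and the x[1] indexing used in the sort
    mapped: "TcpConnection"
    score: float

# typical consumption of map_tcp_stream's result:
# mappings = map_tcp_stream(df2, main_connection)
# best = mappings[0].mapped if mappings else None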
Example no. 2
 def do_list_tcp_connections(self, *args):
     """
     List tcp connections via their ids (tcp.stream)
     """
     streams = self.data.groupby("tcpstream")
     self.poutput('%d tcp connection(s)' % len(streams))
     for tcpstream, group in streams:
          # self.list_subflows(mptcpstream)
          # self.data.tcp.connection(tcpstream) is equivalent to the call below
          con = TcpConnection.build_from_dataframe(self.data, tcpstream)
         self.poutput(con)
         self.poutput("\n")
Example no. 3
def tcpdest_from_connections(df, con: TcpConnection) -> pd.DataFrame:
    """Set the 'tcpdest' column for every packet of df belonging to the connection."""
    for dest in ConnectionRoles:

        log.debug("Looking at destination %s" % dest)
        q = con.generate_direction_query(dest)
        df_dest = df.query(q)
        print("tcpdest %r" % dest)
        df.loc[df_dest.index, 'tcpdest'] = dest

    # print(df.tcpdest.head())
    # assert df['tcpdest'].notnull().all(), "every packet should have tcpdest set"
    return df
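The loop above tags each packet with one of the values of ConnectionRoles. That enum is project-internal; a hedged sketch of its likely shape (the member names are an assumption), just to make the iteration concrete:

from enum import Enum, auto

class ConnectionRoles(Enum):
    # hypothetical sketch: the two directions a packet can travel in a connection
    Client = auto()
    Server = auto()

# tcpdest_from_connections then runs one direction query per role and writes
# the matching role into the 'tcpdest' column of those rows.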
Example no. 4
    def do_tocsv(self, args):
        """
        Selects tcp/mptcp/udp connection and exports it to csv
        """

        df = self.data
        # TODO let the parser do it
        # if args.tcpstream:
        #     # df = df[ df.tcpstream == args.tcpstream]

        #     self.poutput("Filtering tcpstream")
        #     con = TcpConnection.build_from_dataframe(df, args.tcpstream)
        #     if args.destination:
        #         self.poutput("Filtering destination")
        #         q = con.generate_direction_query(args.destination)
        #         df = df.query(q)

        # elif args.mptcpstream:
        #     self.poutput("Unsupported yet")
            # df = df[ df.mptcpstream == args.mptcpstream]

        # need to compute the destinations before dropping syn from the dataframe
        # df['tcpdest'] = np.nan;
        for streamid, subdf in df.groupby("tcpstream"):
            con = TcpConnection.build_from_dataframe(df, streamid)
            df = mpdata.tcpdest_from_connections(df, con)

            if args.drop_syn:
                # use subdf ?
                self.poutput("drop-syn is not fully supported yet")
                # drop the 3 first packets (handshake) of each connection;
                # this should eventually become a proper filter
                df.drop(subdf.head(3).index, inplace=True)
                syns = df[df.tcpflags == mp.TcpFlags.SYN]  # currently unused
        #     df = df[ df.flags ]
        # if args.destination:
        #     if args.tcpstream:
                # TODO we should filter destination
        self.poutput("Writing to %s" % args.output)
        pandas_to_csv(df, args.output)
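The drop_syn branch above removes the first three packets of each connection group by group and flags itself as unsupported; the same intent can be expressed as a single vectorized filter. A sketch, assuming the handshake really is the first three rows of every tcpstream group:

# hypothetical refactor of the drop_syn branch: remove the 3-way handshake
# (the first three packets) of every TCP stream in one pass
handshake_idx = df.groupby("tcpstream").head(3).index
df = df.drop(handshake_idx)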
Example no. 5
    def do_map_tcp_connection(self, args):
        """Map a TCP connection of pcap1 onto candidate connections in pcap2."""

        df1 = load_into_pandas(args.pcap1, self.tshark_config)
        df2 = load_into_pandas(args.pcap2, self.tshark_config)

        main_connection = TcpConnection.build_from_dataframe(df1, args.tcpstreamid)

        mappings = map_tcp_stream(df2, main_connection)

        self.poutput("Trying to map %s" % (main_connection,))
        self.poutput("%d mapping(s) found" % len(mappings))

        for match in mappings:

            # formatted_output = main.format_mapping(match)
            # output = "{c1.tcpstreamid} <-> {c2.tcpstreamid} with score={score}"
            # formatted_output = output.format(
            #     c1=main_connection,
            #     c2=match,
            #     score=score
            # )
            # print(formatted_output)
            self.poutput("%s" % str(match))
Example no. 6
 def connection(self, streamid) -> TcpConnection:
     # if tcpdest is None:
     #     tcpdest = list(mp.ConnectionRoles)
     return TcpConnection.build_from_dataframe(self._obj, streamid)
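This connection helper reads the dataframe from self._obj, which is the pandas custom-accessor convention (the same pattern appears again in Example no. 8); it is what makes the self.data.tcp.connection(tcpstream) call in Example no. 2 work. A minimal sketch of how such an accessor can be registered (the decorator is standard pandas; the "tcp" name and class layout are assumptions based on the snippets):

import pandas as pd

@pd.api.extensions.register_dataframe_accessor("tcp")
class TcpAccessor:
    def __init__(self, pandas_obj: pd.DataFrame):
        # pandas hands the wrapped dataframe to the accessor
        self._obj = pandas_obj

    def connection(self, streamid) -> "TcpConnection":
        # delegates to the project helper used throughout these examples
        # (TcpConnection is the project class shown above, not defined here)
        return TcpConnection.build_from_dataframe(self._obj, streamid)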
Example no. 7
def filter_dataframe(
        self,
        rawdf,
        # TODO choose prefix
        merged_one,
        tcpstream=None,
        mptcpstream=None,
        skipped_subflows=None,
        destinations: list = None,
        extra_query: str = None,
        **kwargs):
    """
    Can filter a single dataframe beforehand
    (hence call it several times for several dataframes).

    Feel free to inherit/override this class.

    Args:
        rawdf: Raw dataframe
        kwargs: expanded arguments returned by the parser
        destination: Filters packets depending on their :enum:`.ConnectionRoles`
        stream: keep only the packets related to mptcp.stream == mptcpstream
        skipped_subflows: list of skipped subflows
        extra_query: Add some more filters to the pandas query

    This baseclass can filter on:

    - mptcpstream
    - destination (mptcpstream required)
    - skipped_subflows

    Returns:
        Filtered dataframe
    """
    log.debug("Preprocessing dataframe with extra args %s" % kwargs)
    queries = []
    log.debug("tcp.stream %d mptcp: %d" % (tcpstream, mptcpstream))
    stream = tcpstream if tcpstream is not None else mptcpstream
    dataframe = rawdf

    for skipped_subflow in skipped_subflows or []:
        log.debug("Skipping subflow %d", skipped_subflow)
        queries.append(" tcpstream!=%d " % skipped_subflow)

    if stream is not None:
        protocol = "mptcp" if mptcpstream is not None else "tcp"
        log.debug("Filtering %s stream #%d." % (protocol, stream))
        queries.append(protocol + "stream==%d" % stream)

        if protocol == "tcp":
            # generates the "tcpdest" component of the dataframe
            con2 = TcpConnection.build_from_dataframe(dataframe, stream)
            dataframe = tcpdest_from_connections(dataframe, con2)
            # trust plots to do the filtering
            # if destinations is not []:
            #     queries.append(protocol + "dest==%d" % stream)
        else:
            # todo shall do the same for mptcp destinations
            con = MpTcpConnection.build_from_dataframe(dataframe, stream)
            # mptcpdest = main_connection.mptcp_dest_from_tcpdest(tcpdest)
            dataframe = mptcpdest_from_connections(dataframe, con)
            # TODO generate mptcpdest
            # if protocol == "mptcp":
            if destinations is not None:
                raise Exception(
                    "destination filtering is not ready yet for mptcp")

                log.debug("Filtering destination")

                # Generate a filter for the connection
                # con = MpTcpConnection.build_from_dataframe(dataframe, stream)
                # q = con.generate_direction_query(destination)
                # queries.append(q)
    if extra_query:
        log.debug("Appending extra_query=%s" % extra_query)
        queries.append(extra_query)

    query = " and ".join(queries)

    # throws when querying with an empty query
    if len(query) > 0:
        log.info("Running query:\n%s\n" % query)
        dataframe.query(query, inplace=True)

    return dataframe
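For reference, the queries list built above is joined with " and " into a single string handed to DataFrame.query. A small self-contained illustration of the kind of filter this produces (column names follow the examples; the data is invented):

import pandas as pd

df = pd.DataFrame({
    "tcpstream": [0, 0, 1, 2],
    "tcpdest": ["Client", "Server", "Client", "Server"],
})

# e.g. skipped_subflows=[2] plus tcpstream=0 would build:
query = " tcpstream!=2  and tcpstream==0"
print(df.query(query))  # keeps only the two rows of stream 0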
Example no. 8
 def connection(self, streamid):
     return TcpConnection.build_from_dataframe(self._obj, streamid)
Example no. 9
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,  # Union[MpTcpStreamId, TcpStreamId],
        streamid2: int,
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp

        mapping_mode: Only HASH works for now

    Returns
        a dataframe with columns... owd ?
    """
    log.debug(
        "Asked to load merged tcp streams %d and %d from pcaps %s and %s" %
        (streamid1, streamid2, pcap1, pcap2))

    cache = mp.get_cache()
    protocolStr = "mptcp" if mptcp else "tcp"

    cacheid = cache.cacheuid(
        "merged", [
            getrealpath(pcap1),
            getrealpath(pcap2),
        ], protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, streamid2)

                # TODO generate
                # map_mptcp_connection()

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = TcpConnection.build_from_dataframe(
                    df2, streamid2)

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            logging.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # at this point tcpdest has been lost: it became an 'object' dtype
            print("saving with dtypes=", dict(merged_df.dtypes))
            # print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))

            # if log level >= DEBUG then save to xls too !
            # if True:
            #     filename = cachename + ".xls"
            #     logging.debug("Saved a debug excel copy at %s" % filename)
            #     merged_df.to_excel(filename)

        else:
            logging.info("Loading from cache %s" % cachename)

            # dtypes = {k: v for k, v in temp.items() if v is not None or k not in ["tcpflags"]}

            def _gen_dtypes(fields) -> Dict[str, Any]:
                dtypes = {}  # type: ignore
                for _name in [_first, _second]:

                    # TODO this could be simplified
                    for k, v in fields.items():
                        if v is not None or k not in ["tcpflags"]:
                            dtypes.setdefault(_name(k), v)

                    # add generated field dtypes
                    dtypes.update({
                        _name(f.fullname): f.type
                        for f in per_pcap_artificial_fields.values()
                    })

                # these are overrides from the generated dtypes
                dtypes.update({
                    # during the merge, we join even unmapped packets so some entries
                    # may be empty => float64
                    _first("packetid"):
                    np.float64,
                    _second("packetid"):
                    np.float64,
                })

                return dtypes

            def _gen_converters() -> Dict[str, Callable]:

                # converters = {}   # type: Dict[str, Any]
                fields = dict(tshark_config.fields)
                fields.update(per_pcap_artificial_fields)
                converters = {}
                # no need to convert tcpflags
                default_converters = {
                    name: f.converter
                    for name, f in fields.items()
                    if f.converter and name != "tcpflags"
                }
                # converters.update({ name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
                for name, converter in default_converters.items():
                    converters.update({
                        _first(name): converter,
                        _second(name): converter
                    })

                return converters

            with open(cachename) as fd:
                dtypes = _gen_dtypes({
                    name: field.type
                    for name, field in tshark_config.fields.items()
                })
                converters = _gen_converters()
                # more recent versions can do without it
                # pd.set_option('display.max_rows', 200)
                # pd.set_option('display.max_colwidth', -1)
                # print("converters=", converters)
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=dtypes,  # popping still generates
                    converters=converters,
                )

                # log.debug("Column names after loading from cache: %s", merged_df.columns)

                # TODO:
                # No columns to parse from file

        # we fix the clocks a posteriori so that the cache is still usable

        logging.debug("Postprocessing clock if needed")
        merged_df[_first('abstime')] += clock_offset1
        merged_df[_second('abstime')] += clock_offset2

        logging.debug("Converting dataframes to be sender/receiver based...")
        # in both cases
        # TODO here we should attribute the definite mptcprole
        # compute owd
        if mptcp:
            print("Should be merging OWDs")
            logging.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
        else:
            # tcp
            # this is where the clocks get corrected;
            # the host1/host2 columns are renamed to their _sender / _receiver variants
            res = convert_to_sender_receiver(merged_df)

            # don't do it here else we might repeat it
            # data["abstime"] += clock_offset

        logging.debug("Computing owds")
        log.debug("Column names: %s", res.columns)
        log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
        print("res=")
        # TODO we don't necessarily need to generate the OWDs here, could be moved out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]
        # .head(40))
        with pd.option_context('float_format', '{:f}'.format):
            print(res[_sender(["ipsrc", "ipdst", "abstime"]) +
                      _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS +
                      ["owd"]])

    except Exception:
        logging.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    # log.debug("Dtypes after load:%s\n" % pp.pformat(merged_df.dtypes))
    log.info("Finished loading. merged dataframe size: %d" % len(merged_df))

    return res
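load_merged_streams_into_pandas leans on small helpers (_first, _second, _sender, _receiver) that turn one column name, or a list of names, into its per-capture or per-role variant. Their exact suffixes are internal to mptcpanalyzer; a hedged sketch of the general shape, assuming plain string suffixes:

from typing import List, Union

def _make_suffixer(suffix: str):
    """Hypothetical factory mirroring how _first/_second/_sender/_receiver
    are used above: append a suffix to one column name or to a list of them."""
    def _helper(col: Union[str, List[str]]):
        if isinstance(col, str):
            return col + suffix
        return [c + suffix for c in col]
    return _helper

# the actual suffixes used by the project may differ
_sender = _make_suffixer("_sender")
_receiver = _make_suffixer("_receiver")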
Example no. 10
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,
        streamid2: int,
        # TODO changed to protocol
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp
        mapping_mode: Only HASH works for now
        clock_offset: untested

    Returns
        a dataframe with columns... owd ?
    """
    protocolStr = "mptcp" if mptcp else "tcp"
    log.debug(f"Asked to load {protocolStr} merged streams {streamid1} and "
              "{streamid2} from pcaps {pcap1} and {pcap2}")

    cache = mp.get_cache()

    cacheid = cache.cacheuid(
        "merged", [getrealpath(pcap1), getrealpath(pcap2)],
        protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, MpTcpStreamId(streamid1))
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, MpTcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, TcpStreamId(streamid1))
                other_connection = TcpConnection.build_from_dataframe(
                    df2, TcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            log.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            # or abstime ?

            # TODO rechange the flags hex()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # tcpdest has become an 'object' dtype instead of a CategoricalDtype
            # see https://github.com/pandas-dev/pandas/issues/22361
            log.log(mp.TRACE, "saving with dtypes=%s", dict(merged_df.dtypes))

        else:
            log.info("Loading from cache %s", cachename)

            date_cols = get_date_cols(tshark_config.fields)

            with open(cachename) as fd:
                # generate fieldlist
                def _gen_fields(fields):
                    gfields = {}  # type: ignore
                    for _name in [_first, _second]:
                        gfields.update(
                            {_name(k): v
                             for k, v in fields.items()})
                    return gfields

                # reltime discarded on save ?
                tshark_config.fields.pop("reltime")
                gfields = _gen_fields(tshark_config.fields)
                merge_dtypes = get_dtypes(gfields)
                # log.log(mp.TRACE, "Using gfields %s" % pp.pformat(gfields))

                # we don't need any converters
                converters = {}
                date_cols = get_date_cols(gfields)

                log.log(mp.TRACE, "Using date_cols %s" % pp.pformat(date_cols))
                log.log(mp.TRACE, "Using dtypes %s" % pp.pformat(merge_dtypes))
                # log.log(mp.TRACE, "Using converters %s" % (pp.pformat(converters)))
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=merge_dtypes,  # popping still generates
                    converters=converters,
                    # date_parser=date_converter,
                    parse_dates=date_cols,
                )
                # at this stage, destinations are nan

                debug_fields = ["abstime", "tcpstream", "tcpdest", "mptcpdest"]
                mptcpanalyzer.debug.debug_dataframe(
                    merged_df,
                    "Merged dataframe",
                    usecols=(_first(debug_fields) + _second(debug_fields)))

                # workaround bug https://github.com/pandas-dev/pandas/issues/25448
                def _convert_to_enums():
                    # per_pcap_artificial_fields
                    for col in [
                            _first("tcpdest"),
                            _first("mptcpdest"),
                            _second("tcpdest"),
                            _second("mptcpdest")
                    ]:
                        merged_df[col] = merged_df[col].apply(
                            _convert_role, convert_dtype=False)

        # we fix the clocks a posteriori so that the cache is still usable
        log.debug("Postprocessing clock if needed")
        # merged_df[_first('abstime')] += clock_offset1
        # merged_df[_second('abstime')] += clock_offset2

        log.debug("Converting dataframes to be sender/receiver based...")

        # in both cases
        # TODO here we should attribute the definite mptcprole
        if mptcp:
            log.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
            # fill MPTCP dest ?
        else:
            # tcp
            res = convert_to_sender_receiver(merged_df)

        # log.debug("Sorting by sender abstime")
        # merged_df.sort_values(by=_sender("abstime"), ascending=True, inplace=True)
        # debug_dataframe(res, "checking merge", usecols=["merge_status"])
        # print("%d nan values" % len(res[res.merge_status == np.nan]))

        log.debug("Computing owds")

        debug_dataframe(res, "before owds")
        # TODO we don't necessarily need to generate the OWDs here, could be moved out
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]

        debug_dataframe(
            res,
            "owd",
            usecols=["owd", _sender('abstime'),
                     _receiver('abstime')])
        # with pd.option_context('float_format', '{:f}'.format):
        #     print(
        #         res[_sender(["ipsrc", "ipdst", "abstime"])
        #          + _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS + ["owd"] ]
        #     )

    except Exception:
        log.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    log.log(mp.TRACE, "Dtypes after load:%s\n", pp.pformat(res.dtypes))
    log.info("Finished loading. merged dataframe size: %d", len(res))

    return res
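Finally, the owd column computed in both versions is simply the receiver-side capture time minus the sender-side capture time of the same packet. A toy illustration of that arithmetic (the column names follow the suffix convention sketched earlier; the values are invented):

import pandas as pd

res = pd.DataFrame({
    "abstime_sender": [10.000, 10.050],
    "abstime_receiver": [10.020, 10.075],
})
# one-way delay per mapped packet, in the same unit as abstime (seconds here)
res["owd"] = res["abstime_receiver"] - res["abstime_sender"]
print(res["owd"].tolist())  # roughly [0.020, 0.025]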