Example #1
def load_into_pandas(
        input_file: str,
        config: TsharkConfig,
        # clock_offset: int = 0,
        **extra) -> pd.DataFrame:
    """
    load mptcp  data into pandas

    Args:
        input_file: pcap filename
        config: Hard, keep changing
        load_cb: callback to use if cache not available
        extra: extra arguments to forward to load_cb
    """
    log.debug("Asked to load simple pcap %s" % input_file)

    filename = getrealpath(input_file)
    cache = mp.get_cache()

    tshark_dtypes = {
        fullname: field.type
        for fullname, field in config.fields.items()
    }

    artificial_dtypes = {
        name: field.type
        for name, field in per_pcap_artificial_fields.items()
    }
    dtypes = dict(tshark_dtypes, **artificial_dtypes)

    # TODO add per_pcap_artificial_fields hash
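    # cache key: combines the config hash with the dtype mapping so the
    # cached CSV is invalidated whenever either changes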
    pseudohash = hash(config) + hash(frozenset(dtypes.items()))
    uid = cache.cacheuid(
        '',  # prefix (might want to shorten it a bit)
        [filename],  # dependencies
        str(pseudohash) + '.csv')

    is_cache_valid, csv_filename = cache.get(uid)

    logging.debug("cache validity=%d cachename: %s" %
                  (is_cache_valid, csv_filename))
    if not is_cache_valid:
        logging.info("Cache invalid .. Converting %s " % (filename, ))

        with tempfile.NamedTemporaryFile(mode='w+',
                                         prefix="mptcpanalyzer-",
                                         delete=False) as out:
            tshark_fields = [
                field.fullname for _, field in config.fields.items()
            ]
            retcode, stderr = config.export_to_csv(filename, out,
                                                   tshark_fields)
            log.info("exporter exited with code=%d", retcode)
            if retcode == 0:
                out.close()
                cache.put(uid, out.name)
            else:
                raise RuntimeError(stderr)

    log.debug("Loading a csv file %s" % csv_filename)

    try:
        with open(csv_filename) as fd:

            converters = {
                f.fullname: f.converter
                for _, f in config.fields.items() if f.converter
            }
            converters.update({
                name: f.converter
                for name, f in per_pcap_artificial_fields.items()
                if f.converter
            })
            # print("converters\n", converters)

            dtypes = {
                field.fullname: field.type
                for _, field in config.fields.items()
            }
            log.debug("Dtypes before load: %s" % dtypes)
            data = pd.read_csv(
                fd,
                comment='#',
                sep=config.delimiter,
                dtype=dtypes,
                # pandas can't change the default datetime representation
                # short of converting the column to a string; see
                # https://stackoverflow.com/questions/46930201/pandas-to-datetime-is-not-formatting-the-datetime-value-in-the-desired-format
                # date_parser=_convert_timestamp,
                # parse_dates=["frame.time_epoch"],
                converters=converters,
                # float_precision="high",  # might be necessary
                # nrows=10, # useful for debugging purpose
            )
            # rename is a 1-to-1 mapping; it cannot add new columns
            data.rename(inplace=True,
                        columns={
                            f.fullname: name
                            for name, f in config.fields.items()
                        })

            # add new columns
            data = data.assign(
                **{name: np.nan
                   for name in per_pcap_artificial_fields.keys()})
            data = data.astype(dtype=artificial_dtypes, copy=False)

            # we want the packetid column to survive merges/dataframe
            # transformations, so keep it as a column too
            # TODO remove? let other functions do it?
            data.set_index("packetid", drop=False, inplace=True)
            log.debug("Column names: %s", data.columns)

            hashing_fields = [
                name for name, field in config.fields.items() if field.hash
            ]
            log.debug("Hashing over fields %s" % hashing_fields)

            # hash() rejects a (mutable) Series, so each row is converted
            # to a tuple before hashing
            # TODO generate hashing fields from Fields
            temp = pd.DataFrame(data, columns=hashing_fields)
            data["hash"] = temp.apply(lambda x: hash(tuple(x)), axis=1)

    except Exception:
        log.error(
            "You may need to filter your pcap further so it only contains "
            "MPTCP packets")
        raise

    log.info("Finished loading dataframe for %s. Size=%d" %
             (input_file, len(data)))

    # print("FINAL_DTYPES")
    log.debug(data.dtypes)
    # print(data.head(5))
    return data
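
A minimal usage sketch for the function above; the default-constructed `TsharkConfig()` and an already-initialised cache (`mp.get_cache()`) are assumptions for illustration, not part of the example:

# hypothetical driver; assumes the imports and definitions from the
# example above are in scope, and that TsharkConfig is default-constructible
config = TsharkConfig()
df = load_into_pandas("capture.pcap", config)
print(df.dtypes)
print(df.head())
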
Example #2
def load_into_pandas(input_file: str, config: TsharkConfig,
                     **extra) -> pd.DataFrame:
    """
    load mptcp data into pandas

    Args:
        input_file: pcap filename
        config: Hard, keep changing
        load_cb: callback to use if cache not available
        extra: extra arguments to forward to load_cb
    """
    log.debug("Asked to load simple pcap %s", input_file)

    filename = getrealpath(input_file)
    cache = mp.get_cache()

    # {fullname: field.type for fullname, field in config.fields.items()}
    tshark_dtypes = get_dtypes(config.fields)

    artificial_dtypes = get_dtypes(per_pcap_artificial_fields)
    dtypes = dict(tshark_dtypes, **artificial_dtypes)

    # TODO add per_pcap_artificial_fields hash
    pseudohash = hash(config) + hash(frozenset(dtypes.items()))
    uid = cache.cacheuid(
        '',  # prefix (might want to shorten it a bit)
        [filename],  # dependencies
        str(pseudohash) + '.csv')

    is_cache_valid, csv_filename = cache.get(uid)

    log.debug("cache validity=%d cachename: %s", is_cache_valid, csv_filename)
    if not is_cache_valid:
        log.info("Cache invalid... converting %s", filename)

        with tempfile.NamedTemporaryFile(mode='w+',
                                         prefix="mptcpanalyzer-",
                                         delete=False) as out:
            # tshark_fields = [field.fullname for _, field in config.fields.items()]
            tshark_fields = {
                field.fullname: name
                for name, field in config.fields.items()
            }
            retcode, _, stderr = config.export_to_csv(filename, out,
                                                      tshark_fields)
            log.info("exporter exited with code=%d", retcode)
            if retcode == 0:
                out.close()
                cache.put(uid, out.name)
            else:
                raise RuntimeError(stderr)

    log.debug("Loading a csv file %s", csv_filename)

    try:
        with open(csv_filename) as fd:

            # build the full mapping of fields to convert; copy so we don't
            # modify the passed parameter
            fields = config.fields.copy()
            fields.update(per_pcap_artificial_fields)
            # use the merged mapping so artificial fields get converters too
            converters = get_converters(fields)

            # builds a list of fields to be parsed as dates
            # (since converter/types don't seem to be great)
            date_cols = get_date_cols(config.fields)

            dtypes = get_dtypes(config.fields)

            log.log(mp.TRACE, "Dtypes before load:\n%s", pp.pformat(dtypes))
            log.log(mp.TRACE, "Converters before load:\n%s",
                    pp.pformat(converters))
            log.log(mp.TRACE, "Fields to load as times:\n%s",
                    pp.pformat(date_cols))

            # keep this commented code to help diagnose pandas problems
            # from mptcpanalyzer.debug import read_csv_debug
            # fields = [f.fullname for _, f in config.fields.items()]
            # fields = ["tcp.options.mptcp.sendkey"]
            # data = mptcpanalyzer.debug.read_csv_debug(fields,
            data = pd.read_csv(
                fd,
                comment='#',
                sep=config.delimiter,
                dtype=dtypes,
                date_parser=date_converter,
                parse_dates=date_cols,
                # ideally we would NOT use converters, but pandas bugs leave no choice...
                converters=converters,
                # float_precision="high",  # might be necessary
            )

            log.debug("Finished loading CSV file")
            # rename is a 1-to-1 mapping; it cannot add new columns
            data.rename(inplace=True,
                        columns={
                            f.fullname: name
                            for name, f in config.fields.items()
                        })

            # add new columns
            data = data.assign(
                **{name: np.nan
                   for name in per_pcap_artificial_fields.keys()})
            data = data.astype(dtype=artificial_dtypes, copy=False)

            # we want the packetid column to survive merges/dataframe
            # transformations, so keep it as a column too
            # TODO remove? let other functions do it?
            data.set_index("packetid", drop=False, inplace=True)

            hashing_fields = [
                name for name, field in config.fields.items() if field.hash
            ]
            log.debug("Hashing over fields %s", hashing_fields)

            # hash() rejects a (mutable) Series, so each row is converted
            # to a tuple before hashing
            # TODO generate hashing fields from Fields
            # TODO reference the relevant Stack Overflow question
            temp = pd.DataFrame(data, columns=hashing_fields)
            data["hash"] = temp.apply(lambda x: hash(tuple(x)), axis=1)

    except Exception:
        log.error(
            "You may need to filter your pcap further so it only contains "
            "MPTCP packets")
        raise

    log.info("Finished loading dataframe for %s. Size=%d", input_file,
             len(data))

    return data
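
Example #2 factors Example #1's inline comprehensions into helpers (`get_dtypes`, `get_converters`, `get_date_cols`) that are not shown; `date_converter` is likewise an external helper. A plausible sketch of the three, reconstructed from Example #1 and the commented hint near the top of Example #2; the `Field` attributes `fullname`, `type` and `converter` match the usage above, while the `is_time` flag driving `get_date_cols` is an assumption:

from typing import Any, Callable, Dict, List


def get_dtypes(fields: Dict[str, Any]) -> Dict[str, Any]:
    # map each field's CSV column name (its tshark fullname) to a pandas dtype
    return {field.fullname: field.type for _, field in fields.items()}


def get_converters(fields: Dict[str, Any]) -> Dict[str, Callable]:
    # only fields that declare a converter get an entry
    return {
        field.fullname: field.converter
        for _, field in fields.items() if field.converter
    }


def get_date_cols(fields: Dict[str, Any]) -> List[str]:
    # fields flagged as timestamps are handed to read_csv's parse_dates;
    # the is_time attribute is an assumption, not part of the examples
    return [
        field.fullname for _, field in fields.items()
        if getattr(field, "is_time", False)
    ]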