Example #1
def main():
    if len(sys.argv) < 2:
        logging.error('incomplete command')
    elif sys.argv[1] == 'folder':
        timeFunction("totalRuntime", lambda: execute())
    else:
        logging.error(f'unknown command: {sys.argv[1]}')
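Every example routes work through a timeFunction helper that is not part of these excerpts. A minimal sketch of what it presumably does, assuming it only measures wall-clock time, records the duration in a module-level results dict (later written to CSV in execute()), and passes the callable's return value through:

import logging
import time

results = {}  # assumed module-level store; execute() dumps it to CSV later


def timeFunction(name, function):
    # Hypothetical sketch: run the zero-argument callable, record its duration.
    start = time.perf_counter()
    result = function()
    elapsed = time.perf_counter() - start
    results[name] = {'seconds': elapsed}
    logging.info(f'{name} took {elapsed:.2f}s')
    return result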
Example #2
def execute():
    for i in range(1):  # single run with seed 0; widen range for more seeds
        config.seed = i
        meta, mapping = timeFunction(readFolderWithPCAPs.__name__,
                                     lambda: readFolderWithPCAPs())
        timeFunction(
            calculateDistancesAndGenerateOutput.__name__,
            lambda: calculateDistancesAndGenerateOutput(meta, mapping))

        resultsDf = pd.DataFrame.from_dict(results).T.rename_axis('Name')
        resultsDf.to_csv(f'{config.outputDirStats}stats-{config.thresh}.csv')
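execute() serializes a module-level results dict populated by timeFunction (see the sketch after Example #1, itself an assumption) through pandas. A small illustration with made-up numbers of what the transposed frame, and hence the stats CSV, looks like:

import pandas as pd

# hypothetical contents matching the assumed {name: {'seconds': ...}} shape
results = {
    'readFolderWithPCAPs': {'seconds': 12.3},
    'calculateDistancesAndGenerateOutput': {'seconds': 48.7},
}
resultsDf = pd.DataFrame.from_dict(results).T.rename_axis('Name')
print(resultsDf)
#                                      seconds
# Name
# readFolderWithPCAPs                     12.3
# calculateDistancesAndGenerateOutput     48.7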
Example #3
def processMeasurements(normalizeDistanceMeasurement, mapping, inv_mapping,
                        name):
    if os.path.exists(f"{config.outputDirStats}{name}{config.addition}.txt"):
        os.remove(f"{config.outputDirStats}{name}{config.addition}.txt")

    clu, projection = timeFunction(
        f'[{name}] {generateClusters.__name__}',
        lambda: generateClusters(normalizeDistanceMeasurement, name))

    if config.generateTSNEGraphs:
        timeFunction(generateClusterGraph.__name__,
                     lambda: generateClusterGraph(clu, projection, name))

    finalClusters, dagClusters, heatmapCluster = saveClustersToCsv(
        clu, mapping, inv_mapping, name)

    finalClusterSummary(finalClusters, inv_mapping, name)

    return finalClusters, heatmapCluster
Example #4
def calculateDistancesAndGenerateOutput(metadata: dict[ConnectionKey,
                                                       list[PackageInfo]],
                                        mapping):
    inv_mapping: dict[int, ConnectionKey] = {v: k for k, v in mapping.items()}

    values = list(metadata.values())

    generateOutputFolders()

    if config.generateRaw:
        storeRawData(values)

    statisticalProperties, normalizeDistanceMeasurementStatistical = timeFunction(
        getStatisticalNormalizedDistanceMeasurement.__name__,
        lambda: getStatisticalNormalizedDistanceMeasurement(values, config))

    normalizeDistanceMeasurementSequential = timeFunction(
        getSequentialNormalizedDistanceMeasurement.__name__,
        lambda: getSequentialNormalizedDistanceMeasurement(values, config))

    finalClustersStatistical, heatmapClusterStatistical = processMeasurements(
        normalizeDistanceMeasurementStatistical, mapping, inv_mapping,
        'Statistical')
    finalClustersSequential, heatmapClusterSequential = processMeasurements(
        normalizeDistanceMeasurementSequential, mapping, inv_mapping,
        'Sequential')

    compareFinalClusters(finalClustersSequential, finalClustersStatistical)

    if config.generateAllGraphs:
        # clusterAmount = len(finalClusters)
        # generateDag(dagClusters, clusterAmount)
        timeFunction(
            generateGraphs.__name__,
            lambda: generateGraphs('Statistical', heatmapClusterStatistical,
                                   values, statisticalProperties))
        timeFunction(
            generateGraphs.__name__,
            lambda: generateGraphs('Sequential', heatmapClusterSequential,
                                   values, statisticalProperties))
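The annotations above reference ConnectionKey and PackageInfo, which are defined elsewhere in the project. Judging from how they are constructed in Examples #5 and #6, they are small record types; a sketch with assumed field names (only connectionLabel appears verbatim in these excerpts):

from typing import NamedTuple


class ConnectionKey(NamedTuple):
    # built as ConnectionKey(cacheKey, i[0], i[1], window, label) in Example #5;
    # field names other than connectionLabel are assumptions
    file: str
    sourceIp: str
    destinationIp: str
    window: int
    connectionLabel: str


class PackageInfo(NamedTuple):
    # built as PackageInfo(gap, level3.len, source_port, destination_port, label)
    # in Example #6; field names other than connectionLabel are assumptions
    gap: int
    length: int
    sourcePort: int
    destinationPort: int
    connectionLabel: str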
Example #5
def readFolderWithPCAPs(useFileCache=False, forceFileCacheUse=True):
    meta = {}
    mapping = {}
    totalLabels = defaultdict(int)
    mappingIndex = 0
    if forceFileCacheUse:
        files = glob.glob(sys.argv[2] + "/*.pcap.pkl")
    else:
        files = glob.glob(sys.argv[2] + "/**/*.pcap")
    logging.info(f'About to read {len(files)} pcap files')

    if os.path.exists(config.pklCache) and os.path.exists(
            config.mappingCache) and os.path.exists(config.totalLabelsCache):
        with open(config.pklCache, 'rb') as file:
            meta = pickle.load(file)
        with open(config.mappingCache, 'rb') as file:
            mapping = pickle.load(file)
        with open(config.totalLabelsCache, 'rb') as file:
            totalLabels = pickle.load(file)
    else:
        for f in files:
            cacheKey = os.path.basename(f)
            cacheName = f'data/{cacheKey}.pkl'
            if os.path.exists(cacheName) and useFileCache:
                logging.debug(f'Using cache: {cacheKey}')
                with open(cacheName, 'rb') as file:
                    connections = pickle.load(file)
            elif os.path.exists(f) and forceFileCacheUse:
                logging.debug(f'Using cache: {cacheKey}')
                with open(f, 'rb') as file:
                    connections = pickle.load(file)
            elif not forceFileCacheUse:
                logging.info(f'Reading file: {cacheKey}')
                connections = timeFunction(readPCAP.__name__,
                                           lambda: readPCAP(f, config))

                if not connections:
                    continue

                with open(cacheName, 'wb') as file:
                    pickle.dump(connections, file)
            else:
                logging.info(
                    f'Skipping {f} because it has no cache file: {cacheName}')
                continue

            connectionItems: list[tuple[ConnectionKey, list[PackageInfo]]] = \
                list(connections.items())
            random.shuffle(connectionItems)
            selectedLabelsPerFile = defaultdict(int)

            for i, v in connectionItems:
                wantedWindow = getWantedWindow(v)

                for window in wantedWindow:
                    selection: list[PackageInfo] = v[config.thresh *
                                                     window:config.thresh *
                                                     (window + 1)]
                    labels = set()
                    for package in selection:
                        labels.add(package.connectionLabel)

                    if len(labels) != 1:
                        continue

                    label = labels.pop()

                    # if selectedLabelsPerFile[label] >= 200:
                    #     continue

                    key = ConnectionKey(cacheKey, i[0], i[1], window,
                                        selection[0].connectionLabel)

                    selectedLabelsPerFile[label] += 1
                    mapping[key] = mappingIndex
                    mappingIndex += 1
                    meta[key] = selection

            # connectionSummary(connections, selectedLabelsPerFile)
            for k, v in selectedLabelsPerFile.items():
                totalLabels[k] += v

        with open(config.pklCache, 'wb') as file:
            pickle.dump(meta, file)
        with open(config.mappingCache, 'wb') as file:
            pickle.dump(mapping, file)
        with open(config.totalLabelsCache, 'wb') as file:
            pickle.dump(totalLabels, file)

    logging.info(f'Surviving connections across all files: {len(meta)}')
    connectionSummary(meta, totalLabels)

    if len(meta) < 50:
        logging.error('Too few connections to create clustering')
        raise Exception('Too few connections to create clustering')

    return meta, mapping
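All examples read a shared config object whose definition is not included. A hedged sketch that collects only the attributes actually used in these excerpts; the default values are illustrative, not taken from the source:

from dataclasses import dataclass


@dataclass
class Config:
    seed: int = 0
    thresh: int = 20  # packages per window and minimum connection length
    addition: str = ''
    outputDirStats: str = 'output/stats/'
    pklCache: str = 'data/meta.pkl'
    mappingCache: str = 'data/mapping.pkl'
    totalLabelsCache: str = 'data/totalLabels.pkl'
    generateRaw: bool = False
    generateTSNEGraphs: bool = False
    generateAllGraphs: bool = False


config = Config()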
Example #6
def readPCAP(filename,
             config,
             cutOff=5000) -> dict[tuple[str, str], list[PackageInfo]]:
    preProcessed = defaultdict(list)
    reachedSizeLimit = set()

    with open(filename, 'rb') as f:
        pcap = dpkt.pcap.Reader(f)
        for ts, pkt in tqdm(pcap,
                            unit='packages',
                            unit_scale=True,
                            postfix=filename,
                            mininterval=0.5):
            try:
                eth = dpkt.ethernet.Ethernet(pkt)
            except Exception:
                continue

            level3 = eth.data

            if type(level3) is not dpkt.ip.IP:
                continue

            key = hash((level3.src, level3.dst))

            if key in reachedSizeLimit:
                continue

            preProcessed[key].append((ts, pkt))

            if len(preProcessed[key]) > cutOff:
                reachedSizeLimit.add(key)

    logging.info(f'Before cleanup: {len(preProcessed)} connections.')

    flattened = []
    for values in preProcessed.values():
        if len(values) < config.thresh:
            continue
        flattened.extend(values)
    del preProcessed

    logging.info(f'After cleanup: {len(flattened)} packages.')

    connections = defaultdict(list)
    previousTimestamp = {}
    count = 0

    labels, lineCount = timeFunction(readLabeled.__name__,
                                     lambda: readLabeled(filename))

    for ts, pkt in tqdm(flattened,
                        unit='packages',
                        unit_scale=True,
                        postfix=filename,
                        mininterval=0.5):
        eth = dpkt.ethernet.Ethernet(pkt)

        count += 1
        level3 = eth.data

        level4 = level3.data

        src_ip = inet_to_str(level3.src)
        dst_ip = inet_to_str(level3.dst)

        key = (src_ip, dst_ip)
        timestamp = datetime.datetime.utcfromtimestamp(ts)

        if key in previousTimestamp:
            # total gap in microseconds, not just the sub-second component
            delta = timestamp - previousTimestamp[key]
            gap = round(delta.total_seconds() * 1_000_000)
        else:
            gap = 0

        previousTimestamp[key] = timestamp

        if type(level4) in (dpkt.tcp.TCP, dpkt.udp.UDP):
            source_port = level4.sport
            destination_port = level4.dport
        else:
            continue

        label = labels.get(
            hash((src_ip, dst_ip, source_port,
                  destination_port))) or labels.get(
                      hash((dst_ip, src_ip, destination_port,
                            source_port))) or '-'

        flow_data = PackageInfo(gap, level3.len, source_port, destination_port,
                                label)

        connections[key].append(flow_data)

    return {
        key: value
        for (key, value) in connections.items() if len(value) >= config.thresh
    }
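readPCAP depends on an inet_to_str helper that is not shown. The usual dpkt idiom converts the packed binary address into printable form; a sketch assuming that convention:

import socket


def inet_to_str(inet):
    # Presumed helper: packed IPv4/IPv6 address from dpkt -> printable string.
    try:
        return socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        return socket.inet_ntop(socket.AF_INET6, inet)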