import datetime
import glob
import logging
import os
import pickle
import random
import sys
from collections import defaultdict

import dpkt
import pandas as pd
from tqdm import tqdm

# Project-local helpers (readLabeled, generateClusters, generateClusterGraph,
# saveClustersToCsv, finalClusterSummary, compareFinalClusters, generateGraphs,
# generateOutputFolders, storeRawData, connectionSummary,
# getStatisticalNormalizedDistanceMeasurement,
# getSequentialNormalizedDistanceMeasurement) are assumed to be defined in
# sibling modules of this project.


def main():
    if len(sys.argv) < 2:
        logging.error('incomplete command')
    elif sys.argv[1] == 'folder':
        timeFunction("totalRuntime", lambda: execute())
    else:
        logging.error(f'unknown command: {sys.argv[1]}')
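# `timeFunction` is not defined in this file. The sketch below is an assumed
# reconstruction from its call sites only: it runs a zero-argument callable,
# logs the wall-clock duration under `name`, records it in the module-level
# `results` dict (which execute() later dumps to CSV), and returns the
# callable's result. The stored record shape is a guess.
import time


def timeFunction(name, fun):
    start = time.perf_counter()
    result = fun()
    elapsed = time.perf_counter() - start
    logging.info(f'{name} took {elapsed:.2f}s')
    results[name] = {'runtime': round(elapsed, 2)}  # assumed record shape
    return result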
def execute():
    # Single-seed run; widen the range to repeat the experiment with
    # different seeds.
    for i in range(1):
        config.seed = i
        meta, mapping = timeFunction(readFolderWithPCAPs.__name__,
                                     lambda: readFolderWithPCAPs())
        timeFunction(
            calculateDistancesAndGenerateOutput.__name__,
            lambda: calculateDistancesAndGenerateOutput(meta, mapping))

        resultsDf = pd.DataFrame.from_dict(results).T.rename_axis('Name')
        resultsDf.to_csv(f'{config.outputDirStats}stats-{config.thresh}.csv')
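# The module-level `config` and `results` objects are also defined elsewhere
# in the project. A hypothetical stand-in covering only the attributes this
# file touches (attribute names are taken from the call sites; the default
# values are made up):
from types import SimpleNamespace

results = {}
config = SimpleNamespace(
    seed=0,
    thresh=20,                 # packages per window / minimum connection length
    addition='',               # suffix used in per-run output file names
    outputDirStats='output/stats/',
    pklCache='data/meta.pkl',
    mappingCache='data/mapping.pkl',
    totalLabelsCache='data/totalLabels.pkl',
    generateTSNEGraphs=False,
    generateRaw=False,
    generateAllGraphs=False,
)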
def processMeasurements(normalizeDistanceMeasurement, mapping, inv_mapping, name):
    if os.path.exists(f"{config.outputDirStats}{name}{config.addition}.txt"):
        os.remove(f"{config.outputDirStats}{name}{config.addition}.txt")

    clu, projection = timeFunction(
        f'[{name}] {generateClusters.__name__}',
        lambda: generateClusters(normalizeDistanceMeasurement, name))

    if config.generateTSNEGraphs:
        timeFunction(generateClusterGraph.__name__,
                     lambda: generateClusterGraph(clu, projection, name))

    finalClusters, dagClusters, heatmapCluster = saveClustersToCsv(
        clu, mapping, inv_mapping, name)
    finalClusterSummary(finalClusters, inv_mapping, name)

    return finalClusters, heatmapCluster
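# `ConnectionKey` and `PackageInfo` are project types used in the annotations
# below. Minimal sketches with field names inferred from how the types are
# constructed and accessed in this file (the real definitions may differ):
from typing import NamedTuple


class ConnectionKey(NamedTuple):
    fileName: str
    sourceIp: str
    destinationIp: str
    window: int
    connectionLabel: str


class PackageInfo(NamedTuple):
    gap: int                  # microseconds since the connection's previous package
    bytes: int                # IP packet length (level3.len)
    sourcePort: int
    destinationPort: int
    connectionLabel: str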
def calculateDistancesAndGenerateOutput(metadata: dict[ConnectionKey, list[PackageInfo]], mapping):
    inv_mapping: dict[int, ConnectionKey] = {v: k for k, v in mapping.items()}
    values = list(metadata.values())

    generateOutputFolders()

    if config.generateRaw:
        storeRawData(values)

    statisticalProperties, normalizeDistanceMeasurementStatistical = timeFunction(
        getStatisticalNormalizedDistanceMeasurement.__name__,
        lambda: getStatisticalNormalizedDistanceMeasurement(values, config))
    normalizeDistanceMeasurementSequential = timeFunction(
        getSequentialNormalizedDistanceMeasurement.__name__,
        lambda: getSequentialNormalizedDistanceMeasurement(values, config))

    finalClustersStatistical, heatmapClusterStatistical = processMeasurements(
        normalizeDistanceMeasurementStatistical, mapping, inv_mapping, 'Statistical')
    finalClustersSequential, heatmapClusterSequential = processMeasurements(
        normalizeDistanceMeasurementSequential, mapping, inv_mapping, 'Sequential')

    compareFinalClusters(finalClustersSequential, finalClustersStatistical)

    if config.generateAllGraphs:
        # clusterAmount = len(finalClusters)
        # generateDag(dagClusters, clusterAmount)
        timeFunction(
            generateGraphs.__name__,
            lambda: generateGraphs('Statistical', heatmapClusterStatistical,
                                   values, statisticalProperties))
        timeFunction(
            generateGraphs.__name__,
            lambda: generateGraphs('Sequential', heatmapClusterSequential,
                                   values, statisticalProperties))
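# `getWantedWindow` is not shown in this file. A minimal sketch, assuming it
# simply enumerates every complete window of `config.thresh` packages in a
# connection; the real selection strategy may be more involved (e.g. random
# sampling of windows):
def getWantedWindow(packages):
    return list(range(len(packages) // config.thresh))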
def readFolderWithPCAPs(useFileCache=False, forceFileCacheUse=True):
    meta = {}
    mapping = {}
    totalLabels = defaultdict(int)
    mappingIndex = 0

    if forceFileCacheUse:
        files = glob.glob(sys.argv[2] + "/*.pcap.pkl")
    else:
        files = glob.glob(sys.argv[2] + "/**/*.pcap")
    logging.info(f'About to read pcaps from {len(files)} files')

    if os.path.exists(config.pklCache) and os.path.exists(
            config.mappingCache) and os.path.exists(config.totalLabelsCache):
        with open(config.pklCache, 'rb') as file:
            meta = pickle.load(file)
        with open(config.mappingCache, 'rb') as file:
            mapping = pickle.load(file)
        with open(config.totalLabelsCache, 'rb') as file:
            totalLabels = pickle.load(file)
    else:
        for f in files:
            cacheKey = os.path.basename(f)
            cacheName = f'data/{cacheKey}.pkl'
            if os.path.exists(cacheName) and useFileCache:
                logging.debug(f'Using cache: {cacheKey}')
                with open(cacheName, 'rb') as file:
                    connections = pickle.load(file)
            elif os.path.exists(f) and forceFileCacheUse:
                logging.debug(f'Using cache: {cacheKey}')
                with open(f, 'rb') as file:
                    connections = pickle.load(file)
            elif not forceFileCacheUse:
                logging.info(f'Reading file: {cacheKey}')
                connections = timeFunction(readPCAP.__name__,
                                           lambda: readPCAP(f, config))
                if len(connections) < 1:
                    continue
                with open(cacheName, 'wb') as file:
                    pickle.dump(connections, file)
            else:
                logging.info(
                    f'Skipping {f} because it has no cache file: {cacheName}')
                continue

            connectionItems: list[tuple[tuple[str, str], list[PackageInfo]]] = list(
                connections.items())
            random.shuffle(connectionItems)

            selectedLabelsPerFile = defaultdict(int)
            for i, v in connectionItems:
                wantedWindow = getWantedWindow(v)

                for window in wantedWindow:
                    selection: list[PackageInfo] = v[config.thresh * window:
                                                     config.thresh * (window + 1)]

                    # Keep only windows whose packages all carry the same label.
                    labels = {package.connectionLabel for package in selection}
                    if len(labels) != 1:
                        continue
                    label = labels.pop()

                    # if selectedLabelsPerFile[label] >= 200:
                    #     continue

                    key = ConnectionKey(cacheKey, i[0], i[1], window,
                                        selection[0].connectionLabel)
                    selectedLabelsPerFile[label] += 1
                    mapping[key] = mappingIndex
                    mappingIndex += 1
                    meta[key] = selection

            # connectionSummary(connections, selectedLabelsPerFile)

            for k, v in selectedLabelsPerFile.items():
                totalLabels[k] += v

        with open(config.pklCache, 'wb') as file:
            pickle.dump(meta, file)
        with open(config.mappingCache, 'wb') as file:
            pickle.dump(mapping, file)
        with open(config.totalLabelsCache, 'wb') as file:
            pickle.dump(totalLabels, file)

    logging.info(f'Collective surviving connections: {len(meta)}')
    connectionSummary(meta, totalLabels)

    if len(meta) < 50:
        logging.error('Too few connections to create a clustering')
        raise Exception('Too few connections to create a clustering')

    return meta, mapping
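# `inet_to_str` is assumed to follow the helper from dpkt's printing examples:
# convert a packed binary address into a printable IPv4/IPv6 string.
import socket


def inet_to_str(inet):
    try:
        return socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        return socket.inet_ntop(socket.AF_INET6, inet)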
def readPCAP(filename, config, cutOff=5000) -> dict[tuple[str, str], list[PackageInfo]]:
    preProcessed = defaultdict(list)
    reachedSizeLimit = set()

    with open(filename, 'rb') as f:
        pcap = dpkt.pcap.Reader(f)
        for ts, pkt in tqdm(pcap, unit='packages', unit_scale=True,
                            postfix=filename, mininterval=0.5):
            try:
                eth = dpkt.ethernet.Ethernet(pkt)
            except Exception:
                continue

            level3 = eth.data
            if type(level3) is not dpkt.ip.IP:
                continue

            key = hash((level3.src, level3.dst))
            if key in reachedSizeLimit:
                continue

            preProcessed[key].append((ts, pkt))
            if len(preProcessed[key]) > cutOff:
                reachedSizeLimit.add(key)

    logging.info(f'Before cleanup: {len(preProcessed)} connections.')

    # Drop connections shorter than the window threshold, then flatten the
    # survivors into a single package list.
    flattened = []
    for values in preProcessed.values():
        if len(values) < config.thresh:
            continue
        flattened.extend(values)
    del preProcessed

    logging.info(f'After cleanup: {len(flattened)} packages.')

    connections = defaultdict(list)
    previousTimestamp = {}
    count = 0

    labels, lineCount = timeFunction(readLabeled.__name__,
                                     lambda: readLabeled(filename))

    for ts, pkt in tqdm(flattened, unit='packages', unit_scale=True,
                        postfix=filename, mininterval=0.5):
        eth = dpkt.ethernet.Ethernet(pkt)
        count += 1

        level3 = eth.data
        level4 = level3.data

        src_ip = inet_to_str(level3.src)
        dst_ip = inet_to_str(level3.dst)
        key = (src_ip, dst_ip)

        timestamp = datetime.datetime.utcfromtimestamp(ts)
        if key in previousTimestamp:
            # Note: timedelta.microseconds only carries the sub-second
            # component, so gaps of a second or more wrap around.
            gap = (timestamp - previousTimestamp[key]).microseconds
        else:
            gap = 0
        previousTimestamp[key] = timestamp

        if isinstance(level4, (dpkt.tcp.TCP, dpkt.udp.UDP)):
            source_port = level4.sport
            destination_port = level4.dport
        else:
            continue

        # Look the flow up in both directions; fall back to the unlabeled marker.
        label = labels.get(
            hash((src_ip, dst_ip, source_port, destination_port))) or labels.get(
                hash((dst_ip, src_ip, destination_port, source_port))) or '-'

        flow_data = PackageInfo(gap, level3.len, source_port,
                                destination_port, label)
        connections[key].append(flow_data)

    return {
        key: value
        for (key, value) in connections.items() if len(value) >= config.thresh
    }
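# Assumed CLI entry point; per main(), the script is invoked as e.g.:
#   python <thisScript>.py folder <folderWithPcapFiles>
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)  # assumed logging setup
    main()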