Example #1
    def histogramFieldEnds(self, symbols: List[Symbol]):
        """
        Histogram of the absolute byte positions in all messages of all input symbols.

        :param symbols:
        :return:
        """
        import matplotlib.pyplot as plt
        from validation.dissectorMatcher import MessageComparator
        from collections import Counter
        from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage

        maxLen = 0
        cumulatedFieldEnds = Counter()
        for symbol in symbols:
            for message in symbol.messages:  # type: AbstractMessage
                maxLen = max(maxLen, len(message.data))
                cumulatedFieldEnds.update(
                    MessageComparator.fieldEndsPerSymbol(
                        symbol, message)[:-1])  # omit message end

        # a Counter returns 0 for missing keys, so no explicit membership test is needed
        countAt = [cumulatedFieldEnds[bytepos] for bytepos in range(maxLen)]

        plt.bar(list(range(maxLen)), countAt, width=1.0, color="green")
        plt.autoscale(tight=True)
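
A minimal usage sketch (hypothetical driver code, not from the source): reporter stands in for an instance of the class that defines histogramFieldEnds, and symbols for the result of a previous inference run; plt is matplotlib.pyplot, as the method body implies.

import matplotlib.pyplot as plt

reporter.histogramFieldEnds(symbols)   # reporter/symbols: placeholders, see above
plt.xlabel("byte position")            # absolute offset within the messages
plt.ylabel("field-end count")          # inferred field boundaries at this offset
plt.show()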
Example #2
def iterSimilarities(minSimilarity=40, maxSimilarity=60):
    # type: (int, int) -> Dict[int, Tuple[Dict[netzob.Symbol, List[List[Tuple[str, int]]]], float]]
    """
    Iterate input parameter similarity threshold for clustering (minEquivalence = 0...100).

    :returns a dict in the of structure of:
        dict (
            similaritythreshold : dict (
                symbol : list ( tformats )
                )
            )
    """
    symFmt = dict()
    for similaritythreshold in range(minSimilarity, maxSimilarity + 1):
        print("Similarity {:d}: ".format(similaritythreshold))
        symbols, runtime = getNetzobInference(
            list(specimens.messagePool.keys()),
            similaritythreshold)  # l5msgs (should be in original order)
        symFmt[similaritythreshold] = (dict(), runtime)
        for symbol in symbols:
            # l2msgs = [specimens.messagePool[msg] for msg in symbol.messages]
            tformats = MessageComparator.uniqueFormats(
                list(comparator.dissections.values()))
            symFmt[similaritythreshold][0][symbol] = tformats
    return symFmt
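
A short traversal sketch, assuming only the return structure documented above (each threshold mapped to a tuple of a symbol-to-formats dict and the runtime):

symFmtTime = iterSimilarities(50, 52)
for thresh, (symbolFormats, runtime) in sorted(symFmtTime.items()):
    print("threshold {}: {} symbols, inferred in {:.3f}s".format(
        thresh, len(symbolFormats), runtime))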
Example #3
                        action='store_true')
    args = parser.parse_args()
    if not isfile(args.pcapfilename):
        print('File not found: ' + args.pcapfilename)
        exit(1)

    import time
    swstart = time.time()
    print('\nLoading ...')

    specimens = SpecimenLoader(args.pcapfilename,
                               layer=args.layer,
                               relativeToIP=args.relativeToIP)
    comparator = MessageComparator(specimens,
                                   layer=args.layer,
                                   relativeToIP=args.relativeToIP,
                                   failOnUndissectable=False,
                                   debug=debug)
    print('Loaded and dissected in {:.3f}s'.format(time.time() - swstart))

    print('\nNetzob Inference ...')
    # dict ( similaritythreshold : dict ( symbol : (quality, fieldcount, exactf, nearf, uospecific) ) )
    # fall back to the iterSimilarities defaults if no bounds were given on the command line
    minThresh, maxThresh = 40, 60
    if args.smin:
        minThresh = args.smin
        maxThresh = args.smax if args.smax else args.smin
    threshSymbTfmtTime = iterSimilarities(minThresh, maxThresh)
    threshSymbTfmt = {t: s for t, (s, r) in threshSymbTfmtTime.items()}
    threshTime = {t: r for t, (s, r) in threshSymbTfmtTime.items()}

    print('\nCalculate Format Match Score...')
    swstart = time.time()
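
The two comprehensions after iterSimilarities above split a dict of 2-tuples into two parallel dicts keyed by threshold; a self-contained illustration of the pattern:

pairs = {50: ("symbols@50", 1.2), 51: ("symbols@51", 1.4)}
firsts = {t: s for t, (s, r) in pairs.items()}   # {50: "symbols@50", 51: "symbols@51"}
seconds = {t: r for t, (s, r) in pairs.items()}  # {50: 1.2, 51: 1.4}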
Example #4
                        default=False,
                        action='store_true')
    args = parser.parse_args()
    if not isfile(args.pcapfilename):
        print('File not found: ' + args.pcapfilename)
        exit(1)

    sigma = 0.6 if not args.sigma else args.sigma

    print("Load messages...")
    specimens = SpecimenLoader(args.pcapfilename,
                               layer=args.layer,
                               relativeToIP=args.relativeToIP)
    comparator = MessageComparator(specimens,
                                   layer=args.layer,
                                   relativeToIP=args.relativeToIP,
                                   failOnUndissectable=False,
                                   debug=debug)

    ########################

    print("Segment messages...")
    inferenceTitle = 'bcDeltaGauss{:.1f}'.format(sigma)  # +hiPlateaus

    startsegmentation = time.time()
    segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, sigma)
    runtimeSegmentation = time.time() - startsegmentation
    refinedPerMsg = refinements(segmentsPerMsg, None)
    runtimeRefinement = time.time() - startsegmentation

    print('Segmented and refined in {:.3f}s'.format(time.time() -
                                                    startsegmentation))
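
A follow-up sketch, assuming segmentsPerMsg and refinedPerMsg are lists of per-message segment lists as the surrounding code suggests, to quantify the effect of the refinement step:

rawCount = sum(len(segs) for segs in segmentsPerMsg)
refCount = sum(len(segs) for segs in refinedPerMsg)
print("Refinement changed the total segment count from {} to {}".format(
    rawCount, refCount))
# both runtimes are measured from startsegmentation, so the difference is the
# refinement-only time
print("Segmentation: {:.3f}s, refinement: {:.3f}s more".format(
    runtimeSegmentation, runtimeRefinement - runtimeSegmentation))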
Example #5
def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore],
                runtime: float,
                specimens: SpecimenLoader,
                comparator: MessageComparator,
                inferenceTitle: str,
                folder="reports"):

    absFolder = abspath(folder)
    if not isdir(absFolder):
        raise NotADirectoryError(
            "The reports folder {:s} is not a directory. Reports cannot be written there."
            .format(absFolder))
    pcapName = splitext(basename(specimens.pcapFileName))[0]
    reportFolder = join(
        absFolder, pcapName + "_{}_{}".format(
            inferenceTitle, time.strftime("%Y%m%d-%H%M%S", time.localtime())))
    os.makedirs(reportFolder)

    print('Write report to ' + reportFolder)

    # write Format Match Score and Metrics to csv
    with open(os.path.join(reportFolder, 'FormatMatchMetrics.csv'),
              'w') as csvfile:
        fmmcsv = csv.writer(csvfile)
        fmmcsv.writerow(["Message", "Score", 'I', 'M', 'N', 'S', 'MG', 'SP'])
        fmmcsv.writerows([
            (message.data.hex(), fms.score, fms.inferredCount, fms.exactCount,
             fms.nearCount, fms.specificy, fms.matchGain, fms.specificyPenalty)
            for message, fms in formatmatchmetrics.items()
        ])

    minmeanmax = getMinMeanMaxFMS(
        [q.score for q in formatmatchmetrics.values()])

    with open(os.path.join(reportFolder, 'ScoreStatistics.csv'),
              'w') as csvfile:
        fmmcsv = csv.writer(csvfile)
        fmmcsv.writerow(["Inference", "min", "mean", "max", "runtime"])
        fmmcsv.writerow([inferenceTitle, *minmeanmax, runtime])

    # write Symbols to csvs
    multipleSymbolCSVs = False
    if multipleSymbolCSVs:
        # the set comprehension removes symbols that occur multiple times
        # because they carry multiple formats
        for cnt, symbol in enumerate(
                {quality.symbol for quality in formatmatchmetrics.values()}):
            fileNameS = 'Symbol_{:s}_{:d}'.format(symbol.name, cnt)
            with open(os.path.join(reportFolder, fileNameS + '.csv'),
                      'w') as csvfile:
                symbolcsv = csv.writer(csvfile)
                symbolcsv.writerow([field.name for field in symbol.fields])
                symbolcsv.writerows([val.hex() for val in msg]
                                    for msg in symbol.getCells())
    else:
        fileNameS = 'Symbols'
        with open(os.path.join(reportFolder, fileNameS + '.csv'),
                  'w') as csvfile:
            symbolcsv = csv.writer(csvfile)
            msgcells = chain.from_iterable([
                sym.getCells() for sym in  # unique symbols by set
                {fms.symbol
                 for fms in formatmatchmetrics.values()}
            ])
            symbolcsv.writerows([val.hex() for val in msg] for msg in msgcells)

    # # write tshark-dissection to csv
    # # currently only unique formats. For a specific trace a baseline could be determined
    # # by a one time run of per ParsedMessage
    # with open(os.path.join(reportFolder, 'tshark-dissections.csv'), 'w') as csvfile:
    #     formatscsv = csv.writer(csvfile)
    #     revmsg = {l2m: l5m for l5m, l2m in specimens.messagePool.items()}  # get L5 messages for the L2 in tformats
    #     formatscsv.writerows([(revmsg[l2m].data.hex(), f) for l2m, f in tformats.items()])

    # map score -> symbol to look up the symbols for the min/mean/max scores
    score2symbol = {
        fms.score: fms.symbol
        for fms in formatmatchmetrics.values()
    }

    tikzcode = comparator.tprintInterleaved(score2symbol[mmm]
                                            for mmm in minmeanmax)

    # write the inferences with the min/mean/max Format Match Scores as a tikz example figure
    with open(join(reportFolder, 'example-inference-minmeanmax.tikz'),
              'w') as tikzfile:
        tikzfile.write(tikzcode)
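
A hedged call sketch: formatmatchmetrics is assumed to map each AbstractMessage to its FormatMatchScore (as the signature requires), with specimens, comparator, and inferenceTitle as in the preceding examples.

swstart = time.time()
# ... inference producing formatmatchmetrics ...
writeReport(formatmatchmetrics, time.time() - swstart,
            specimens, comparator, inferenceTitle)
# the report folder then contains FormatMatchMetrics.csv, ScoreStatistics.csv,
# Symbols.csv, and example-inference-minmeanmax.tikz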
Example #6
def cacheAndLoadDC(pcapfilename: str, analysisTitle: str, tokenizer: str, debug: bool,
                   analyzerType: type, analysisArgs: Tuple = None, sigma: float = None,
                   filterTrivial=False, refinementCallback: Union[Callable, None] = refinements,
                   disableCache=False) \
        -> Tuple[SpecimenLoader, MessageComparator, List[Tuple[MessageSegment]],
                 DistanceCalculator, float, float]:
    """
    cache or load the DistanceCalculator to or from the filesystem


    :param filterTrivial: Filter out **one-byte** segments and such just consisting of **zeros**.
    :param disableCache: When experimenting with distances manipulation, deactivate caching!
    :return:
    """
    pcapbasename = os.path.basename(pcapfilename)
    # if refinementCallback == pcaMocoRefinements:
    #     sigma = pcamocoSigmapertrace[pcapbasename] if not sigma and pcapbasename in pcamocoSigmapertrace else \
    #         0.9 if not sigma else sigma
    # else:
    # use the per-trace sigma if none was given and one is known for this trace,
    # otherwise fall back to 0.9
    if not sigma:
        sigma = sigmapertrace.get(pcapbasename, 0.9)
    pcapName = os.path.splitext(pcapbasename)[0]
    # noinspection PyUnboundLocalVariable
    tokenparm = tokenizer if tokenizer != "nemesys" else \
        "{}{:.0f}".format(tokenizer, sigma * 10)
    dccachefn = os.path.join(
        cacheFolder, 'cache-dc-{}-{}-{}-{}-{}.{}'.format(
            analysisTitle, tokenparm, "filtered" if filterTrivial else "all",
            refinementCallback.__name__
            if refinementCallback is not None else "raw", pcapName, 'ddc'))
    # dccachefn = 'cache-dc-{}-{}-{}.{}'.format(analysisTitle, tokenizer, pcapName, 'dc')
    if disableCache or not os.path.exists(dccachefn):
        # dissect and label messages
        print("Load messages from {}...".format(pcapName))
        specimens = SpecimenLoader(pcapfilename, 2, True)
        comparator = MessageComparator(specimens, 2, True, debug=debug)

        print("Segmenting messages...", end=' ')
        segmentationTime = time.time()
        # select tokenizer by command line parameter
        if tokenizer == "tshark":
            # 1. segment messages according to true fields from the labels
            segmentedMessages = annotateFieldTypes(analyzerType, analysisArgs,
                                                   comparator)
        elif tokenizer == "4bytesfixed":
            # 2. segment messages into fixed size chunks for testing
            segmentedMessages = segmentsFixed(4, comparator, analyzerType,
                                              analysisArgs)
        elif tokenizer == "nemesys":
            # 3. segment messages by NEMESYS
            segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, sigma)

            # get analyzer requested by analyzerType/analysisArgs
            segmentedMessages = [[
                MessageSegment(
                    MessageAnalyzer.findExistingAnalysis(
                        analyzerType, MessageAnalyzer.U_BYTE, seg.message,
                        analysisArgs), seg.offset, seg.length) for seg in msg
            ] for msg in segmentsPerMsg]

            if refinementCallback is not None:
                if refinementCallback.__code__.co_argcount > 1:
                    # assume the second argument is expected to be a distance calculator
                    chainedSegments = list(
                        chain.from_iterable(segmentedMessages))
                    print("Refinement: Calculate distance for {} segments...".
                          format(len(chainedSegments)))
                    if len(chainedSegments)**2 > MemmapDC.maxMemMatrix:
                        refinementDC = MemmapDC(chainedSegments)
                    else:
                        refinementDC = DelegatingDC(chainedSegments)
                    segmentedMessages = refinementCallback(
                        segmentedMessages, refinementDC)
                else:
                    segmentedMessages = refinementCallback(segmentedMessages)

            # segments = list(chain.from_iterable(segmentedMessages))

        segmentationTime = time.time() - segmentationTime
        print("done.")

        if filterTrivial:
            # noinspection PyUnboundLocalVariable
            chainedSegments = [
                seg for seg in chain.from_iterable(segmentedMessages)
                if seg.length > 1 and set(seg.values) != {0}
            ]
        else:
            # noinspection PyUnboundLocalVariable
            chainedSegments = list(chain.from_iterable(segmentedMessages))

        print("Calculate distance for {} segments...".format(
            len(chainedSegments)))
        # dc = DistanceCalculator(chainedSegments, reliefFactor=0.33)  # Pairwise similarity of segments: dc.distanceMatrix
        dist_calc_segmentsTime = time.time()
        if len(chainedSegments)**2 > MemmapDC.maxMemMatrix:
            dc = MemmapDC(chainedSegments)
        else:
            dc = DelegatingDC(chainedSegments)
        assert chainedSegments == dc.rawSegments
        dist_calc_segmentsTime = time.time() - dist_calc_segmentsTime
        try:
            with open(dccachefn, 'wb') as f:
                pickle.dump((segmentedMessages, comparator, dc), f,
                            pickle.HIGHEST_PROTOCOL)
        except MemoryError:
            print("DC could not be cached due to a MemoryError. Removing",
                  dccachefn, "and continuing.")
            os.remove(dccachefn)
    else:
        print("Load distances from cache file {}".format(dccachefn))
        with open(dccachefn, 'rb') as f:
            segmentedMessages, comparator, dc = pickle.load(f)
        if not (isinstance(comparator, MessageComparator)
                and isinstance(dc, DistanceCalculator)):
            print('Loading of cached distances failed.')
            exit(10)
        specimens = comparator.specimens
        # chainedSegments = list(chain.from_iterable(segmentedMessages))
        segmentationTime, dist_calc_segmentsTime = None, None

    return specimens, comparator, segmentedMessages, dc, segmentationTime, dist_calc_segmentsTime
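
A usage sketch for cacheAndLoadDC: SomeAnalyzer is a pure placeholder for one of the project's MessageAnalyzer subclasses, and the analysis title string is likewise illustrative.

specimens, comparator, segmentedMessages, dc, segTime, dcTime = cacheAndLoadDC(
    "input/example.pcap", "value", "nemesys", False,
    SomeAnalyzer, sigma=0.6)  # SomeAnalyzer: placeholder analyzer type
# segTime and dcTime are None when the results were loaded from the cache file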
        'Adjust width/aspect ratio for use in one USENIX column wide plot (1) or '
        'for one USENIX column sideways leaving space for the caption (2)')
    args = parser.parse_args()
    if not isfile(args.pcapfilename):
        print('File not found: ' + args.pcapfilename)
        exit(1)

    sigma = 0.6 if not args.sigma else args.sigma

    print("Load messages...")
    specimens = SpecimenLoader(args.pcapfilename,
                               layer=args.layer,
                               relativeToIP=args.relativeToIP)
    comparator = MessageComparator(specimens,
                                   layer=args.layer,
                                   relativeToIP=args.relativeToIP,
                                   failOnUndissectable=False,
                                   debug=debug)

    ########################

    print("Segment messages...")
    startsegmentation = time.time()

    inferenceTitle = 'bcDeltaGauss{:.1f}'.format(sigma)  # +hiPlateaus
    segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, sigma)
    refinedPerMsg = refinements(segmentsPerMsg)

    print('Segmented and refined in {:.3f}s'.format(time.time() -
                                                    startsegmentation))
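
To inspect the result, a small sketch that relies only on the offset and length attributes that segments expose in Example #6, assuming refinedPerMsg is a list of per-message segment lists:

for seg in refinedPerMsg[0]:  # boundaries of the first refined message
    print("offset {:4d}, length {:2d}".format(seg.offset, seg.length))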