Example #1
def set_union(itr1, itr2):
    # Union of two sorted iterables; duplicates across the inputs are emitted once.
    # Assumes itr1's type supports append(), e.g. a list.
    p1 = peekable(itr1)
    p2 = peekable(itr2)
    result = type(itr1)()

    while p1.hasNext() and p2.hasNext():
        i1 = p1.peek()
        i2 = p2.peek()

        if i1 < i2:
            result.append(i1)
            next(p1)
        elif i2 < i1:
            result.append(i2)
            next(p2)
        else:
            result.append(i1)
            next(p1)
            next(p2)

    while p1.hasNext():
        result.append(next(p1))
    while p2.hasNext():
        result.append(next(p2))
    return result
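These examples call hasNext(), isLast() and peek() on the peekable wrapper, an interface that more_itertools.peekable does not provide, so the projects below evidently ship their own class. A minimal sketch of the assumed wrapper (hypothetical; the exact exhaustion behavior varies between the examples, see the note on peek()):

class peekable(object):
    """Minimal sketch: wraps an iterator and caches one look-ahead item."""
    _sentinel = object()

    def __init__(self, iterable):
        self._it = iter(iterable)
        self._cache = self._sentinel

    def _fill(self):
        # Pull the next item into the cache if the cache is empty.
        if self._cache is self._sentinel:
            try:
                self._cache = next(self._it)
            except StopIteration:
                pass

    def hasNext(self):
        self._fill()
        return self._cache is not self._sentinel

    def isLast(self):
        # "Last" here means exhausted, matching the `while not it.isLast()` loops below.
        return not self.hasNext()

    def peek(self, default=_sentinel):
        # NOTE: Example #2 instead expects peek() on an exhausted input to return an
        # incomparable sentinel (so that `i1 < i2` raises TypeError) rather than raise.
        self._fill()
        if self._cache is self._sentinel:
            if default is self._sentinel:
                raise StopIteration
            return default
        return self._cache

    def __iter__(self):
        return self

    def __next__(self):
        self._fill()
        if self._cache is self._sentinel:
            raise StopIteration
        value, self._cache = self._cache, self._sentinel
        return value

    next = __next__  # Python 2 style it.next(), used by some of the examples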
Example #2
def set_union(itr1, itr2):  #pragma: no cover
    # Union variant that drains whichever input remains: once one side is exhausted,
    # comparing against its peek() is expected to raise TypeError, and the except
    # branch copies the rest of the other side.
    p1 = peekable(itr1)
    p2 = peekable(itr2)
    result = type(itr1)()
    while p1.hasNext() or p2.hasNext():
        i1 = p1.peek()
        i2 = p2.peek()

        try:
            if i1 < i2:
                result.append(i1)
                next(p1)
            elif i2 < i1:
                result.append(i2)
                next(p2)
            else:
                result.append(i1)
                next(p1)
                next(p2)
        except TypeError:
            if p1.isLast() and not p2.isLast():
                result.append(i2)
                next(p2)
            elif p2.isLast() and not p1.isLast():
                result.append(i1)
                next(p1)
    return result
Example #3
def set_intersection(itr1, itr2):  #pragma: no cover
    # Intersection of two sorted iterables: advance the side with the smaller head,
    # emit on equality.
    p1 = peekable(itr1)
    p2 = peekable(itr2)
    result = type(itr1)()
    while p1.hasNext() and p2.hasNext():
        i1 = p1.peek()
        i2 = p2.peek()
        if i1 < i2:
            next(p1)
        elif i2 < i1:
            next(p2)
        else:
            result.append(i1)
            next(p1)
            next(p2)
    return result
Example #4
def set_diff(itr1, itr2):  #pragma: no cover
    # Difference itr1 - itr2: linear scan of itr2 for every element of itr1 (O(n*m));
    # unlike the merge-based helpers above, this does not require sorted input.
    p1 = peekable(itr1)
    result = type(itr1)()
    flag = False

    while p1.hasNext():
        i1 = p1.peek()
        p2 = peekable(itr2)
        flag = False
        while p2.hasNext():
            i2 = p2.peek()
            if i2 == i1:
                flag = True
                break
            else:
                next(p2)

        if not flag:
            result.append(i1)
        next(p1)
    return result
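A quick usage sketch for the three helpers above, assuming plain sorted lists and a peekable wrapper like the sketch under Example #1:

a = [1, 3, 5, 7]
b = [3, 4, 7, 9]
print(set_union(a, b))         # [1, 3, 4, 5, 7, 9]
print(set_intersection(a, b))  # [3, 7]
print(set_diff(a, b))          # [1, 5]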
Example #5
 def strip_bash_funcs(self, iterable):
     """Strip bash functions from given lines of bash code
     Return a list with no functions a 2nd list of the missing functions"""
     it = []
     funcs = []
     peek = peekable(iterable)
     for elt in peek:
         if '()' in elt and (elt.strip().endswith('{') or
                     peek.peek().strip().startswith('{')):
             funcs.append([])
             while '}' not in elt:
                 elt = next(peek)
                 funcs[-1].append(elt)
         it.append(elt)
     return it, funcs
Example #6
def joined_dotted(names):
    names = peekable(names)
    temp = []
    for name in names:
        try:
            peek = names.peek()
        except StopIteration:
            peek = None
        if peek == '.':
            temp.append(name)
            names.next() # soak up the dot op
            continue
        if temp:
            temp.append(name)
            yield '.'.join(temp)
            temp = []
            continue
        yield name
    if temp:
        yield '.'.join(temp)
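joined_dotted re-joins name tokens that were split around '.' operators; a small usage sketch with made-up tokens:

tokens = ['os', '.', 'path', '.', 'join', 'x']
print(list(joined_dotted(tokens)))  # -> ['os.path.join', 'x']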
Example #7
def perform_semantic_inference(cluster_collection):
    """
    This function performs semantic inference on a given list of clusters.
    For each message in these clusters, semantics are inferred by analyzing each token
    and its context.

    At the moment only two semantics are automatically inferred: numeric and IPv4 address.

    TODO: Add more semantics, e.g. EOL identifier, length fields, ...
    """
    # Try to perform semantic inferences

    # Walk through every cluster and check messages for obvious results
    cluster = cluster_collection.get_all_cluster()
    for c in cluster:
        messages = c.get_messages()
        for message in messages:
            tokenlist = message.get_tokenlist()
            iterator = peekable(tokenlist)
            idx = 0
            while not iterator.isLast():
                #for tokenRepresentation in tokenlist:
                tokenRepresentation = iterator.next()
                # TODO: do we need to keep semantics which involve multiple cluster? e.g. sessionids?
                previous_semantics = tokenRepresentation.get_semantics()
                tokenRepresentation.set_semantics([])  # Clear existing semantics from previous run
                #for s in previous_semantics:
                #    if s.startswith("sessionid"):
                #        tokenRepresentation.add_semantic(s)
                #        break

                if "sessionid" in previous_semantics:
                    # Check if we have at least 2 messages and we are not of type Const
                    if len(messages) > 1 and c.get_format(idx) != Message.typeConst:
                        tokenRepresentation.add_semantic("sessionid")
                if "FD" in previous_semantics:
                    tokenRepresentation.add_semantic("FD")

                token = tokenRepresentation.get_token()
                # Check whether it is numeric

                try:
                    isNumber = (tokenRepresentation.get_tokenType() == Message.typeText
                                and common.is_number(token))
                except TypeError:
                    if Globals.getConfig().debug:
                        print "Error checking token {0} for number semantics".format(token)
                    isNumber = False
                if isNumber:
                    tokenRepresentation.add_semantic("numeric")
                    #c.add_semantics(idx,"numeric")
                    #print "Inferred semantic inference 'numeric' for token ", token

                # Check whether it is an IP address
                if isinstance(token, str) and common.is_ipv4(token):
                    tokenRepresentation.add_semantic("ipv4 address")
                    # Do not add to cluster unless it is valid for all: c.add_semantics(idx,"ipv4 address")
                    #print "Inferred semantic inference 'ipv4 address' for token ", token

                # Check for carriage return identifiers
                # When 0d is followed by 0a we've got a CR-LF
                # Sensible? When 0d or 0a is the last token, we've got a single CR or LF, respectively
                # In all other cases assume 0d/0a is just a hex value of the protocol
                if token == 0xd:
                    nextOne = iterator.peek()
                    if isinstance(nextOne, TokenRepresentation):
                        if nextOne.get_token() == 0xa:
                            inferred_formats = c.get_format_inference()
                            if (inferred_formats[idx].getType() == Message.typeConst
                                    and inferred_formats[idx + 1].getType() == Message.typeConst):
                                tokenRepresentation.add_semantic("CR")
                                #c.add_semantics(idx,"CR")
                                nextOne = iterator.next()
                                nextOne.set_semantics(["LF"])
                                #c.add_semantics(idx+1, "LF")
                                idx += 1

                idx += 1
        # Perform other tests like "is length field?"
        # Explicitly iterate through all messages as stated in the paper;
        # we could also postpone this to the call of 'pushToClusterSemantics', but...

        reference_message = messages[0]
        tokenlist = reference_message.get_tokenlist()
        idx = 0
        for tokenRepresentation in tokenlist:
            if tokenRepresentation.get_tokenType() == Message.typeBinary and idx + 1 < len(tokenlist):
                ref_value = tokenRepresentation.get_token()
                # We require that the next token is the text token in question
                if not tokenlist[idx + 1].get_tokenType() == Message.typeText:
                    idx += 1
                    continue
                ref_next_length = tokenlist[idx + 1].get_length()
                if not ref_value == ref_next_length:  # This is no length field
                    idx += 1
                    continue
                ref_message_length = reference_message.get_length()
                is_length = True
                for message in messages:
                    cmp_value = message.get_tokenlist()[idx].get_token()
                    cmp_next_length = message.get_tokenlist()[idx + 1].get_length()
                    cmp_message_length = message.get_length()
                    try:
                        diff_val = abs(cmp_value - ref_value)
                    except TypeError:  # Could happen if a short text token is mistaken as a binary value
                        break
                    diff_next_length = abs(cmp_next_length - ref_next_length)
                    # The next line also takes total msg length differences into account. This might not be true for
                    # all protocols
                    diff_msg_length = abs(cmp_message_length - ref_message_length)

                    if Globals.getConfig().requireTotalLengthChangeForLengthField:
                        if not (diff_val == diff_next_length == diff_msg_length):
                            is_length = False
                        break
                    else:
                        if not (diff_val == diff_next_length):
                            is_length = False
                            break

                if is_length:  # set "lengthfield" semantic for every message in the cluster at the given position
                    for message in messages:  # TODO: What if there's only one message in the cluster? Sensible?
                        message.get_tokenlist()[idx].add_semantic("lengthfield")
                        c.add_semantic_for_token(idx, "lengthfield")
            idx += 1

        # Try to identify sessionid fields

        reference_message = messages[0]
        nextInFlow = reference_message.getNextInFlow()
        if nextInFlow != None and not (
                len(messages) == 1
                and Globals.getConfig().sessionIDOnlyWithClustersWithMoreThanOneMessage):
            tokenlist = reference_message.get_tokenlist()
            next_tokenlist = nextInFlow.get_tokenlist()
            ref_idx = 0
            for tokenRepresentation in tokenlist:
                tokType = tokenRepresentation.get_tokenType()
                # If it's not a binary, it cannot be a cookie
                if tokType != Message.typeBinary:
                    ref_idx += 1
                    continue
                fmt = c.get_format(ref_idx)
                # If it's a binary but const, it cannot be a cookie
                if fmt[1] == Message.typeConst:
                    ref_idx += 1
                    continue
                # Set reference value
                ref_val = tokenRepresentation.get_token()
                # Walk next flow for reference value
                next_idx = 0
                for next_tokenRepresentation in next_tokenlist:
                    # Retrieve next token type
                    nextTokType = next_tokenRepresentation.get_tokenType()
                    # If it is not a binary we don't see it as a cookie
                    if Globals.getConfig().sessionIDOnlyWithBinary:
                        if nextTokType != Message.typeBinary:
                            next_idx += 1
                            continue
                    next_cluster = nextInFlow.getCluster()
                    # Get format of the message being compared
                    comp_fmt = next_cluster.get_format(next_idx)
                    # If it is const, it cannot be a sessionid
                    if comp_fmt[1] == Message.typeConst:
                        next_idx += 1
                        continue
                    # Load comparator value
                    comp_val = next_tokenRepresentation.get_token()
                    if ref_val == comp_val:  # We've got a potential hit, now compare all messages for the same idx pairs
                        isCookie = True
                        for cmp_ref_msg in messages:
                            if not isCookie:
                                break
                            if cmp_ref_msg == messages[0]:  # Skip first message (we've already checked that one)
                                continue
                            cmp_ref_tok_list = cmp_ref_msg.get_tokenlist()

                            cmp_ref_val = cmp_ref_tok_list[ref_idx].get_token()
                            cmp_cmp_msg = cmp_ref_msg.getNextInFlow()
                            if cmp_cmp_msg == None:
                                isCookie = False
                            else:
                                cmp_cmp_tok_list = cmp_cmp_msg.get_tokenlist()
                                if next_idx >= len(cmp_cmp_tok_list):
                                    # Obviously "next" points to messages in different clusters
                                    # so the len might differ from the reference next cluster
                                    # used to find our reference cookie value
                                    # Therefore this cannot be a cookie
                                    isCookie = False
                                    continue
                                # Make sure the comparing token is also not constant
                                cmp_cmp_fmt = cmp_cmp_msg.getCluster().get_format(next_idx)
                                # If it is const, it cannot be a sessionid
                                if cmp_cmp_fmt == Message.typeConst:
                                    isCookie = False
                                    continue

                                # Finally compare the values
                                cmp_cmp_val = cmp_cmp_tok_list[next_idx].get_token()
                                if (cmp_ref_val != cmp_cmp_val) or (
                                        (cmp_ref_val == cmp_cmp_val) and (cmp_ref_val == ref_val)):
                                    isCookie = False
                        if isCookie:
                            # Set cookie semantic in this message and the other
                            #sessionid = uuid.uuid1()
                            for message in messages:  # Set for every message and the cluster itself
                                #message.get_tokenlist()[ref_idx].add_semantic("sessionid_{0}".format(sessionid))
                                message.get_tokenlist()[ref_idx].add_semantic("sessionid")
                                nextMsg = message.getNextInFlow()
                                #nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid_{0}".format(sessionid))
                                nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid")
                            c.add_semantic_for_token(ref_idx, "sessionid")
                            #c.add_semantic_for_token(ref_idx,"sessionid_{0}".format(sessionid))
                    next_idx += 1
                ref_idx += 1

        # Try to find random fields (16 bit)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType() == Message.typeVariable and rep == Message.typeBinary:
                try:
                    variance = c.getVariableStatistics()[idx].getVariance()
                except Exception:
                    variance = 0  # no statistics for this token; treat as not random

                if variance > 1000 and len(semantics) == 0:
                    # We've got a very high variance and no assigned semantics --> candidate for random
                    # Have a look at the last but one token
                    if idx - 1 >= 0:
                        rep, form, semantics = token_formats[idx - 1]
                        if form.getType() == Message.typeVariable and rep == Message.typeBinary:
                            stats = c.getVariableStatistics()[idx - 1]
                            if stats != None:
                                variance2 = stats.getVariance()
                            else:
                                logging.error(
                                    "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})".format(
                                        idx, len(token_formats), len(c.getVariableStatistics())))
                                idx += 1
                                continue

                            if variance2 > 1000 and len(semantics) == 0:
                                # Consider the two as a CRC-16
                                for message in messages:  # Set for every message and the cluster itself
                                    message.get_tokenlist()[idx - 1].add_semantic("random")
                                    message.get_tokenlist()[idx].add_semantic("random")
                                c.add_semantic_for_token(idx - 1, "random")
                                c.add_semantic_for_token(idx, "random")
            idx += 1

        # Try to find sets (values limited in variability, with lower and upper bounds)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType() == Message.typeVariable:
                stats = c.getVariableStatistics()[idx]
                if stats != None:
                    distinct = stats.numberOfDistinctSamples()
                else:
                    logging.error(
                        "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})".format(
                            idx, len(token_formats), len(c.getVariableStatistics())))
                    idx += 1
                    continue
                # How will we find out whether a number of variable values is a set or really variable?
                # We assume that there is an absolute maximum amount of distinct values which is independent
                # of the actual number of messages. However, we also need to consider that when the number of
                # messages in a cluster definitely falls below the setAbsoluteMax value, we have to adapt
                # the local maximum in this cluster.
                # For the moment we take a multiplier for the number of messages (default 0.3 == 30%) and
                # assume it is a set when both setAbsoluteMax and the local threshold are underrun.
                # In addition we assume that we have no semantics for this token, as other semantics conflict
                # with the notion of a set.
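                # Example: with setAbsoluteMax = 50, setPercentageThreshold = 0.3 and 60
                # messages in the cluster, a token only qualifies as a set if it has at
                # most min(50, 60 * 0.3) = 18 distinct values (and no semantics assigned).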

                if (distinct <= Globals.getConfig().setAbsoluteMax
                        and distinct <= (c.getNumberOfMessages() * Globals.getConfig().setPercentageThreshold)
                        and len(semantics) == 0):
                    for message in messages:  # Set for every message and the cluster itself
                        message.get_tokenlist()[idx].add_semantic("set")
                    c.add_semantic_for_token(idx, "set")
            idx += 1
    # Push to cluster
    pushUpToCluster(cluster_collection)
Example #8
    def calc_regExVisual(self):
        regexstr = "^"
        idx = 0
        iterator = peekable(self.get('format_inference'))
        while not iterator.isLast():
            item = iterator.next()
            tokType = self.get('representation')[idx]
            if tokType == Message.typeDirection:
                # Add a \s* before the first text token
                if not iterator.isLast():
                    if self.get('representation')[idx + 1] == Message.typeText:
                        # Only adding [\\t| ]* will not work for multiline texts like in ftp banners. Here we will need the \r \n as well.
                        # So get back to \s instead of "[\t| ]*"
                        #regexstr += "[\\t| ]*"
                        regexstr += "\\s*"
            else:
                if isinstance(item, formatinference.Constant):
                    #if isinstance(item.getConstValue(),str):
                    if self.get('representation')[idx] == Message.typeText:
                        val = item.getConstValue()
                        # Order matters!
                        val = string.replace(val, "\\", "\\\\")
                        val = string.replace(val, "(", "\(")
                        val = string.replace(val, ")", "\)")
                        val = string.replace(val, ".", "\.")
                        val = string.replace(val, "{", "\{")
                        val = string.replace(val, "}", "\}")
                        val = string.replace(val, "]", "\]")
                        val = string.replace(val, "[", "\[")
                        val = string.replace(val, "*", "\*")
                        val = string.replace(val, "?", "\?")
                        val = string.replace(val, "$", "\$")

                        regexstr += val
                    else:
                        val = hex(item.getConstValue())[2:]
                        if len(val) == 1:
                            val = "0{0}".format(val)
                        regexstr += "\\x{0}".format(val)
                elif isinstance(item, formatinference.Variable):
                    stats = self.getVariableStatistics()[idx]
                    min = 1
                    max = 1
                    if stats != None:
                        if isinstance(stats,
                                      formatinference.VariableTextStatistics):
                            min = len(stats.getShortest())
                            max = len(stats.getLongest())
                        else:  # We"re VariableBinaryStatistics
                            # min/max is always 1

                            #s = str(stats.getMin())
                            #min = len(s)
                            #s = str(stats.getMax())
                            #max = len(s)
                            min = 1
                            max = 1
                        if min == max:
                            regexstr += "."
                            if min > 1:
                                regexstr += "{" + str(min) + "}"
                        else:
                            regexstr += ".{" + str(min) + "," + str(max) + "}"
                    else:
                        regexstr += ".+"

                #===============================================================
                # if tokType == Message.typeText:
                #    # peek ahead if next is also text
                #    # Add separator for tokenseparator (nothing by bin-bin, bin-text, text-bin but whitespace when text-text
                #    # text-text is separated by \s (whitespace)
                #    nextOne = iterator.peek()
                #    if nextOne!=peekable.sentinel:
                #        nextType = self.get('representation')[idx+1]
                #        if nextType == Message.typeText:
                #            regexstr += "\s" # Add whitespace token separator
                #===============================================================

                curType = self.get('representation')[idx]
                if iterator.isLast():
                    if curType == Message.typeText:
                        # See comment on top why "[\\t| ]*" is not enough
                        #regexstr += "[\\t| ]*"
                        regexstr += "\\s*"
                else:
                    nextType = self.get('representation')[idx + 1]
                    if (curType == Message.typeBinary
                            and nextType == Message.typeText) or (
                                curType == Message.typeText
                                and nextType == Message.typeBinary):
                        # See comment on top why "[\\t| ]*" is not enough
                        #regexstr += "[\\t| ]*"
                        regexstr += "\\s*"
                    elif curType == Message.typeText and nextType == Message.typeText:
                        # See comment on top why "[\\t| ]+" is not enough
                        #regexstr += "[\\t| ]+"
                        regexstr += "\\s+"
            idx += 1
        if not (Globals.getConfig().calculateMaxMessageLength
                or Globals.getConfig().danglingRegEx):
            regexstr += "$"
        return regexstr
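The replace chain in the Constant branch escapes regex metacharacters by hand; the standard library's re.escape backslash-escapes all of them in one call, so the chain could arguably be collapsed to val = re.escape(item.getConstValue()). A self-contained demonstration:

import re

# re.escape backslash-escapes every regex metacharacter, covering the same
# characters as the replace chain above (and more):
print(re.escape("a.b(c)*$"))  # -> a\.b\(c\)\*\$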
Example #9
    def calc_regEx(self):
        if Globals.isText():
            # When we are text, only use visual regex
            return self.getRegExVisual()

        regexstr = "^"
        idx = 0
        iterator = peekable(self.get('format_inference'))
        content_msg = self.get_messages()[0]
        # Using only the first message COULD be a problem:
        # In cases where we have trailing whitespaces in one of the messages but not in the first one,
        # the regex will not include these and will not be valid for that message. As a result this
        # message won't be parsed correctly in statemachine_accepts. See below for a workaround.
        while not iterator.isLast():
            item = iterator.next()
            tokType = self.get('representation')[idx]
            if tokType == Message.typeDirection:
                # Add a \s* before the first text token
                if not iterator.isLast():
                    if self.get('representation')[idx + 1] == Message.typeText:
                        regexstr += "(?:20)*"
            else:
                token = content_msg.get_tokenAt(idx)
                startsAt = token.get_startsAt()
                length = token.get_length()

                if isinstance(item, formatinference.Constant):
                    #if isinstance(item.getConstValue(),str):
                    payload = content_msg.get_payload()[startsAt:startsAt + length]
                    s = "".join([str(elem) for elem in payload])
                    regexstr += s

                elif isinstance(item, formatinference.Variable):
                    #regexstr += "*?" # Non greedy match - will lead to lock ups
                    #regexstr += ".*" # Non greedy match - will lead to lock ups
                    #regexstr += "((?!20).)+" # Negative lookup for whitespace - does not read bytewise - will abort also on X20Xaabb200a0d

                    # New approach:
                    # Do not use .* but try to be more explicit: [0-9a-f]{2} reads multiple hex values. The trailing {...} value determines how often
                    # these are observed in a cluster (based on the VariableText/NumberStatistics)

                    stats = self.getVariableStatistics()[idx]
                    min = 1
                    max = 1
                    if stats != None:
                        if isinstance(stats,
                                      formatinference.VariableTextStatistics):
                            min = len(stats.getShortest())
                            max = len(stats.getLongest())
                        else:  # This is VariableBinaryStatistics
                            # Binary length is always 1 because we only look at one-byte values;
                            # min/max length is invalid here
                            ##s = str(stats.getMin())
                            ##min = len(s)
                            ##s = str(stats.getMax())
                            ##max = len(s)
                            min = length
                            max = length

                        if min == max:
                            #regexstr += "(?:[0-9a-f]{2}){" + str(min) + "}"

                            regexstr += "(?:[0-9a-f]{2})"
                            if min > 1:
                                regexstr += "{" + str(min) + "}"
                        else:
                            regexstr += "(?:[0-9a-f]{2}){" + str(
                                min) + "," + str(max) + "}"
                    else:
                        regexstr += "(?:[0-9a-f]{2})+"

                #===============================================================
                # if not iterator.isLast():
                #
                #    gotGap = False
                #    nextStart = content_msg.get_tokenAt(idx+1).get_startsAt()
                #    if nextStart!=startsAt+length:
                #    #if nextOne.get_startsAt()!=startsAt+length+1:
                #        regexstr += "(?:20)+"
                #        gotGap = True
                #
                #    # Added 20120409 to compensate for trailing WS tokens
                #    if not gotGap:
                #===============================================================
                # Add (?:20)* token at the beginning and end of text/binary tokens
                # This copes with the problem that Hello_World_ and Hello_World evaluate to the same format, but should have different regexes
                curType = content_msg.get_tokenAt(idx).get_tokenType()
                if iterator.isLast():
                    if curType == Message.typeText:
                        regexstr += "(?:20)*"
                else:
                    nextType = content_msg.get_tokenAt(idx + 1).get_tokenType()
                    if (curType == Message.typeBinary
                            and nextType == Message.typeText) or (
                                curType == Message.typeText
                                and nextType == Message.typeBinary):
                        regexstr += "(?:20)*"
                    elif curType == Message.typeText and nextType == Message.typeText:
                        regexstr += "(?:20)+"

                #===============================================================
                #
                # if tokType == Message.typeText:
                #    # peek ahead if next is also text
                #    # Add separator for tokenseparator (nothing by bin-bin, bin-text, text-bin but whitespace when text-text
                #    # text-text is separated by \s (whitespace)
                #    nextOne = iterator.peek()
                #    if nextOne!=None:
                #        nextType = self.get('representation')[idx+1]
                #        if nextType == Message.typeText:
                #            #regexstr += "((20)|(08)|(0a)|(0d))?" # Add whitespace token separator
                #            regexstr += "(?:20)+" # Add whitespace token separator
                #
                #===============================================================
            idx += 1
        if not (Globals.getConfig().calculateMaxMessageLength
                or Globals.getConfig().danglingRegEx):
            regexstr += "$"
        return regexstr
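calc_regEx emits its pattern over a hex rendering of the payload (two lowercase hex digits per byte, "20" for a space), so matching has to run against a hexlified message. A minimal sketch with a stand-in pattern in place of a real cluster's output:

import binascii
import re

pattern = "^(?:[0-9a-f]{2}){4,8}(?:20)*$"            # stand-in for an emitted pattern
hexstr = binascii.hexlify(b"hello").decode("ascii")  # -> '68656c6c6f'
print(bool(re.match(pattern, hexstr)))               # True: 5 byte pairs, no trailing 20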