Code example #1
 def createXMLOutput(self):
     import os
     path = os.path.normpath(Globals.getConfig().dumpFile)
     file = os.path.basename(Globals.getConfig().inputFile)
     (filename,ext) = os.path.splitext(file)
     storePath = "{0}{1}{2}_output.xml".format(path,os.sep,filename)
     
     import sys
     import codecs
     old_stdout = sys.stdout
     handle = codecs.open(storePath,"w", "utf-8-sig")
     sys.stdout = handle
     
     print '<?xml version="1.0" encoding="utf-8" ?>'
     print "<protocolInformatics>"
     # Get the Discoverer XML result representation from the cluster collection object
     print self.getCCXMLRepresentation()
     # Get the statemachine XML result representation
     print self.env['sm'].getXMLRepresentation()  
     print "</protocolInformatics>"
     
     handle.close()         
     sys.stdout = old_stdout
     logging.info("Finished XML output. File size {0}".format(self.convert_bytes(os.path.getsize(storePath))))
Code example #2
 def do_load_state(self, args):
     import cPickle
     handle = open(Globals.getConfig().dumpFile + "/disc_state","rb")
     self.env = cPickle.load(handle)
     # Update config with settings from backup
     Globals.setConfig(self.env['config'])
     discoverer.Globals.setProtocolClassification(self.env['protocolType'])
     handle.close()
Code example #3
 def go(self, sequences):
     if self.env['sequences'] is None:
         print "FATAL: No sequences loaded!"
         return
     import discoverer.statistics
     discoverer.statistics.reset_statistics()
     logging.info("Performing discoverer algorithm")
     
     start = time.time()
     # Perform the initial clustering
     self.setup(sequences)
         
     elapsed = (time.time() - start)
     logging.info("Setup took {:.3f} seconds".format(elapsed))
     # Combines server and client flows
     self.env['messageFlows'] = self.combineflows(self.env['cluster_collection'])
     # Create a linked list
     self.linkmessages(self.env['messageFlows'])
     start = time.time()
     # Perform format inference
     self.do_format_inference("")
     elapsed = (time.time() - start)
     logging.info("Format inference took {:.3f} seconds".format(elapsed))
     start = time.time()
     # Performs the semantic inference
     self.do_semantic_inference("")
     elapsed = (time.time() - start)
     logging.info("Semantic inference took {:.3f} seconds".format(elapsed))
     start = time.time()
     # Performs the recursive clustering step
     self.do_recursive_clustering("")        
     elapsed = (time.time() - start)
     logging.info("Recursive clustering took {:.3f} seconds".format(elapsed))
     start = time.time()
     # Fixes tokenization errors 
     self.do_fix_tokenization_errors("")
     elapsed = (time.time() - start)
     logging.info("Fixing tokenization errors took {:.3f} seconds".format(elapsed))
     #self.print_clusterCollectionInfo()
     start = time.time()
     print "Merging..."
     # Merge while merging potential is present
     while self.env['cluster_collection'].mergeClustersWithSameFormat():
         pass
     elapsed = (time.time() - start)
     logging.info("Merging took {:.3f} seconds".format(elapsed))
     logging.info("Finished")
     
     # Perform one last format inference and semantic inference
     oldvalue = Globals.getConfig().considerOneMessageAsConstant
     Globals.getConfig().considerOneMessageAsConstant = True
     self.do_format_inference("")
     Globals.getConfig().considerOneMessageAsConstant = oldvalue
     self.do_semantic_inference("")
     
     if Globals.getConfig().debug:                
         self.env['cluster_collection'].print_clusterCollectionInfo()
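
Every stage in go() repeats the same start/elapsed/logging.info pattern. As a sketch only (the timed helper is not part of the original code), the timing could be factored into a context manager:

 import time
 import logging
 from contextlib import contextmanager

 @contextmanager
 def timed(stage):
     # Log how long the wrapped block took, mirroring the log lines in go()
     start = time.time()
     yield
     logging.info("{0} took {1:.3f} seconds".format(stage, time.time() - start))

 # Hypothetical usage inside go():
 # with timed("Format inference"):
 #     self.do_format_inference("")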
Code example #4
 def dump_sm_dot(self, filename=""):
     if filename=="":
         path = os.path.normpath(Globals.getConfig().dumpFile)
         file = os.path.basename(Globals.getConfig().inputFile)
         (filename,ext) = os.path.splitext(file)
         storePath = "{0}{1}{2}.dot".format(path,os.sep,filename) 
     else:
         storePath = filename               
     self.env['sm'].dump_dot(storePath)
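
The four-step path construction above (normpath, basename, splitext, then format) recurs in nearly every dump method in this section. A sketch of a shared helper (build_store_path is a hypothetical name, not in the original code):

 import os

 def build_store_path(dump_dir, input_file, suffix):
     # e.g. build_store_path("/tmp/out", "/data/http.pcap", ".dot")
     #      -> "/tmp/out/http.dot"
     path = os.path.normpath(dump_dir)
     base = os.path.basename(input_file)
     (filename, ext) = os.path.splitext(base)
     return "{0}{1}{2}{3}".format(path, os.sep, filename, suffix)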
Code example #5
 def __init__(self, env, config):
     cmd.Cmd.__init__(self)
     self.env = env
     # Just for backing it up into the state
     self.env['config'] = config
     self.config = config
     Globals.setConfig(config)
     self.__profile = collections.OrderedDict()
     self.__nextstate = 1
     logging.info("Discoverer CLI initialized")
Code example #6
 def do_testsuite(self, args):
     basename = Globals.getConfig().testbasename
     if args == "":
         highloop = 4
     else:
         highloop = int(args)
     logging.info("Using {0} as highloop".format(highloop))
     for suffix in range(0,highloop):
         logging.info("Testing the {0}er batch".format(suffix))
         Globals.getConfig().testFile = basename+"_{0}".format(suffix)
         logging.info("Set config.testFile to {0}".format(Globals.getConfig().testFile))
         self.do_load_testdata("")
         # Perform the actual test
         self.do_statemachine_accepts("")
Code example #7
 def do_dump_state(self, args):
     import cPickle
     handle = open(Globals.getConfig().dumpFile + "/disc_state","wb")
     # Pickling the deeply linked message structures exceeds the default recursion limit
     sys.setrecursionlimit(50000)
     self.env['protocolType']=discoverer.Globals.getProtocolClassification()
     cPickle.dump(self.env, handle, 2)
     handle.close()
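
do_dump_state and do_load_state (code example #2) form a pickle round trip: the whole env dictionary is serialized with protocol 2 and read back later. A minimal standalone sketch of that round trip (the path and state contents are illustrative):

 import cPickle
 import sys

 state = {'config': None, 'protocolType': 'text'}  # illustrative env
 sys.setrecursionlimit(50000)  # deep object graphs overflow the default limit

 # Dump with pickle protocol 2 (binary), as in do_dump_state
 handle = open("/tmp/disc_state", "wb")
 cPickle.dump(state, handle, 2)
 handle.close()

 # Load it back, as in do_load_state
 handle = open("/tmp/disc_state", "rb")
 restored = cPickle.load(handle)
 handle.close()
 assert restored == state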
Code example #8
 def createPeachOutput(self):
     import os
     path = os.path.normpath(Globals.getConfig().dumpFile)
     file = os.path.basename(Globals.getConfig().inputFile)
     (filename,ext) = os.path.splitext(file)
     storePath = "{0}{1}{2}_peach.xml".format(path,os.sep,filename)
     import sys
     import codecs
     old_stdout = sys.stdout
     handle = codecs.open(storePath,"w", "utf-8-sig")
     sys.stdout = handle
     print self.env['sm'].dumpPeachXML()
     handle.close()         
     sys.stdout = old_stdout
     logging.info("Finished Peach output. File size {0}".format(self.convert_bytes(os.path.getsize(storePath))))
Code example #9
 def setup(self, sequences): #, direction):        
     logging.info("Performing initial message analysis and clustering")
     if sequences is None:
         logging.error("FATAL: No sequences loaded yet!")
         return False    
     
     # Perform initial token analysis
     setup = discoverer.setup.Setup(sequences, Globals.getConfig())
     
     self.env['cluster_collection'] = setup.get_cluster_collection()
     logging.info("Built {0} clusters".format(setup.get_cluster_collection().num_of_clusters()))
Code example #10
 def linkmessages(self, messageFlows):
     maxFlowLength = 0
     minFlowLength = sys.maxint
     
     logging.info("Linking messages within flow")
     for flow in messageFlows:
         messages = messageFlows[flow]
         flowLength = len(messages)
         if flowLength>maxFlowLength:
             maxFlowLength = flowLength
         if flowLength<minFlowLength:
             minFlowLength = flowLength
         
         if len(messages)==1:
             if Globals.getConfig().debug:
                 print "Flow {0} has only 1 message. Skipping flow".format(flow)
             continue
         from discoverer.peekable import peekable
         iterator = peekable(messages.items())
         lastMsg = None
         (msg_id, message) = iterator.next()
         
         message = message[0]
         while not iterator.isLast():
             if lastMsg is not None:
                 lastMsg.setNextInFlow(message)
                 message.setPrevInFlow(lastMsg)
             lastMsg = message
             (msg_id, message) = iterator.next()
             message = message[0]
         if lastMsg != message:
             lastMsg.setNextInFlow(message)
             message.setPrevInFlow(lastMsg)
         
         if Globals.getConfig().debug:
             self.printflow(flow)
     logging.info("Linked flows. Min flow length: {0}, max flow length: {1}".format(minFlowLength, maxFlowLength))
Code example #11
 def do_dumpresult(self, string):
     if not self.env.has_key('cluster_collection'): return
     
     if Globals.getConfig().loadClientAndServerParts:
         # Dump the combined client/server cluster collection to one file
         path = os.path.normpath(Globals.getConfig().dumpFile)
         file = os.path.basename(Globals.getConfig().inputFile)
         (filename,ext) = os.path.splitext(file)
         storePath = "{0}{1}{2}_formats_dump.txt".format(path,os.sep,filename)
         self.dump2File(self.env['cluster_collection'],storePath)
     else:
         # Dump only one file (client traffic)
         path = os.path.normpath(Globals.getConfig().dumpFile)
         file = os.path.basename(Globals.getConfig().inputFile)
         (filename,ext) = os.path.splitext(file)
         storePath = "{0}{1}{2}_dump.txt".format(path,os.sep,filename)
         self.dump2File(self.env['cluster_collection'],storePath)
Code example #12
 def do_split_loaded(self, args):
     if args == "":
         chunksize = 2000
     else:
         chunksize = int(args)
     if not self.env.has_key('testflows'):
         print "Error: No testflows loaded"
         return
     testflows = self.env['testflows']
     
     nr = 0
     outfilename = Globals.getConfig().testFile
     fdoutclient = open("{0}_{1}_{2}_client".format(outfilename,chunksize, nr), "w")
     fdoutserver = open("{0}_{1}_{2}_server".format(outfilename,chunksize, nr), "w")
     
     blockseparator = "******************************************"
     print "Opened output file {0}_{1}_{2}".format(outfilename, chunksize, nr)
     flowcnt = 0
     for flow in testflows:
         (has_no_gaps, is_alternating) = discoverer.common.flow_is_valid(testflows, flow)
         if not (has_no_gaps and is_alternating) or len(testflows[flow])==1:                                                          
             continue    
         
         messages = testflows[flow]
         # Per-direction and total message counters within this flow
         c_out = 1
         s_out = 1
         totalcnt = 1
         for m_key in sorted(messages.keys()):
             msg = messages[m_key]
             if msg[1]=="server2client":
                 fdoutserver.write("{0} {1} {2} {3} {4} {5}\n".format(blockseparator, flow, s_out, totalcnt, msg[0].get_length()*2, msg[0].get_payload_as_string()))
                 s_out += 1
             else:
                 fdoutclient.write("{0} {1} {2} {3} {4} {5}\n".format(blockseparator, flow, c_out, totalcnt, msg[0].get_length()*2, msg[0].get_payload_as_string()))
                 c_out += 1
             totalcnt += 1
         flowcnt += 1
         if flowcnt>=chunksize:
             fdoutclient.close()
             fdoutserver.close()
             nr += 1
             fdoutclient = open("{0}_{1}_{2}_client".format(outfilename,chunksize, nr), "w")
             fdoutserver = open("{0}_{1}_{2}_server".format(outfilename,chunksize, nr), "w")
             print "Opened output file {0}_{1}_{2}".format(outfilename, chunksize, nr)
     
             flowcnt = 0
     fdoutclient.close()
     fdoutserver.close()
Code example #13
 def do_dumpflow(self,file):
     if not Globals.getConfig().loadClientAndServerParts:
         print "Flow dumping is only available when analyzing client and server flows"
         return
     if file!="":
         import os.path
         path = os.path.normpath(Globals.getConfig().dumpFile)
         file = os.path.basename(Globals.getConfig().inputFile)         
         (filename,ext) = os.path.splitext(file)
         storePath = "{0}{1}{2}_flow_dump.txt".format(path,os.sep,filename)
         import sys
         old_stdout = sys.stdout
         handle = open(storePath,"w")
         sys.stdout = handle
     print "Dump of 'Discoverer' flows"
     for f in self.env['messageFlows']:
         print "Flow: %s" % f
         for entry in self.env['messageFlows'][f]:
             print "\t{0}:\t{1} - {2}".format(entry,self.env['messageFlows'][f][entry][0].get_message(), self.env['messageFlows'][f][entry][0].getCluster().getFormatHash())
     if file!="":
         handle.close()         
         sys.stdout = old_stdout           
         print "Finished. File size {0}".format(self.convert_bytes(os.path.getsize(storePath)))
Code example #14
 def do_load_testdata(self, args=""):
     if len(args)!=0:
         tok = args.split()
         fileName = tok[0]
     else:
         fileName = Globals.getConfig().testFile

     import common
     import cmdinterface

     client2server_file = "{0}_client".format(fileName)
     server2client_file = "{0}_server".format(fileName)
     logging.debug("Using: {0} & {1} as testdata".format(client2server_file, server2client_file))
     logging.info("Memory usage before loading testdata: {0}".format(self.getMemoryUsage()))
     self.profile("BeforeLoadingTestdata")
     logging.info("Loading {0} entries from test data from {1}".format(Globals.getConfig().numOfTestEntries,client2server_file))
     # Load the client flows
     sequences_client2server = common.input.Bro(client2server_file, Globals.getConfig().numOfTestEntries).getConnections()
     logging.info("Loading {0} entries from test data from {1}".format(Globals.getConfig().numOfTestEntries, server2client_file))
     # load the server flows
     sequences_server2client = common.input.Bro(server2client_file, Globals.getConfig().numOfTestEntries).getConnections()
     sequences = [(sequences_client2server, Message.directionClient2Server),(sequences_server2client, Message.directionServer2Client)] # Keep it compatible with existing code TODO        
     
     logging.info("Loaded {0} test sequences from input files".format(len(sequences[0][0])+len(sequences[1][0])))
     logging.info("Memory usage after loading testdata: {0}".format(self.getMemoryUsage()))
     self.profile("AfterLoadingTestdata")    
     # Create quick setup
     tmpMaxPrefix = Globals.getConfig().maxMessagePrefix
     Globals.getConfig().maxMessagePrefix = 2048    
     setup = discoverer.setup.Setup(sequences, performFullAnalysis=False)
     Globals.getConfig().maxMessagePrefix = tmpMaxPrefix
     logging.info("Memory usage after preparing testsequences: {0}".format(self.getMemoryUsage()))
     self.profile("AfterPreparingTestdata")    
     testcluster = setup.get_cluster_collection()
     testflows = self.combineflows(testcluster)
     logging.info("Memory usage after combining testsequences: {0}".format(self.getMemoryUsage()))
     self.profile("AfterCombiningTestdata")    
     self.linkmessages(testflows)
     logging.info("Memory usage after linking testsequences: {0}".format(self.getMemoryUsage()))
     self.profile("AfterLinkingTestdata")
     self.env['testflows']=testflows
     # Hand test flows over to statemachine
     if self.env.has_key('sm'):
         self.env['sm'].setTestFlows(testflows)
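
The temporary maxMessagePrefix override above is restored only on the success path; if Setup raises, the old value is lost. A sketch of the same override made exception-safe with try/finally (all names as used in the code above):

 tmpMaxPrefix = Globals.getConfig().maxMessagePrefix
 Globals.getConfig().maxMessagePrefix = 2048
 try:
     setup = discoverer.setup.Setup(sequences, performFullAnalysis=False)
 finally:
     Globals.getConfig().maxMessagePrefix = tmpMaxPrefix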
Code example #15
    def do_statemachine_accepts(self, args=""):
        # Tries to load the input and returns whether the statemachine accepts this input
        
        # Thoughts:
        # How do I map a single line of input to a transition?
        # A transition is the hash of a rich message format of a single message
        # 
        # Basic Task: Match a single message to the best matching format
        #
        # Idea: Tokenize our single message and create a Message object out of it
        # The transitions have linked information about the various messages that
        # are part of the cluster whose hash is the hash of the transition
        #
        # Idea: Compare the format of our single message (only text/binary is sensible
        # at this point) to the formats of the various clusters.
        # First check whether we have perfect matches with respect to text/binary
        # (this might not be the case if we have rich cluster formats from merged
        # clusters or similar).
        # If yes, compare the matching clusters' const values with our message and
        # see whether our values match the const values exactly.
        # If yes and only one cluster matches, that's our transition.
        # If yes and we have multiple matches, examine further.
        # If there is no match on the const values, there are again two possibilities:
        # - also consider variable cluster formats (in case our message indeed had a variable there)
        # - also look for other cluster format combinations (e.g. merged tokens might change the
        #   length of the format, which would have excluded this cluster in the first pass)

        # Furthermore there are more test possibilities,
        # e.g. load only client messages and see whether our app is able to answer with a server message,
        # or load a fully new set of client and server flows and replay them flow by flow
       
        # Do it with flows
        # load the test data if needed        
        if not self.env.has_key('testflows') or len(self.env['testflows']) == 0:
            self.do_load_testdata(args)
        
        if not self.env.has_key('testflows'):
            print "ERROR: Loading test data failed!"
            return
        if not self.env.has_key('sm'):
            print "ERROR: Statemachine not yet built"
            return
        testflows = self.env['testflows']
        # Prepare test statistic counters
        failedelements = []
        success = 0
        failures = 0
        not_in_testflows = 0
        only_one_msg = 0
        has_gaps = 0
        not_alternating = 0
        not_all_transitioned = 0
        not_ended_in_final = 0
        gotMultipleChoice = 0
        test2go = len(testflows.keys())
        totalflows = test2go
        self.env['sm'].setTestFlows(testflows)
        # Make room and clean up the loaded sequences ;-)
        self.env['sequences']=None
        print "Memory usage before test: {0}".format(self.getMemoryUsage())
        self.profile("BeforeStartingTest")
        for elem in testflows.keys():  # keys() returns a copy, so deleting below is safe
            print "{0} flows left to test ({1} failed so far, failrate {2} %)".format(test2go, failures, (1.0*failures/totalflows)*100)
            # Test the current flow
            res = self.statemachine_accepts_flow(elem, printSteps=False)
            test2go -= 1
            
            del testflows[elem] # Delete tested flow to make room
            if res['testSuccessful']:
                success += 1
            else:
                failures += 1
                # Parse failure reason
                if not res['isInTestFlows']: not_in_testflows += 1
                elif not res['hasMoreThanOneMessage']: only_one_msg += 1
                elif not res['has_no_gaps']: has_gaps += 1
                elif not res['is_alternating']: not_alternating += 1
                elif not res['did_all_transitions']:
                    not_all_transitioned += 1
                    if res['gotMultipleChoice']:
                        gotMultipleChoice += 1
                    failedelements.append(elem)
                elif not res['finished_in_final']: 
                    not_ended_in_final += 1
                    if res['gotMultipleChoice']:
                        gotMultipleChoice += 1
                    failedelements.append(elem)
                    
        print "Finished"
        print "Memory usage after statemachine test: {0}".format(self.getMemoryUsage())
        self.profile("AfterEndTests")
        logging.info("Testresults")
        logging.info("===========")
        logging.info("Number of flows: {0}, Success: {1}, Failures: {2}".format(success+failures, success, failures))
        self.printProfile()
        if failures>0:
            print "Test flowID not in test flows: {0}".format(not_in_testflows)
            print "Flow had only one message: {0}".format(only_one_msg)
            print "Flow had gaps: {0}".format(has_gaps)
            print "Flow was not alternating: {0}".format(not_alternating)
            print "Flow rejected prematurely: {0}".format(not_all_transitioned)
            print "Flow did not end in final state: {0}".format(not_ended_in_final)
            print "Encountered into multiple choice when failed: {0}".format(gotMultipleChoice)
            print
            
            if len(failedelements)>0:
                print "Failed test flows (only tested flows):"
                for elem in failedelements:
                    print "{0}".format(elem)
        # Dump results to file
        import os            
        
        path = os.path.normpath(Globals.getConfig().dumpFile)
        file = os.path.basename(Globals.getConfig().testFile)
        (filename,ext) = os.path.splitext(file)
        storePath = "{0}{1}{2}_testresults.txt".format(path,os.sep,filename)            
        import sys
        old_stdout = sys.stdout
        handle = open(storePath,"w")
        sys.stdout = handle
        print "Testresults"
        print "==========="
        print "Number of flows: {0}, Success: {1}, Failures: {2}".format(success+failures, success, failures)
        self.printProfile()
            
        if failures>0:
            print "Test flowID not in test flows: {0}".format(not_in_testflows)
            print "Flow had only one message: {0}".format(only_one_msg)
            print "Flow had gaps: {0}".format(has_gaps)
            print "Flow was not alternating: {0}".format(not_alternating)
            print "Flow rejected prematurely: {0}".format(not_all_transitioned)
            print "Flow did not end in final state: {0}".format(not_ended_in_final)
            print "Encountered into multiple choice when failed: {0}".format(gotMultipleChoice)
            print
            if len(failedelements)>0:
                print "Failed test flows (only tested flows):"
                for elem in failedelements:
                    print "{0}".format(elem)
                print "Rerunning failed tests and logging output"
                self.do_load_testdata(args)
                for elem in failedelements:
                    print 100*"+"
                    print "Failed flow: {0}".format(elem)
                    # Run test again, this time logging every transition
                    self.statemachine_accepts_flow(elem, printSteps=True)
                    print 100*"+"
        handle.close()         
        sys.stdout = old_stdout
        logging.info("Finished. Test results written to file {0}, file size {1}".format(storePath,self.convert_bytes(os.path.getsize(storePath))))               
Code example #16
    def do_go(self, string):
        if self.env.has_key('cluster_collection'):
            del self.env['cluster_collection']
             
        if Globals.getConfig().loadClientAndServerParts:
            # Check if we want to constrain our maximum length based on configured confidence intervals
            if Globals.getConfig().calculateMaxMessageLength:
                maxPrefix = discoverer.setup.calcMaxMessageLengthConfidenceInterval(self.env['sequences'], 1-Globals.getConfig().maxMessageLengthConfidenceInterval)
                Globals.getConfig().maxMessagePrefix = maxPrefix
                logging.info("Calculated maximum message prefix based on confidence interval of {0}: {1}".format(Globals.getConfig().maxMessageLengthConfidenceInterval, maxPrefix))
            
            logging.info("Using maximum message prefix for training data: {0}".format(Globals.getConfig().maxMessagePrefix))

            # perform Discoverer analysis
            self.go(self.env['sequences'])
            # writes out the analysis results
            self.do_dumpresult("")
            # Build statemachine
            logging.info("Forcing regex rebuild")
            if self.env.has_key('cluster_collection'):
                self.env['cluster_collection'].updateClusterRegEx()
                logging.info("Performing sanity check over regexes")
                self.env['cluster_collection'].performSanityCheckForRegEx()
                logging.info("Flushing all messages in all clusters")
            # Construct statemachine
            sm = discoverer.statemachine.Statemachine(self.env['messageFlows'])
            self.env['sm'] = sm
            # Log time
            start = time.time()
            logging.info("Building statemachine")
            print "Memory usage w/o statemachine: {0}".format(self.getMemoryUsage())
            self.profile("BeforeBuildStatemachine")
            # perform the build
            sm.build()
            duration = time.time()-start
            print "Statemachine building took {:.3f} seconds".format(duration)
            print "Memory usage with statemachine: {0}".format(self.getMemoryUsage())
            self.profile("AfterBuildStatemachine")
            
            # Save the statemachine's dot file
            path = os.path.normpath(Globals.getConfig().dumpFile)
            file = os.path.basename(Globals.getConfig().inputFile)
            (filename,ext) = os.path.splitext(file)
            storePath = "{0}{1}{2}.dot".format(path,os.sep,filename) 
            logging.info("Dumping state machine")
            sm.dump_dot(storePath)
            sm.dumpTransitions()
            storePath = "{0}{1}{2}_statemachine.xml".format(path,os.sep,filename) 
            # Save the calculation state for later use
            self.do_dump_state("")
            if Globals.getConfig().autoCreateXML:
                # Dump the XML file
                print "Memory usage before creating XML: {0}".format(self.getMemoryUsage())
                self.profile("BeforeBuildXML")            
                self.createXMLOutput()
                self.createPeachOutput()
                print "Memory usage after creating XML: {0}".format(self.getMemoryUsage())
                self.profile("AfterBuildXML")
            
            # Perform the acceptance test
            self.do_statemachine_accepts("")
            
        else:
            # Perform Discoverer only for the client part
            # (go() takes only the sequences, see code example #3)
            self.go(self.env['sequences'])