def test_alignmentOfAlmostEquivalentMessages(self):
        """Check the alignment scores of two nearly identical messages.

        For each of the 1000 rounds, two messages are built around the same
        random prefix and suffix, differing only by the word in the middle
        ("hercule" versus "thomas"). The pair is aligned twice with
        ``alignTwoMessages``, first with ``False`` and then with ``True`` as
        first argument. The first score of the ``True`` run must be strictly
        greater than the first score of the ``False`` run, and above 95.
        """
        engine = NeedlemanAndWunsch(8)
        rounds = 1000
        for _round in range(rounds):
            prefix = self.generateRandomString(30, 40)
            suffix = self.generateRandomString(30, 40)

            # Same surroundings, different word in the middle
            firstPayload = TypeConvertor.stringToNetzobRaw(
                prefix + "hercule" + suffix)
            secondPayload = TypeConvertor.stringToNetzobRaw(
                prefix + "thomas" + suffix)

            firstMessage = RawMessage(
                str(uuid.uuid4()), str(time.time()), firstPayload)
            secondMessage = RawMessage(
                str(uuid.uuid4()), str(time.time()), secondPayload)

            # Only the first score of each run is checked; the unpacking still
            # requires every result to hold exactly three scores.
            plainScores, _plainAlignment = engine.alignTwoMessages(
                False, firstMessage, secondMessage)
            plainScore, _plainScore2, _plainScore3 = plainScores

            altScores, _altAlignment = engine.alignTwoMessages(
                True, firstMessage, secondMessage)
            altScore, _altScore2, _altScore3 = altScores

            self.assertGreater(altScore, plainScore)
            self.assertGreater(altScore, 95)
Exemple #2
0
    def test_semanticAlignment_bug1(self):
        """test_semanticAlignment_bug1:
        Regression test for a semantic alignment bug that prevented the
        computation of a valid regex. Two tagged messages are aligned and the
        test verifies that more than one field is produced.
        @date 18/04/2013
        """
        firstname1 = "antoine"
        email1 = "*****@*****.**"

        firstname2 = "luc"
        email2 = "*****@*****.**"

        # Layout of each message: <1 char><firstname><8 chars><email>
        payload1 = TypeConvertor.stringToNetzobRaw("6" + firstname1 + "GAHFSHQS" + email1)
        payload2 = TypeConvertor.stringToNetzobRaw("3" + firstname2 + "CVSDHISD" + email2)
        msg1 = RawMessage(uuid.uuid4(), None, payload1)
        msg2 = RawMessage(uuid.uuid4(), None, payload2)

        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        symbol = Symbol(uuid.uuid4(), "Test", project)

        symbol.addMessages([msg1, msg2])

        # Tag positions count two units per character (presumably the hex
        # digits of the raw encoding - confirm against stringToNetzobRaw):
        # 2 for the leading char, 16 for the 8-char separator.
        for taggedMessage, firstname, email in ((msg1, firstname1, email1),
                                                (msg2, firstname2, email2)):
            firstnameStart = 2
            firstnameEnd = firstnameStart + len(firstname) * 2
            emailStart = firstnameEnd + 16
            emailEnd = emailStart + len(email) * 2
            taggedMessage.addSemanticTag("firstname", firstnameStart, firstnameEnd)
            taggedMessage.addSemanticTag("email", emailStart, emailEnd)

        nwEngine.alignField(symbol.getField())
        symbol.getField().setFormat(Format.STRING)

        print("Computed Regex : {0}".format(symbol.getRegex()))
        print(symbol.getCells(True))

        nbComputedFields = len(symbol.getExtendedFields())
        self.assertTrue(nbComputedFields > 1, "Only one field has been computed which tells us something went wrong.")
    def test_alignmentOfEquivalentMessages(self):
        """Check that two identical messages always get perfect scores.

        For each of the 1000 rounds, two messages carrying the same random
        string are aligned with ``alignTwoMessages``, first with ``False``
        and then with ``True`` as first argument. Each of the three returned
        scores must equal 100.0 in both runs.
        """
        engine = NeedlemanAndWunsch(8)
        rounds = 1000
        for _round in range(rounds):
            pattern = self.generateRandomString(30, 40)

            # Both messages carry exactly the same content
            firstPayload = TypeConvertor.stringToNetzobRaw(pattern)
            secondPayload = TypeConvertor.stringToNetzobRaw(pattern)

            firstMessage = RawMessage(
                str(uuid.uuid4()), str(time.time()), firstPayload)
            secondMessage = RawMessage(
                str(uuid.uuid4()), str(time.time()), secondPayload)

            for flag in (False, True):
                (scores, _alignment) = engine.alignTwoMessages(
                    flag, firstMessage, secondMessage)
                # The unpacking also enforces that exactly three scores come back
                (first, second, third) = scores
                for score in (first, second, third):
                    self.assertEqual(score, 100.0)
    def sequence_execute_clicked_cb(self, widget):
        """Callback executed when the user requests to start
        the alignment process.

        Locks the configuration widgets, stores the chosen parameters in the
        configuration of the current project, builds the NeedlemanAndWunsch
        engine and starts the alignment job.

        @param widget: not used by this method (presumably the widget that
                       emitted the signal)
        """
        view = self._view

        # Freeze every input while the alignment is running
        for control in (view.sequence_cancel,
                        view.sequence_execute,
                        view.sequence_scale,
                        view.sequence_spinbutton,
                        view.radiobutton4bit,
                        view.radiobutton8bit,
                        view.orphanButton,
                        view.smoothButton):
            control.set_sensitive(False)

        # Retrieve the alignment parameters
        similarityPercent = view.sequence_adjustment.get_value()
        # get_active() tells whether the 8-bit radio button is selected;
        # get_mode() only reports how the button is drawn (indicator or not)
        # and therefore never reflected the user's choice.
        unitSize = 8 if view.radiobutton8bit.get_active() else 4
        orphan = view.orphanButton.get_active()
        smooth = view.smoothButton.get_active()

        # Persist the parameters in the configuration of the current project
        project = self.vocabularyController.getCurrentProject()
        configuration = project.getConfiguration()
        configuration.setVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD, int(similarityPercent))
        configuration.setVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_ORPHAN_REDUCTION, orphan)
        configuration.setVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK, smooth)

        # Configure Needleman and Wunsch
        self.alignmentSolution = NeedlemanAndWunsch(unitSize, project, self.doUpgma, self.percentOfAlignmentProgessBar)

        # Define the alignment JOB; the stop button becomes usable first
        view.sequence_stop.set_sensitive(True)
        Job(self.startSequenceAlignment(unitSize))
Exemple #5
0
    def executeClustering(self):
        """Cluster the symbols with UPGMA, then align the field of each one.

        The finish flag (``self.isFinish()``) is checked before the
        clustering, before the alignment stage and before each symbol.
        @return self.symbols once every symbol has been aligned, or None as
                soon as the finish flag is raised"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))

        if self.isFinish():
            return None

        # Stage 0 -> 1: UPGMA clustering of the symbols
        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)

        # Stage 2: alignment of each resulting symbol
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")

        if self.isFinish():
            return None

        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        engine = self.currentAlignment
        engine.absoluteStage = 2
        engine.statusRatio = len(self.symbols)
        engine.statusRatioOffset = 0

        for clusteredSymbol in self.symbols:
            if self.isFinish():
                return None

            self.currentAlignment.alignField(clusteredSymbol.getField())
            # One more symbol processed out of statusRatio
            self.currentAlignment.statusRatioOffset += 1

        return self.symbols
    def test_deserialisationMessages(self):
        """Check that ``deserializeMessages`` reports every message it was given.

        Ten rounds are executed; each one builds between 2 and 500 messages
        from random strings and expects the returned count to equal the
        number of messages provided.
        """
        rounds = 10
        engine = NeedlemanAndWunsch(8)

        for _round in range(rounds):
            # Random amount of messages for this round
            expectedCount = random.randint(2, 500)
            batch = []
            while len(batch) < expectedCount:
                payload = TypeConvertor.stringToNetzobRaw(
                    self.generateRandomString(5, 500))
                batch.append(
                    RawMessage(str(uuid.uuid4()), str(time.time()), payload))

            reportedCount = engine.deserializeMessages(batch)
            self.assertEqual(expectedCount, reportedCount)
    def test_AlignementOfMessages(self):
        """Check the alignment scores of a batch of similar messages.

        One hundred rounds are executed. Each one builds between 2 and 50
        messages of the form "bonjour<random string>, tout va bien ?" and
        aligns the batch twice with ``align``, first with ``False`` and then
        with ``True`` as first argument. The first score of the ``True`` run
        must be at least the first score of the ``False`` run, and at least 90.
        """
        engine = NeedlemanAndWunsch(4)
        rounds = 100

        for _round in range(rounds):
            # Random amount of messages for this round
            batchSize = random.randint(2, 50)
            batch = []
            for _index in range(batchSize):
                payload = TypeConvertor.stringToNetzobRaw(
                    "bonjour" + self.generateRandomString(5, 30) +
                    ", tout va bien ?")
                batch.append(
                    RawMessage(str(uuid.uuid4()), str(time.time()), payload))

            # Only the first score of each run is checked; the unpacking still
            # requires every result to hold exactly three scores.
            plainAlignment, plainScores = engine.align(False, batch)
            plainScore, _plainScore2, _plainScore3 = plainScores
            altAlignment, altScores = engine.align(True, batch)
            altScore, _altScore2, _altScore3 = altScores
            print(plainAlignment)
            print(altAlignment)

            self.assertGreaterEqual(altScore, plainScore)
            self.assertGreaterEqual(altScore, 90)
Exemple #8
0
    def clusterByTokenization(self, symbols):
        """Merge the symbols whose messages share a token pattern, then
        align the resulting symbols.

        The method works on ``self.symbols`` in two phases:
          1. symbols are merged (``mergeEffectiveRowCol``) when one of their
             messages has the same token pattern as the reference message and
             a similarity ratio of at least 0.40, provided the server
             constraint on their destinations holds;
          2. a regex is built for each symbol from its computed alignment,
             falling back to a single catch-all field on failure, before all
             symbols are aligned and ``self.symbols`` is replaced by the
             result of that alignment.

        @param symbols: not used by this method, which reads and updates
                        ``self.symbols`` instead
        """
        self.ServerInference()

        # ---- Cluster messages according to their tokens
        ll = len(self.symbols) - 1
        i_equ = 0
        while ll > 0:
            # Token pattern of the first message of the reference symbol
            currentPattern = self.symbols[i_equ].getMessages()[0].getPattern()[1]
            for j in range(ll):
                # Candidates are visited from the end of the symbol list
                jnext = len(self.symbols) - j - 1
                cond = False
                for message in self.symbols[jnext].getMessages():
                    if currentPattern == message.getPattern()[1]:
                        score2 = self.computeSimilarities(
                            currentPattern,
                            message.getPattern()[1])
                        minilength = min(
                            len(message.getData()),
                            len(self.symbols[i_equ].getMessages()[0].getData()))
                        # One similar enough message is sufficient
                        if score2 * 2.0 / minilength >= 0.40:
                            cond = True
                            break

                if cond:
                    currentDst = self.symbols[i_equ].getPattern()[0]
                    otherDst = self.symbols[jnext].getPattern()[0]
                    # Merge when no server is known, when both destinations
                    # are equal, or when neither destination is the server
                    if not self.server or (currentDst == otherDst) or (
                            currentDst != self.server
                            and otherDst != self.server):
                        self.mergeEffectiveRowCol(i_equ, jnext)
                        # Cancels the increment below: the same reference
                        # symbol is used again on the next pass
                        i_equ -= 1
                        break
            ll -= 1
            i_equ += 1

        # ---- Align messages
        alignment = NeedlemanAndWunsch(self.unitSize, self.cb_status)
        for symbol in self.symbols:
            try:
                al = self.computeAlignment(symbol)
                symbol.getField().setAlignment(al)
                alignment.buildRegexFromAlignment(symbol, al,
                                                  self.defaultFormat)
            except Exception:
                # Best-effort fallback kept from the original code, but
                # KeyboardInterrupt and SystemExit are no longer swallowed.
                logging.warning(
                    "Partitionnement error: too much fields ( > 100) for the symbol '"
                    + symbol.getName() + "' len=" +
                    str(len(symbol.getExtendedFields())) + "len " +
                    str(len(symbol.getPattern()[1])))
                # Replace the fields of the symbol by a single catch-all field
                symbol.getField().removeLocalFields()
                field = Field("Field 0", "(.{, })", symbol)
                symbol.addLocalField(field)
                # Use the default protocol type for representation
                field.setFormat(self.defaultFormat)

        alignment.alignSymbols(self.symbols, self.project)
        self.symbols = alignment.getLastResult()
Exemple #9
0
    def test_semanticAlignment_simple(self):
        """test_semanticAlignment_simple:
        Verify that 500 messages carrying semantic tags are aligned into
        exactly four fields and that the computed regex is valid for every
        message.
        Format : <random 10 bytes><random username><random 5 ASCII><random email>

        Needleman & Wunsch parameters recorded as optimal for this test:
        // Cost definitions for the alignment
        static const short int MATCH = 5;
        static const short int SEMANTIC_MATCH = 30;
        static const short int MISMATCH = -5;
        static const short int GAP = 0;
        static const short int BLEN = 10;
        // Consts for the definition of a mask
        static const unsigned char END = 2;
        static const unsigned char DIFFERENT = 1;
        static const unsigned char EQUAL = 0;
        """
        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        symbol = Symbol(uuid.uuid4(), "Test", project)

        nbMessage = 500
        usernames = []
        emails = []
        for _index in range(nbMessage):
            # Random username
            plainUsername = self.generateRandomString(4, 10)
            username = TypeConvertor.stringToNetzobRaw(plainUsername)
            usernames.append(plainUsername)

            # Random email: <prefix>@<domain>.<extension>
            prefixPart = self.generateRandomString(4, 10)
            domainPart = self.generateRandomString(4, 10)
            extensionPart = self.generateRandomString(2, 3)
            plainEmail = "{0}@{1}.{2}".format(prefixPart, domainPart, extensionPart)
            emails.append(plainEmail)
            email = TypeConvertor.stringToNetzobRaw(plainEmail)

            # Random parts surrounding the tagged values
            random10Bytes = self.generateRandomBytes(10, 10)
            random5ASCII = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 5))
            data = "{0}{1}{2}{3}".format(random10Bytes, username, random5ASCII, email)

            # Tag boundaries follow the lengths of the concatenated parts
            usernameStart = len(random10Bytes)
            usernameEnd = usernameStart + len(username)
            emailStart = usernameEnd + len(random5ASCII)
            emailEnd = emailStart + len(email)

            message = RawMessage(uuid.uuid4(), None, data)
            message.addSemanticTag("username", usernameStart, usernameEnd)
            message.addSemanticTag("email", emailStart, emailEnd)

            symbol.addMessage(message)

        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        nwEngine.alignField(symbol.getField())

        symbol.getField().setFormat(Format.STRING)

        print("Number of computed fields : {0}".format(len(symbol.getExtendedFields())))
        self.assertEqual(4, len(symbol.getExtendedFields()))

        # The regex must be valid for every single message
        nbValidMessages = 0
        for alignedMessage in symbol.getMessages():
            regexMatches = symbol.getField().isRegexValidForMessage(alignedMessage)
            nbValidMessages += 1 if regexMatches else 0
            self.assertTrue(regexMatches)

        print(symbol.getCells())

        print("Computed regex is valid for {0}/{1} messages.".format(nbValidMessages, len(symbol.getMessages())))
Exemple #10
0
    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size.

        An orphan is a symbol holding exactly one message. On each pass the
        orphans are clustered again (``executeClustering``) after their
        message has been reduced from one side, by a factor growing in steps
        of 10. Passes alternate between the right and the left side, starting
        with the right one, and stop when at most one orphan remains or when
        a factor reaches 80. The field of every symbol is finally aligned.

        @return self.symbols after the last alignment
        """
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10

        while leftReductionFactor < 80 and rightReductionFactor < 80:
            # Split the symbols: orphans are clustered again, the symbols
            # holding several messages are set aside until the pass is over
            orphans = [symbol for symbol in self.symbols
                       if len(symbol.getMessages()) == 1]
            tmp_symbols = [symbol for symbol in self.symbols
                           if len(symbol.getMessages()) > 1]

            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break

            self.symbols = orphans
            # Exactly one side is reduced per pass. The original code used two
            # consecutive `if` statements, which ran the right reduction right
            # after every left one, on symbols that had just been merged.
            if currentReductionIsLeft:
                leftReductionFactor += increment
                # Reduce the size of the messages from the left
                for orphan in self.symbols:
                    orphanMessage = orphan.getMessages()[0]
                    orphanMessage.setLeftReductionFactor(leftReductionFactor)
                    orphanMessage.setRightReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
            else:
                rightReductionFactor += increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphanMessage = orphan.getMessages()[0]
                    orphanMessage.setRightReductionFactor(rightReductionFactor)
                    orphanMessage.setLeftReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
            # The next pass works on the other side
            currentReductionIsLeft = not currentReductionIsLeft

            # Restore the full size of every message and put the clustered
            # symbols back with the ones that were set aside
            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols

        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols