def test_alignmentOfAlmostEquivalentMessages(self):
    """Check the alignment scores of two nearly identical messages.

    For each of 1000 rounds, two messages are built from the same random
    prefix and suffix wrapped around "hercule" and "thomas" respectively.
    They are aligned twice with ``alignTwoMessages``, first with ``False``
    and then with ``True`` as leading argument.  The first score of the
    ``True`` run must be strictly greater than the first score of the
    ``False`` run and strictly greater than 95.
    """
    aligner = NeedlemanAndWunsch(8)
    rounds = 1000

    for _ in range(rounds):
        prefix = self.generateRandomString(30, 40)
        suffix = self.generateRandomString(30, 40)

        # Raw payloads first, then the messages, in that order.
        raw_first, raw_second = [
            TypeConvertor.stringToNetzobRaw(prefix + word + suffix)
            for word in ("hercule", "thomas")
        ]
        first, second = [
            RawMessage(str(uuid.uuid4()), str(time.time()), raw)
            for raw in (raw_first, raw_second)
        ]

        # NOTE(review): the meaning of the boolean flag is not visible
        # here -- confirm against NeedlemanAndWunsch.alignTwoMessages.
        scores_off, _ = aligner.alignTwoMessages(False, first, second)
        top_off, _, _ = scores_off
        scores_on, _ = aligner.alignTwoMessages(True, first, second)
        top_on, _, _ = scores_on

        self.assertGreater(top_on, top_off)
        self.assertGreater(top_on, 95)
def test_semanticAlignment_bug1(self):
    """test_semanticAlignment_bug1:
    Regression test for a semantic alignment bug which prevented the
    computation of a valid regex.  Two tagged messages are aligned and
    the test verifies that more than one field is computed, i.e. that
    the bug is not coming back.

    @date 18/04/2013
    """
    # (leading digit, firstname, filler, email) for each message.
    samples = (
        ("6", "antoine", "GAHFSHQS", "*****@*****.**"),
        ("3", "luc", "CVSDHISD", "*****@*****.**"),
    )

    messages = [
        RawMessage(
            uuid.uuid4(),
            None,
            TypeConvertor.stringToNetzobRaw(lead + firstname + filler + email),
        )
        for lead, firstname, filler, email in samples
    ]

    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    nwEngine = NeedlemanAndWunsch(8, project, False, None)
    symbol = Symbol(uuid.uuid4(), "Test", project)
    symbol.addMessages(messages)

    # Tag offsets count two units per source character (1 leading char
    # -> 2, 8 filler chars -> 16).
    # NOTE(review): presumably these are hex-digit positions in the raw
    # form -- confirm against RawMessage.addSemanticTag.
    for message, (_, firstname, _, email) in zip(messages, samples):
        firstname_start = 2
        firstname_end = firstname_start + len(firstname) * 2
        email_start = firstname_end + 16
        email_end = email_start + len(email) * 2
        message.addSemanticTag("firstname", firstname_start, firstname_end)
        message.addSemanticTag("email", email_start, email_end)

    nwEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    print("Computed Regex : {0}".format(symbol.getRegex()))
    print(symbol.getCells(True))

    computedFields = symbol.getExtendedFields()
    self.assertTrue(
        len(computedFields) > 1,
        "Only one field has been computed which tells us something went wrong.",
    )
def test_alignmentOfEquivalentMessages(self):
    """Check that two identical messages always score 100.0.

    For each of 1000 rounds, two messages carrying the same random
    payload are aligned with ``alignTwoMessages``, once with ``False``
    and once with ``True`` as leading argument.  All three returned
    scores must equal 100.0 in both runs.
    """
    aligner = NeedlemanAndWunsch(8)
    rounds = 1000

    for _ in range(rounds):
        payload = self.generateRandomString(30, 40)

        # Raw payloads first, then the messages, in that order.
        raw_first = TypeConvertor.stringToNetzobRaw(payload)
        raw_second = TypeConvertor.stringToNetzobRaw(payload)
        first = RawMessage(str(uuid.uuid4()), str(time.time()), raw_first)
        second = RawMessage(str(uuid.uuid4()), str(time.time()), raw_second)

        for flag in (False, True):
            scores, _ = aligner.alignTwoMessages(flag, first, second)
            score_a, score_b, score_c = scores
            for score in (score_a, score_b, score_c):
                self.assertEqual(score, 100.0)
def sequence_execute_clicked_cb(self, widget):
    """Callback executed when the user requests to start the alignment process.

    Disables the alignment controls, reads the parameters chosen in the
    view, stores them in the configuration of the current project, builds
    the NeedlemanAndWunsch engine and finally starts the alignment job.

    @param widget: the widget which triggered the callback (unused)
    """
    view = self._view

    # Freeze every input control for the duration of the alignment.
    for control in (
        view.sequence_cancel,
        view.sequence_execute,
        view.sequence_scale,
        view.sequence_spinbutton,
        view.radiobutton4bit,
        view.radiobutton8bit,
        view.orphanButton,
        view.smoothButton,
    ):
        control.set_sensitive(False)

    # Retrieve the alignment parameters.
    similarityPercent = view.sequence_adjustment.get_value()
    # get_active() tells whether the radio button is selected.  The
    # previous get_mode() call only reports how a toggle button is drawn
    # (separate indicator or not), so it could not reflect the user's
    # 4-bit/8-bit choice.
    unitSize = 8 if view.radiobutton8bit.get_active() else 4
    orphan = view.orphanButton.get_active()
    smooth = view.smoothButton.get_active()

    # Persist the chosen parameters in the project configuration.
    project = self.vocabularyController.getCurrentProject()
    configuration = project.getConfiguration()
    configuration.setVocabularyInferenceParameter(
        ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD,
        int(similarityPercent))
    configuration.setVocabularyInferenceParameter(
        ProjectConfiguration.VOCABULARY_ORPHAN_REDUCTION, orphan)
    configuration.setVocabularyInferenceParameter(
        ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK, smooth)

    # Configure Needleman and Wunsch.
    self.alignmentSolution = NeedlemanAndWunsch(
        unitSize, project, self.doUpgma, self.percentOfAlignmentProgessBar)

    # Define the alignment JOB; the stop button becomes usable first.
    view.sequence_stop.set_sensitive(True)
    Job(self.startSequenceAlignment(unitSize))
def executeClustering(self):
    """Cluster the symbols, then align the fields of each resulting symbol.

    The method checks ``self.isFinish()`` before the clustering, before
    the alignment stage and before each symbol; as soon as it reports
    True the method stops and returns None.

    @return the new list of symbols, or None when interrupted
    """
    self.log.debug(
        "Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(
            self.nbIteration, self.minEquivalence))

    # Stage 0/1: run the UPGMA on the symbols.
    if self.isFinish():
        return None
    self.cb_executionStatus(0, 0, "Clustering into symbols...")
    self.processUPGMA()
    self.cb_executionStatus(1, 100, None)

    # Stage 2: align every symbol to build its definition.
    self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")
    if self.isFinish():
        return None

    self.currentAlignment = NeedlemanAndWunsch(
        self.unitSize, self.project, False, self.cb_status)
    self.currentAlignment.absoluteStage = 2
    self.currentAlignment.statusRatio = len(self.symbols)
    self.currentAlignment.statusRatioOffset = 0

    for symbol in self.symbols:
        if self.isFinish():
            return None
        self.currentAlignment.alignField(symbol.getField())
        # One more symbol done: shift the progress offset.
        self.currentAlignment.statusRatioOffset += 1

    return self.symbols
def test_deserialisationMessages(self):
    """Check that ``deserializeMessages`` reports every message it is given.

    Ten rounds are executed; each builds between 2 and 500 messages with
    random payloads and asserts that the value returned by
    ``deserializeMessages`` equals the number of messages provided.
    """
    rounds = 10
    aligner = NeedlemanAndWunsch(8)

    for _ in range(rounds):
        # Generate a random number of messages to serialize.
        expected_count = random.randint(2, 500)

        batch = []
        for _ in range(expected_count):
            payload = TypeConvertor.stringToNetzobRaw(
                self.generateRandomString(5, 500))
            batch.append(
                RawMessage(str(uuid.uuid4()), str(time.time()), payload))

        deserialized_count = aligner.deserializeMessages(batch)
        self.assertEqual(expected_count, deserialized_count)
def test_AlignementOfMessages(self):
    """Check the alignment scores of a group of similar messages.

    For each of 100 rounds, between 2 and 50 messages are built as
    "bonjour" + a random string + ", tout va bien ?".  The group is
    aligned twice with ``align``, first with ``False`` and then with
    ``True`` as leading argument.  The first score of the ``True`` run
    must be at least the first score of the ``False`` run and at least 90.
    """
    aligner = NeedlemanAndWunsch(4)
    rounds = 100

    for _ in range(rounds):
        # Generate a random number of messages to align.
        message_count = random.randint(2, 50)

        batch = []
        for _ in range(message_count):
            payload = TypeConvertor.stringToNetzobRaw(
                "bonjour" + self.generateRandomString(5, 30) + ", tout va bien ?")
            batch.append(
                RawMessage(str(uuid.uuid4()), str(time.time()), payload))

        # align() yields (alignment, scores), in that order.
        alignment_off, scores_off = aligner.align(False, batch)
        top_off, _, _ = scores_off
        alignment_on, scores_on = aligner.align(True, batch)
        top_on, _, _ = scores_on

        print(alignment_off)
        print(alignment_on)

        self.assertGreaterEqual(top_on, top_off)
        self.assertGreaterEqual(top_on, 90)
def clusterByTokenization(self, symbols):
    """Merge symbols whose messages share a token pattern, then align them.

    The method works on ``self.symbols`` in two phases:

    1. Clustering: the token pattern of the first message of the current
       symbol is compared with the patterns of the messages of the other
       symbols, walking from the end of the list.  Two symbols are merged
       when a message has an equal pattern and the similarity ratio
       ``score * 2.0 / min(len(data))`` reaches 0.40, provided the
       destination check on ``self.server`` allows it.
    2. Alignment: an alignment is computed for each symbol and turned into
       a regex.  When that fails, the symbol falls back to a single
       catch-all field.  Finally all symbols are aligned together and
       ``self.symbols`` is replaced by the result.

    @param symbols: not used by this method, which reads ``self.symbols``
    """
    self.ServerInference()

    # ---- Cluster messages according to their tokens ----
    ll = len(self.symbols) - 1
    i_equ = 0
    while ll > 0:
        currentPattern = self.symbols[i_equ].getMessages()[0].getPattern()[1]
        for j in range(ll):
            # Candidates are visited from the end of the symbol list.
            jnext = len(self.symbols) - j - 1
            cond = False
            for message in self.symbols[jnext].getMessages():
                if currentPattern == message.getPattern()[1]:
                    score2 = self.computeSimilarities(
                        currentPattern, message.getPattern()[1])
                    minilength = min(
                        len(message.getData()),
                        len(self.symbols[i_equ].getMessages()[0].getData()))
                    if score2 * 2.0 / minilength >= 0.40:
                        # One similar enough message is sufficient.
                        cond = True
                        break

            if cond:
                currentDst = self.symbols[i_equ].getPattern()[0]
                otherDst = self.symbols[jnext].getPattern()[0]
                # Merge when no server is known, when both symbols have the
                # same destination, or when neither destination is the server.
                if not self.server or (currentDst == otherDst) or (
                        currentDst != self.server and otherDst != self.server):
                    self.mergeEffectiveRowCol(i_equ, jnext)
                    # Cancel the increment below so that the merged symbol
                    # is examined again on the next pass.
                    i_equ -= 1
                    break

        ll -= 1
        i_equ += 1

    # ---- Align messages ----
    # NOTE(review): the second positional argument given here is the
    # status callback; confirm it matches the constructor signature.
    alignment = NeedlemanAndWunsch(self.unitSize, self.cb_status)
    for symbol in self.symbols:
        try:
            al = self.computeAlignment(symbol)
            symbol.getField().setAlignment(al)
            alignment.buildRegexFromAlignment(symbol, al, self.defaultFormat)
        except Exception:
            # Only ordinary errors are handled, so KeyboardInterrupt and
            # SystemExit still propagate.  The traceback is logged because
            # the message below only guesses at the cause.
            logging.warning(
                "Partitionnement error: too much fields ( > 100) for the symbol '"
                + symbol.getName() + "' len="
                + str(len(symbol.getExtendedFields())) + "len "
                + str(len(symbol.getPattern()[1])),
                exc_info=True)
            # Fall back to a single catch-all field for this symbol.
            symbol.getField().removeLocalFields()
            field = Field("Field 0", "(.{, })", symbol)
            symbol.addLocalField(field)
            # Use the default protocol type for representation.
            field.setFormat(self.defaultFormat)

    alignment.alignSymbols(self.symbols, self.project)
    self.symbols = alignment.getLastResult()
def test_semanticAlignment_simple(self):
    """test_semanticAlignment_simple:
    Test that messages with embedded semantic are efficiently aligned.

    Format : <random 10 bytes><random username><random 5 ASCII><random email>

    500 such messages are tagged ("username", "email"), aligned, and the
    test expects exactly 4 computed fields and a regex valid for every
    message.

    Optimal Needleman & Wunsch Parameters :
        // Cost definitions for the alignment
        static const short int MATCH = 5;
        static const short int SEMANTIC_MATCH = 30;
        static const short int MISMATCH = -5;
        static const short int GAP = 0;
        static const short int BLEN = 10;
        // Consts for the definition of a mask
        static const unsigned char END = 2;
        static const unsigned char DIFFERENT = 1;
        static const unsigned char EQUAL = 0;
    """
    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    symbol = Symbol(uuid.uuid4(), "Test", project)

    message_count = 500
    usernames = []
    emails = []
    for _ in range(message_count):
        # Random username.
        plain_username = self.generateRandomString(4, 10)
        username = TypeConvertor.stringToNetzobRaw(plain_username)
        usernames.append(plain_username)

        # Random email: <prefix>@<domain>.<extension>
        email_prefix = self.generateRandomString(4, 10)
        email_domain = self.generateRandomString(4, 10)
        email_extension = self.generateRandomString(2, 3)
        plain_email = "{0}@{1}.{2}".format(
            email_prefix, email_domain, email_extension)
        emails.append(plain_email)
        email = TypeConvertor.stringToNetzobRaw(plain_email)

        # Random surrounding content.
        header = self.generateRandomBytes(10, 10)
        separator = TypeConvertor.stringToNetzobRaw(
            self.generateRandomString(5, 5))
        data = "{0}{1}{2}{3}".format(header, username, separator, email)

        # Tag boundaries are running offsets inside the concatenated data.
        username_start = len(header)
        username_end = username_start + len(username)
        email_start = username_end + len(separator)
        email_end = email_start + len(email)

        message = RawMessage(uuid.uuid4(), None, data)
        message.addSemanticTag("username", username_start, username_end)
        message.addSemanticTag("email", email_start, email_end)
        symbol.addMessage(message)

    nwEngine = NeedlemanAndWunsch(8, project, False, None)
    nwEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    print("Number of computed fields : {0}".format(
        len(symbol.getExtendedFields())))
    self.assertEqual(4, len(symbol.getExtendedFields()))

    valid_count = 0
    for message in symbol.getMessages():
        is_valid = symbol.getField().isRegexValidForMessage(message)
        if is_valid:
            valid_count += 1
        self.assertTrue(is_valid)

    print(symbol.getCells())
    print("Computed regex is valid for {0}/{1} messages.".format(
        valid_count, len(symbol.getMessages())))
def executeOrphanReduction(self):
    """Merge orphan symbols by clustering progressively reduced messages.

    An orphan is a symbol holding exactly one message.  While both
    reduction factors stay below 80 and more than one orphan remains,
    the orphans' messages are reduced in steps of 10 and re-clustered
    through ``executeClustering``.  Reduction factors are reset to 0
    afterwards and the symbols are merged back with the non-orphans.
    A final alignment is computed for each symbol.

    @return the list of symbols after the reduction
    """
    step = 10
    left_factor = 0
    right_factor = 0
    left_pass_due = False

    while left_factor < 80 and right_factor < 80:
        # Split the current symbols: single-message ones are orphans,
        # multi-message ones are kept aside until the end of the pass.
        orphans = [s for s in self.symbols if len(s.getMessages()) == 1]
        kept_symbols = [s for s in self.symbols if len(s.getMessages()) > 1]

        if len(orphans) <= 1:
            self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
            break

        self.symbols = orphans

        # NOTE(review): the two tests below are consecutive `if`s, not
        # if/else, so a left pass is always followed by a right pass in
        # the same iteration -- confirm whether strict alternation was
        # intended.
        if left_pass_due:
            left_factor += step
            # Reduce the size of the messages from the left.
            for orphan in self.symbols:
                orphan.getMessages()[0].setLeftReductionFactor(left_factor)
                orphan.getMessages()[0].setRightReductionFactor(0)
            self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(left_factor)))
            self.executeClustering()
            left_pass_due = False

        if not left_pass_due:
            right_factor += step
            # Reduce the size of the messages from the right.
            for orphan in self.symbols:
                orphan.getMessages()[0].setRightReductionFactor(right_factor)
                orphan.getMessages()[0].setLeftReductionFactor(0)
            self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(right_factor)))
            self.executeClustering()
            left_pass_due = True

        # Restore full-size messages and merge back with the kept symbols.
        for orphan in self.symbols:
            for message in orphan.getMessages():
                message.setLeftReductionFactor(0)
                message.setRightReductionFactor(0)
            kept_symbols.append(orphan)
        self.symbols = kept_symbols

    self.cb_executionStatus(3, 50.0, "Executing last alignment...")
    aligner = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
    # Compute the regex/alignment of each symbol.
    for symbol in self.symbols:
        aligner.alignField(symbol.getField())
    return self.symbols