def parse(self, data):
    """
    Yield a parse result for each trained language that ``data`` matches.

    The input goes through four stages: a common-token check, a
    keyword-based language shortlist, a lexer pass over the lower-cased
    input, and a Naive Bayes classification of the original input. The
    generator ends without yielding anything when the common-token check
    fails or when the lexer reports no languages.

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    tokens = self.create_dataset(data)

    # Stage 1: could this be code at all? Stop early if not.
    if not self.find_common_tokens(tokens):
        return

    # Stage 2: shortlist the languages that match on keywords alone.
    candidates = self.get_possible_languages(tokens)

    # Stage 3: let the lexer narrow the shortlist; it is given the
    # lower-cased input, unlike the classifier below.
    lexed = ProgrammingLexer(candidates, data.lower()).lex()
    if not lexed:
        return

    # Stage 4: a Naive Bayes classifier, run on the unmodified input,
    # pinpoints the best language fits.
    classified = ProgrammingBayesianClassifier().classify(data)

    # Merge both signals into one scorecard per language id.
    scorecards = self.calculate_confidence(lexed, classified)
    for language_id, card in scorecards.items():
        yield self.result(
            self.language_keywords[language_id]['name'],
            card['confidence'],
            card,
        )
def test_phpIsDetectedUsingProgrammingLexer(self):
    """Lexing a PHP ``echo`` statement against only 'php' gives {'php': 3}."""
    php_snippet = "echo 'Hello World';"

    # The lexer is restricted to 'php' alone and fed the raw snippet.
    detected = ProgrammingLexer(['php'], php_snippet).lex()

    self.assertEqual({'php': 3}, detected)