def test_bootstrapSetsUpClassifierAsExpected(self):
    # Bootstraps the classifier against the test config, then verifies the
    # recorded zip-file calls, the tokenizer hook, the registry entry and
    # the languages handed to the bayes stub.
    ProgrammingBayesianClassifier.bootstrap(TestConfig)

    expected_zip_calls = [
        'init-trainers.zip-r',
        'namelist',
        'read-foo.def',
        'read-bar.def',
    ]
    self.assertEqual(ZipFileStub.called, expected_zip_calls)

    # The tokenizer may be exposed either as a method or a plain function.
    tokenizer = SimpleBayesStub.Tokenizer
    self.assertTrue(ismethod(tokenizer) or isfunction(tokenizer))

    registered = registry.get('PP_bayes')
    self.assertIsInstance(registered, SimpleBayesStub)

    expected_languages = {
        'foo': 'foo.def-text',
        'bar': 'bar.def-text',
    }
    self.assertEqual(SimpleBayesStub.Languages, expected_languages)
def parse(self, data):
    """
    Checks whether the data is an example of one of our trained languages

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    tokens = self.create_dataset(data)

    # Stage 1: stop early when the data cannot possibly be code.
    if not self.find_common_tokens(tokens):
        return

    # Stage 2: pick candidate languages from keywords alone.
    candidates = self.get_possible_languages(tokens)

    # Stage 3: let the smarter lexer judge the candidates; it works on
    # the lowercased input.
    lexed = ProgrammingLexer(candidates, data.lower()).lex()
    if not lexed:
        return

    # Stage 4: run the naive bayes classifier on the untouched input and
    # combine both opinions into one scorecard per language.
    classified = ProgrammingBayesianClassifier().classify(data)
    scorecards = self.calculate_confidence(lexed, classified)

    for language_id, card in scorecards.items():
        yield self.result(
            self.language_keywords[language_id]["name"],
            card["confidence"],
            card
        )
def parse(self, data):
    """
    Checks whether the data is an example of one of our trained languages

    Yields one ParseResult per entry in the normalized scores; yields
    nothing when the data is ruled out by an earlier step.
    """
    tokens = self.create_dataset(data)

    # Stage 1: stop early when the data cannot possibly be code.
    if not self.find_common_tokens(tokens):
        return

    # Stage 2: pick candidate languages from keywords alone.
    candidates = self.get_possible_languages(tokens)

    # Stage 3: let the smarter lexer judge the candidates; it works on
    # the lowercased input.
    lexed = ProgrammingLexer(candidates, data.lower()).lex()
    if not lexed:
        return

    # Stage 4: run the naive bayes classifier on the untouched input and
    # normalize both sets of scores into a confidence per language.
    classified = ProgrammingBayesianClassifier().classify(data)
    confidences = self.normalize_scores(data, lexed, classified)

    for language_id, confidence in confidences.items():
        yield ParseResult(
            self.type,
            self.language_keywords[language_id]['name'],
            confidence
        )
def parse(self, data):
    """
    Decides whether the data is an example of one of our trained languages

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    dataset = self.create_dataset(data)

    # First gate: is this possibly code at all?
    looks_like_code = self.find_common_tokens(dataset)
    if not looks_like_code:
        return

    # Second pass: languages that match on keywords alone.
    keyword_matches = self.get_possible_languages(dataset)

    # Third pass: languages the smarter lexer still accepts, judged on
    # the lowercased input.
    lexer_matches = ProgrammingLexer(keyword_matches, data.lower()).lex()
    if not lexer_matches:
        return

    # Final pass: the naive bayes classifier sees the untouched input;
    # lexer and classifier output are merged into per-language scorecards.
    bayes_matches = ProgrammingBayesianClassifier().classify(data)
    scorecards = self.calculate_confidence(lexer_matches, bayes_matches)

    for language_id, card in scorecards.items():
        yield self.result(
            self.language_keywords[language_id]['name'],
            card['confidence'],
            card
        )
def test_classifierProducesExpectedResult(self):
    # Classifies a sample string and checks both what the bayes stub
    # received and what the classifier handed back.
    ProgrammingBayesianClassifier.bootstrap(TestConfig)

    outcome = ProgrammingBayesianClassifier().classify('echo "Hello World";')

    received = SimpleBayesStub.data_string
    self.assertEqual('echo "Hello World";', received)
    self.assertEqual('FooBar', outcome)
def test_bootstrapSetsUpClassifierAsExpected(self):
    # After bootstrapping with the test config, checks in order: the calls
    # recorded by the zip-file stub, the kind of the tokenizer hook, the
    # object stored in the registry, and the languages given to the stub.
    ProgrammingBayesianClassifier.bootstrap(TestConfig)

    zip_calls = ['init-trainers.zip-r', 'namelist']
    zip_calls += ['read-foo.def', 'read-bar.def']
    self.assertEqual(ZipFileStub.called, zip_calls)

    # Accept the tokenizer whether it surfaces as a method or a function.
    hook = SimpleBayesStub.Tokenizer
    hook_is_callable_kind = ismethod(hook) or isfunction(hook)
    self.assertTrue(hook_is_callable_kind)

    self.assertIsInstance(registry.get('PP_bayes'), SimpleBayesStub)

    trained = {'foo': 'foo.def-text', 'bar': 'bar.def-text'}
    self.assertEqual(SimpleBayesStub.Languages, trained)
def bootstrap(config):
    """
    Loads tokens from the yaml files on disk

    Every ``languages/*.yaml`` file next to this module is parsed. The
    combined keywords are stored in the registry as a set under
    ``PP_all_keywords``; the parsed definitions, keyed by their ``id``,
    are stored under ``PP_language_keywords``. The bayesian classifier
    is bootstrapped last.

    :param config: configuration passed on to the classifier bootstrap
    """
    all_keywords = []
    language_keywords = {}

    directory = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(directory, "languages/*.yaml")

    for file_path in glob.glob(path):
        with open(file_path, 'r') as language_file:
            # safe_load instead of load: the definitions are plain data,
            # and yaml.load() without an explicit Loader warns on
            # PyYAML 5.1+ and raises TypeError on PyYAML 6.
            language = yaml.safe_load(language_file)

        all_keywords.extend(language['keywords'])
        language_keywords[language['id']] = language

    registry.set('PP_all_keywords', set(all_keywords))
    registry.set('PP_language_keywords', language_keywords)

    ProgrammingBayesianClassifier.bootstrap(config)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Every ``languages/*.yaml`` file next to this module is parsed. The
    combined keywords are stored in the registry as a set under
    ``PP_all_keywords``; the parsed definitions, keyed by their ``id``,
    are stored under ``PP_language_keywords``. The bayesian classifier
    is bootstrapped last with the same config.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    all_keywords = []
    language_keywords = {}

    directory = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(directory, "languages/*.yaml")

    for file_path in glob.glob(path):
        with open(file_path, "r") as language_file:
            # safe_load instead of load: the definitions are plain data,
            # and yaml.load() without an explicit Loader warns on
            # PyYAML 5.1+ and raises TypeError on PyYAML 6.
            language = yaml.safe_load(language_file)

        all_keywords.extend(language["keywords"])
        language_keywords[language["id"]] = language

    registry.set("PP_all_keywords", set(all_keywords))
    registry.set("PP_language_keywords", language_keywords)

    ProgrammingBayesianClassifier.bootstrap(config)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Every ``languages/*.yaml`` file next to this module is parsed. The
    combined keywords are stored in the registry as a set under
    ``PP_all_keywords``; the parsed definitions, keyed by their ``id``,
    are stored under ``PP_language_keywords``. The bayesian classifier
    is bootstrapped last with the same config.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    all_keywords = []
    language_keywords = {}

    directory = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(directory, "languages/*.yaml")

    for file_path in glob.glob(path):
        with open(file_path, 'r') as language_file:
            # safe_load instead of load: the definitions are plain data,
            # and yaml.load() without an explicit Loader warns on
            # PyYAML 5.1+ and raises TypeError on PyYAML 6.
            language = yaml.safe_load(language_file)

        all_keywords.extend(language['keywords'])
        language_keywords[language['id']] = language

    registry.set('PP_all_keywords', set(all_keywords))
    registry.set('PP_language_keywords', language_keywords)

    ProgrammingBayesianClassifier.bootstrap(config)
def test_tokenizerProducesExpectedList(self):
    # The tokenizer output for 'Hello World' must have exactly two entries.
    tokens = ProgrammingBayesianClassifier.bayes_tokenizer('Hello World')
    token_count = len(tokens)
    self.assertEqual(2, token_count)