def test_bootstrapSetsUpClassifierAsExpected(self):
        """
        Bootstrapping reads the trainer zip and registers the classifier.

        Checks the calls recorded by ZipFileStub, that the tokenizer handed
        to SimpleBayesStub is a method or a function, that the registry
        holds a SimpleBayesStub under 'PP_bayes', and the languages recorded
        by the stub.
        """
        ProgrammingBayesianClassifier.bootstrap(TestConfig)

        # Zip archive interactions, in the order the stub recorded them.
        expected_zip_calls = [
            'init-trainers.zip-r',
            'namelist',
            'read-foo.def',
            'read-bar.def',
        ]
        self.assertEqual(ZipFileStub.called, expected_zip_calls)

        tokenizer = SimpleBayesStub.Tokenizer
        self.assertTrue(ismethod(tokenizer) or isfunction(tokenizer))

        registered = registry.get('PP_bayes')
        self.assertIsInstance(registered, SimpleBayesStub)

        expected_languages = {'foo': 'foo.def-text', 'bar': 'bar.def-text'}
        self.assertEqual(SimpleBayesStub.Languages, expected_languages)
Beispiel #2
0
    def parse(self, data):
        """
        Decide whether ``data`` looks like one of our trained languages.

        The check runs as a pipeline and stops early (yielding nothing) as
        soon as a stage finds no evidence of code.

        :param data: the string we want to parse
        :type data: str
        :return: yields one parse result per scored language, if any
        :rtype: ParseResult
        """
        tokens = self.create_dataset(data)

        # Stage 1: bail out unless the tokens look like code at all.
        if not self.find_common_tokens(tokens):
            return

        # Stage 2: narrow down the candidates using keywords alone.
        candidates = self.get_possible_languages(tokens)

        # Stage 3: let the lexer filter the candidates further.
        lexed = ProgrammingLexer(candidates, data.lower()).lex()
        if not lexed:
            return

        # Stage 4: ask the naive Bayes classifier, then merge its verdict
        # with the lexer's to get a scorecard per language.
        bayes = ProgrammingBayesianClassifier().classify(data)
        scorecards = self.calculate_confidence(lexed, bayes)

        for language_id, card in scorecards.items():
            yield self.result(
                self.language_keywords[language_id]["name"],
                card["confidence"],
                card
            )
Beispiel #3
0
    def parse(self, data):
        """
        Decide whether ``data`` looks like one of our trained languages.

        Yields a ParseResult for every language that receives a score;
        yields nothing when an early stage finds no sign of code.
        """
        tokens = self.create_dataset(data)

        # Stage 1: bail out unless the tokens look like code at all.
        if not self.find_common_tokens(tokens):
            return

        # Stage 2: narrow down the candidates using keywords alone.
        candidates = self.get_possible_languages(tokens)

        # Stage 3: let the lexer filter the candidates further.
        lexed = ProgrammingLexer(candidates, data.lower()).lex()
        if not lexed:
            return

        # Stage 4: ask the naive Bayes classifier, then normalize both
        # sets of findings into one confidence value per language.
        bayes = ProgrammingBayesianClassifier().classify(data)
        normalized = self.normalize_scores(data, lexed, bayes)

        for language_id, confidence in normalized.items():
            language_name = self.language_keywords[language_id]['name']
            yield ParseResult(self.type, language_name, confidence)
Beispiel #4
0
    def parse(self, data):
        """
        Decide whether ``data`` looks like one of our trained languages.

        The check runs as a pipeline and stops early (yielding nothing) as
        soon as a stage finds no evidence of code.

        :param data: the string we want to parse
        :type data: str
        :return: yields one parse result per scored language, if any
        :rtype: ParseResult
        """
        tokens = self.create_dataset(data)

        # Stage 1: bail out unless the tokens look like code at all.
        if not self.find_common_tokens(tokens):
            return

        # Stage 2: narrow down the candidates using keywords alone.
        candidates = self.get_possible_languages(tokens)

        # Stage 3: let the lexer filter the candidates further.
        lexed = ProgrammingLexer(candidates, data.lower()).lex()
        if not lexed:
            return

        # Stage 4: ask the naive Bayes classifier, then merge its verdict
        # with the lexer's to get a scorecard per language.
        bayes = ProgrammingBayesianClassifier().classify(data)
        scorecards = self.calculate_confidence(lexed, bayes)

        for language_id, card in scorecards.items():
            language_name = self.language_keywords[language_id]['name']
            yield self.result(language_name, card['confidence'], card)
    def test_classifierProducesExpectedResult(self):
        """
        Classifying a sample passes it to the stub and returns its answer.

        After bootstrapping with TestConfig, the string given to classify()
        must be the one recorded by SimpleBayesStub, and the result must be
        'FooBar'.
        """
        sample = 'echo "Hello World";'

        ProgrammingBayesianClassifier.bootstrap(TestConfig)
        outcome = ProgrammingBayesianClassifier().classify(sample)

        self.assertEqual(sample, SimpleBayesStub.data_string)
        self.assertEqual('FooBar', outcome)
Beispiel #6
0
    def test_classifierProducesExpectedResult(self):
        """
        Classifying a sample passes it to the stub and returns its answer.

        After bootstrapping with TestConfig, the string given to classify()
        must be the one recorded by SimpleBayesStub, and the result must be
        'FooBar'.
        """
        sample = 'echo "Hello World";'

        ProgrammingBayesianClassifier.bootstrap(TestConfig)
        outcome = ProgrammingBayesianClassifier().classify(sample)

        self.assertEqual(sample, SimpleBayesStub.data_string)
        self.assertEqual('FooBar', outcome)
Beispiel #7
0
    def test_bootstrapSetsUpClassifierAsExpected(self):
        """
        Bootstrapping reads the trainer zip and registers the classifier.

        Checks the calls recorded by ZipFileStub, that the tokenizer handed
        to SimpleBayesStub is a method or a function, that the registry
        holds a SimpleBayesStub under 'PP_bayes', and the languages recorded
        by the stub.
        """
        ProgrammingBayesianClassifier.bootstrap(TestConfig)

        # Zip archive interactions, in the order the stub recorded them.
        expected_zip_calls = [
            'init-trainers.zip-r',
            'namelist',
            'read-foo.def',
            'read-bar.def',
        ]
        self.assertEqual(ZipFileStub.called, expected_zip_calls)

        tokenizer = SimpleBayesStub.Tokenizer
        self.assertTrue(ismethod(tokenizer) or isfunction(tokenizer))

        registered = registry.get('PP_bayes')
        self.assertIsInstance(registered, SimpleBayesStub)

        expected_languages = {
            'foo': 'foo.def-text',
            'bar': 'bar.def-text',
        }
        self.assertEqual(SimpleBayesStub.Languages, expected_languages)
Beispiel #8
0
    def bootstrap(config):
        """
        Loads tokens from the yaml files on disk

        Every ``languages/*.yaml`` file located next to this module is
        parsed. Each file's ``keywords`` are merged into one set stored in
        the registry as ``PP_all_keywords``, and the parsed definitions are
        stored, indexed by their ``id``, as ``PP_language_keywords``.
        The Bayesian classifier is bootstrapped afterwards.

        :param config: passed through unchanged to
            ``ProgrammingBayesianClassifier.bootstrap``
        """
        all_keywords = set()
        language_keywords = {}

        directory = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(directory, "languages/*.yaml")

        for file_path in glob.glob(path):
            with open(file_path, 'r') as language_file:
                # safe_load instead of a bare yaml.load(): calling load()
                # without an explicit Loader is deprecated in PyYAML 5.1
                # and a TypeError in PyYAML 6. The language files are read
                # as plain mappings here, so the safe loader is sufficient.
                language = yaml.safe_load(language_file)

            all_keywords.update(language['keywords'])
            language_keywords[language['id']] = language

        registry.set('PP_all_keywords', all_keywords)
        registry.set('PP_language_keywords', language_keywords)

        ProgrammingBayesianClassifier.bootstrap(config)
Beispiel #9
0
    def bootstrap(config):
        """
        This method is statically called to bootstrap a parser

        Every ``languages/*.yaml`` file located next to this module is
        parsed. Each file's ``keywords`` are merged into one set stored in
        the registry as ``PP_all_keywords``, and the parsed definitions are
        stored, indexed by their ``id``, as ``PP_language_keywords``.
        The Bayesian classifier is bootstrapped afterwards.

        :param config: cahoots config
        :type config: cahoots.config.BaseConfig
        """
        all_keywords = set()
        language_keywords = {}

        directory = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(directory, "languages/*.yaml")

        for file_path in glob.glob(path):
            with open(file_path, "r") as language_file:
                # safe_load instead of a bare yaml.load(): calling load()
                # without an explicit Loader is deprecated in PyYAML 5.1
                # and a TypeError in PyYAML 6. The language files are read
                # as plain mappings here, so the safe loader is sufficient.
                language = yaml.safe_load(language_file)

            all_keywords.update(language["keywords"])
            language_keywords[language["id"]] = language

        registry.set("PP_all_keywords", all_keywords)
        registry.set("PP_language_keywords", language_keywords)

        ProgrammingBayesianClassifier.bootstrap(config)
Beispiel #10
0
    def bootstrap(config):
        """
        This method is statically called to bootstrap a parser

        Every ``languages/*.yaml`` file located next to this module is
        parsed. Each file's ``keywords`` are merged into one set stored in
        the registry as ``PP_all_keywords``, and the parsed definitions are
        stored, indexed by their ``id``, as ``PP_language_keywords``.
        The Bayesian classifier is bootstrapped afterwards.

        :param config: cahoots config
        :type config: cahoots.config.BaseConfig
        """
        all_keywords = set()
        language_keywords = {}

        directory = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(directory, "languages/*.yaml")

        for file_path in glob.glob(path):
            with open(file_path, 'r') as language_file:
                # safe_load instead of a bare yaml.load(): calling load()
                # without an explicit Loader is deprecated in PyYAML 5.1
                # and a TypeError in PyYAML 6. The language files are read
                # as plain mappings here, so the safe loader is sufficient.
                language = yaml.safe_load(language_file)

            all_keywords.update(language['keywords'])
            language_keywords[language['id']] = language

        registry.set('PP_all_keywords', all_keywords)
        registry.set('PP_language_keywords', language_keywords)

        ProgrammingBayesianClassifier.bootstrap(config)
Beispiel #11
0
 def test_tokenizerProducesExpectedList(self):
     """Tokenizing 'Hello World' must produce exactly two items."""
     tokens = ProgrammingBayesianClassifier.bayes_tokenizer('Hello World')
     token_count = len(tokens)
     self.assertEqual(2, token_count)
Beispiel #12
0
 def test_tokenizerProducesExpectedList(self):
     """Tokenizing 'Hello World' must produce exactly two items."""
     tokens = ProgrammingBayesianClassifier.bayes_tokenizer('Hello World')
     token_count = len(tokens)
     self.assertEqual(2, token_count)