Example #1
File: parser.py Project: msabramo/cahoots
    def parse(self, data_string):
        """Parses input data and returns a dict of result data"""

        start_time = time.time()
        results = []
        threads = []

        # Creating/starting a thread for each parser module
        for module in self.config.enabled_modules:
            thread = ParserThread(self.config, module, data_string)
            thread.start()
            threads.append(thread)

        # Synchronizing/finishing parser threads
        for thr in threads:
            thr.join()

        # The threads are done, let's get the results out of them
        for thr in threads:
            results.extend(thr.results)

        # Unique list of all major types
        types = list({result.type for result in results})

        if results:
            # Getting a unique list of result types.
            all_types = []
            for res in results:
                all_types.extend([res.type, res.subtype])

            # Hierarchical Confidence Normalization
            normalizer_chain = HierarchicalNormalizerChain(
                self.config,
                types,
                list(set(all_types))
            )
            results = normalizer_chain.normalize(results)

            # Sorting our results by confidence value
            results = sorted(
                results,
                key=lambda result: result.confidence,
                reverse=True
            )

        return {
            'query': truncate_text(data_string),
            'date': datetime.datetime.now(),
            'execution_seconds': time.time() - start_time,
            'top': results[0] if results else None,
            'results': {
                'count': len(results),
                'types': types,
                'matches': results
            }
        }
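How the return value might be consumed: a minimal usage sketch, assuming a parser instance has already been constructed with a cahoots config (the setup is hypothetical; the field names come from the dict built above):

    parsed = parser.parse("some input string")
    top = parsed['top']                       # highest-confidence ParseResult, or None
    print(parsed['results']['count'])         # number of matches found
    for match in parsed['results']['matches']:
        print(match.type, match.subtype, match.confidence)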
Example #2
    def test_normalizer_normalizes(self):
        # Two results sharing a type, at opposite ends of the confidence range
        res = [
            ParseResult('Test', 'Test', 100),
            ParseResult('Test', 'Test', 0)
        ]

        conf = TestConfig()
        conf.enabled_confidence_normalizers.append(NormalizerStub)
        hnc = HierarchicalNormalizerChain(conf, [], [])
        results = hnc.normalize(res)

        # The stub normalizer is expected to collapse the pair to one result
        self.assertEqual(1, len(results))
        self.assertIsInstance(results[0], ParseResult)
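NormalizerStub and TestConfig are fixtures defined elsewhere in the test suite. A hypothetical reconstruction of the stub, consistent with the assertions above and assuming the chain hands each enabled normalizer the full result list (the real cahoots interface may differ):

    class NormalizerStub(object):
        """Hypothetical stub: keeps only non-zero-confidence results,
        so the two inputs above collapse to a single ParseResult."""

        @staticmethod
        def normalize(results):
            return [result for result in results if result.confidence > 0]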
Example #3
    def parse(self, data_string):
        """
        Parses input data and returns a dict of result data

        :param data_string: the string we want to parse
        :type data_string: str
        :return: a dict of parse result data
        :rtype: dict
        """
        start_time = time.time()
        results = []
        threads = []

        # Creating/starting a thread for each parser module
        for module in self.config.enabled_modules:
            thread = ParserThread(self.config, module, data_string)
            thread.start()
            threads.append(thread)

        # Synchronizing/finishing parser threads
        for thr in threads:
            thr.join()

        # The threads are done, let's get the results out of them
        for thr in threads:
            results.extend(thr.results)

        # Unique list of all major types
        types = list({result.type for result in results})

        if results:
            # Getting a unique list of result types.
            all_types = []
            for res in results:
                all_types.extend([res.type, res.subtype])

            # Hierarchical Confidence Normalization
            normalizer_chain = HierarchicalNormalizerChain(
                self.config,
                types,
                list(set(all_types))
            )
            results = normalizer_chain.normalize(results)

            # Sorting our results by confidence value
            results = sorted(
                results,
                key=lambda result: result.confidence,
                reverse=True
            )

        return {
            'query': truncate_text(data_string),
            'date': datetime.datetime.now(),
            'execution_seconds': time.time() - start_time,
            'top': results[0] if results else None,
            'results': {
                'count': len(results),
                'types': types,
                'matches': results
            }
        }
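The concurrency here is a plain fan-out/join: start one worker thread per enabled module, join them all, then merge each thread's results. The same pattern in isolation, as a self-contained sketch (Worker mimics ParserThread storing its output on self.results; the job callables are stand-ins for parser modules):

    import threading

    class Worker(threading.Thread):
        """Runs a job and stores its output on self.results, like ParserThread."""

        def __init__(self, job, data):
            super(Worker, self).__init__()
            self.job = job
            self.data = data
            self.results = []

        def run(self):
            self.results = self.job(self.data)

    def fan_out_join(jobs, data):
        threads = [Worker(job, data) for job in jobs]
        for thread in threads:
            thread.start()
        for thread in threads:      # block until every worker finishes
            thread.join()
        merged = []
        for thread in threads:
            merged.extend(thread.results)
        return merged

    # Two trivial stand-in "modules"
    print(fan_out_join([lambda s: [s.upper()], lambda s: [len(s)]], "hello"))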