def run_parser(self, name, file_path=None, data=b"", **kwargs):
        """
        Run every parser matching ``name`` against a single input.

        The input is read from ``file_path`` when one is given, otherwise the raw ``data``
        bytes are used. A failure inside one parser is logged and the remaining parsers
        are still run.

        :param str name: name of parser module to run (use ":" notation to specify source if necessary e.g. "mwcp-acme:Foo")
        :param str file_path: file to parse
        :param bytes data: bytes to parse when no file_path is supplied
        :param kwargs: accepted for call compatibility; not used by this method
        """
        self.__reset()

        # TODO: Remove all traces of the input file in the reporter!!
        #  (kept around for now because tool.py uses it for pulling file info)
        if not file_path:
            self.input_file = mwcp.FileObject(data, self, output_file=False)
        else:
            with open(file_path, 'rb') as stream:
                contents = stream.read()
                base_name = os.path.basename(file_path)
                self.input_file = mwcp.FileObject(
                    contents, self, file_name=base_name, output_file=False)
                self.input_file.file_path = file_path

        try:
            with self.__redirect_stdout():
                matched = False
                for source, parser in mwcp.iter_parsers(name):
                    matched = True
                    try:
                        parser.parse(self.input_file, self)
                    except (Exception, SystemExit):
                        # Identify the input by path when we have one, else by its md5.
                        message = "Error running parser {}:{} on {}".format(
                            source.name, parser.name, file_path or self.input_file.md5)
                        logger.exception(message)

                if not matched:
                    logger.error('Could not find parsers with name: {}'.format(name))
        finally:
            # Always run cleanup, even if redirecting stdout or iterating parsers failed.
            self.__cleanup()
Exemple #2
0
def test_register_parser_directory2(make_sample_parser):
    """Registering a directory under an explicit source name exposes its parser."""
    registry.clear()

    parser_path, config_path = make_sample_parser()
    directory = str(parser_path.dirname)

    # Nothing named "Sample" may be visible before the directory is registered.
    assert not list(mwcp.iter_parsers('Sample'))
    mwcp.register_parser_directory(
        directory, config_file_path=str(config_path), source_name='ACME')
    registered = list(mwcp.iter_parsers('Sample'))
    assert len(registered) == 1

    # The single entry must carry the expected parser and source metadata.
    source, parser = registered[0]
    assert parser.name == 'Sample'
    assert source.name == 'ACME'
    assert source.path == directory

    # Lookup by source keyword and by "SOURCE:" prefix must both find the parser.
    by_keyword = list(mwcp.iter_parsers(source='ACME'))
    assert len(by_keyword) == 1
    by_prefix = list(mwcp.iter_parsers('ACME:'))
    assert len(by_prefix) == 1
Exemple #3
0
def test_register_parser_directory(monkeypatch, Sample_parser):
    """Registering a directory without a source name uses the directory as the source name."""
    # Start from an empty source registry so earlier tests cannot interfere.
    monkeypatch.setattr('mwcp.registry._sources', {})

    parser_path, config_path = Sample_parser
    directory = str(parser_path.dirname)

    # Nothing named "Sample" may be visible before the directory is registered.
    assert not list(mwcp.iter_parsers('Sample'))
    mwcp.register_parser_directory(directory, config_file_path=str(config_path))
    registered = list(mwcp.iter_parsers('Sample'))
    assert len(registered) == 1

    # With no explicit source name, both name and path equal the directory.
    source, parser = registered[0]
    assert parser.name == 'Sample'
    assert source.name == directory
    assert source.path == directory

    # Lookup by source keyword and by "SOURCE:" prefix must both find the parser.
    by_keyword = list(mwcp.iter_parsers(source=directory))
    assert len(by_keyword) == 1
    by_prefix = list(mwcp.iter_parsers(directory + ':'))
    assert len(by_prefix) == 1
Exemple #4
0
def test_iter_parsers(make_sample_parser):
    """iter_parsers yields the configured group, and its components when config_only is False."""
    registry.clear()

    parser_path, config_path = make_sample_parser()
    source = os.path.abspath(str(parser_path.dirname))
    mwcp.register_parser_directory(source, config_file_path=str(config_path))

    # By name, only the configured "Sample" group is returned.
    named = list(mwcp.iter_parsers('Sample'))
    assert len(named) == 1

    found_source, group = named[0]
    assert group.__class__ == mwcp.Dispatcher
    assert group.name == 'Sample'
    assert found_source.path == source
    assert len(group.parsers) == 2
    assert group.DESCRIPTION == 'A test parser'

    def by_description(entry):
        # Each entry is a (source, parser) pair; order on the parser's description.
        return entry[1].DESCRIPTION

    # With config_only disabled the group plus both components are listed.
    everything = sorted(mwcp.iter_parsers(config_only=False), key=by_description)
    assert len(everything) == 3

    found_source, group = everything[0]
    assert group.__class__ == mwcp.Dispatcher
    assert group.name == 'Sample'
    assert len(group.parsers) == 2
    downloader_parser, implant_parser = group.parsers
    assert group.DESCRIPTION == 'A test parser'
    assert downloader_parser.DESCRIPTION == 'TestParser Downloader'
    assert implant_parser.DESCRIPTION == 'TestParser Implant'

    assert everything[1][1] == downloader_parser
    assert everything[2][1] == implant_parser
Exemple #5
0
def test_recursive_error(make_sample_parser):
    """A parser group that (indirectly) includes itself must be reported as a recursive loop."""
    registry.clear()

    parser_path, config_file = make_sample_parser(config_text=u'''
        
Sample:
    description: A test parser
    author: Mr. Tester
    parsers:
        - .Downloader
        - .Implant
        - Sample2
        
Sample2:
    description: A test parser 2
    author: Mr. Tester
    parsers:
        - Sample.Downloader  # This one should be fine.
        - Sample             # It should complain about this.

        
        ''')
    directory = str(parser_path.dirname)

    mwcp.register_parser_directory(
        directory, config_file_path=str(config_file), source_name='ACME')

    # The loop is only detected once the parsers are actually resolved.
    with pytest.raises(RuntimeError) as raised:
        list(mwcp.iter_parsers('Sample'))
    error_text = str(raised.value)
    assert 'Detected recursive loop: Sample2 -> Sample' in error_text
    def test_cases(self):
        """
        Return the list of test cases, building and caching it on first use.

        Input file paths found in a results file are made absolute, resolved relative
        to the directory holding that results file.
        """
        if self._test_cases is not None:
            return self._test_cases

        self._test_cases = []
        for parser_name in self.parser_names:
            # parser_name may match parsers from several sources, so walk all of them.
            matched = False
            for source, parser in mwcp.iter_parsers(parser_name):
                matched = True
                full_parser_name = '{}:{}'.format(source.name, parser.name)
                results_file_path = self.get_results_filepath(full_parser_name)

                if not os.path.isfile(results_file_path):
                    # Warn user if they are missing a test file for a parser group.
                    logger.warning('Test case file not found: {}'.format(results_file_path))
                    continue

                results_dir = os.path.dirname(results_file_path)
                for expected_results in self.parse_results_file(results_file_path):
                    # NOTE: os.path.join drops results_dir when the stored path is already absolute.
                    joined = os.path.join(results_dir, expected_results[INPUT_FILE_PATH])
                    expected_results[INPUT_FILE_PATH] = os.path.abspath(joined)

                    case = TestCase(
                        self.reporter, full_parser_name, expected_results,
                        field_names=self.field_names, ignore_field_names=self.ignore_field_names)
                    self._test_cases.append(case)

            if parser_name and not matched:
                # A named parser that matched nothing is recorded as a failed result.
                orphan = TestResult(
                    parser=parser_name,
                    passed=False,
                    errors=['Parser not found.']
                )
                self._results.append(orphan)
        return self._test_cases
Exemple #7
0
    def get_results_filepath(self, name, source=None):
        """
        Build the results file path for the first parser matching ``name``.

        Resolution order: the explicitly configured ``self.results_dir``, then a
        "parsertests" folder next to a directory source, then a "parsertests" resource
        of the parser's top level module.

        :raises ValueError: if no parser matches ``name``
        """
        # TODO: Remove hardcoding "parsertests" folder. Determine better way to handle this.
        matches = mwcp.iter_parsers(name, source=source)
        for parser_name, parser_source, klass in matches:
            file_name = parser_name + FILE_EXTENSION

            # An explicitly requested results dir wins over everything else.
            if self.results_dir:
                return os.path.join(self.results_dir, file_name)

            # Directory sources are assumed to have a sibling "parsertests" folder.
            if os.path.isdir(parser_source):
                sibling = os.path.join(parser_source, '..', 'parsertests', file_name)
                return os.path.normpath(sibling)

            # Otherwise locate "parsertests" through the parser's top level module.
            top_level_module = klass.__module__.partition('.')[0]
            results_dir = pkg_resources.resource_filename(top_level_module, 'parsertests')
            return os.path.join(results_dir, file_name)

        raise ValueError('Invalid parser: {}'.format(name))
Exemple #8
0
def test_missing_parser_class(Sample_parser, tmpdir):
    """A config entry naming a class that does not exist must raise a clear error."""
    registry.clear()

    parser_path, config_file = Sample_parser
    directory = str(parser_path.dirname)

    # Overwrite the config so the group references a non-existent ".NoExist" component.
    config_file.write_text(
        u'''

Sample:
    description: A test parser
    author: Mr. Tester
    parsers:
        - .Downloader
        - .Implant
        - .NoExist

    ''', 'utf8')
    mwcp.register_parser_directory(
        directory, config_file_path=str(config_file), source_name='ACME')

    # The missing class is only noticed once the parsers are resolved.
    with pytest.raises(RuntimeError) as raised:
        list(mwcp.iter_parsers('Sample'))
    error_text = str(raised.value)
    assert 'Unable to find Sample.NoExist' in error_text
Exemple #9
0
    def test_cases(self):
        """Return the list of test cases, building and caching it on first use."""
        if self._test_cases is not None:
            return self._test_cases

        self._test_cases = []
        for parser_name in self.parser_names:
            # parser_name may match parsers from several sources, so walk all of them.
            matched = False
            for source, parser in mwcp.iter_parsers(parser_name):
                matched = True
                full_parser_name = "{}:{}".format(source.name, parser.name)
                results_file_path = self.get_results_filepath(full_parser_name)

                if not os.path.isfile(results_file_path):
                    # Warn user if they are missing a test file for a parser group.
                    logger.warning(
                        "Test case file not found: {}".format(results_file_path))
                    continue

                for expected_results in self.read_results_file(results_file_path):
                    case = TestCase(
                        self.reporter,
                        full_parser_name,
                        expected_results,
                        field_names=self.field_names,
                        ignore_field_names=self.ignore_field_names,
                    )
                    self._test_cases.append(case)

            if parser_name and not matched:
                # A named parser that matched nothing is recorded as a failed result.
                orphan = TestResult(
                    parser=parser_name, passed=False, errors=["Parser not found."])
                self._results.append(orphan)
        return self._test_cases
Exemple #10
0
def test_register_parser_directory(monkeypatch, test_parser):
    """Registering the directory of a parser file makes that parser discoverable."""
    # Start from an empty parser table so earlier tests cannot interfere.
    monkeypatch.setattr('mwcp.parsers._PARSERS', collections.defaultdict(dict))

    # Nothing named "test_parser" may be visible before registration.
    assert not list(mwcp.iter_parsers('test_parser'))
    parser_dir = os.path.dirname(test_parser)
    mwcp.register_parser_directory(parser_dir)
    registered = list(mwcp.iter_parsers('test_parser'))
    assert len(registered) == 1

    # Entries are (name, source_name, class) triples; the class is not inspected here.
    name, source_name, klass = registered[0]
    assert name == 'test_parser'
    assert source_name == os.path.dirname(test_parser)

    # Lookup by source keyword and by "SOURCE:" prefix must both find the parser.
    by_keyword = list(mwcp.iter_parsers(source=os.path.dirname(test_parser)))
    assert len(by_keyword) == 1
    by_prefix = list(mwcp.iter_parsers(os.path.dirname(test_parser) + ':'))
    assert len(by_prefix) == 1
Exemple #11
0
    def run_parser(self, name, file_path=None, data=b"", **kwargs):
        """
        Instantiate and run every parser class matching ``name`` on a single input.

        The input is read from ``file_path`` when one is given, otherwise the raw ``data``
        bytes are used. A failure inside one parser is reported through ``self.error`` and
        the remaining parsers are still run.

        :param str name: name of parser module to run (use ":" notation to specify source if necessary e.g. "mwcp-acme:Foo")
        :param str file_path: file to parse
        :param bytes data: bytes to parse when no file_path is supplied
        :param kwargs: forwarded to each parser's ``run()``
        """
        self.__reset()

        if not file_path:
            self.input_file = mwcp.FileObject(data, self, output_file=False)
        else:
            with open(file_path, 'rb') as stream:
                contents = stream.read()
                base_name = os.path.basename(file_path)
                self.input_file = mwcp.FileObject(
                    contents, self, file_name=base_name, output_file=False)
                self.input_file.file_path = file_path

        try:
            with self.__redirect_stdout():
                matched = False
                for parser_name, source, parser_class in mwcp.iter_parsers(name):
                    matched = True
                    with self.input_file as fo:
                        # Expose the opened input as the reporter's current handle.
                        self._handle = fo
                        try:
                            parser = parser_class(reporter=self)
                            parser.run(**kwargs)
                        except (Exception, SystemExit):
                            # Identify the input by path when we have one, else by md5 of data.
                            identifier = file_path if file_path else hashlib.md5(data).hexdigest()
                            details = "Error running parser {}:{} on {}: {}".format(
                                source, parser_name, identifier, traceback.format_exc())
                            self.error(details)

                if not matched:
                    self.error('Could not find parsers with name: {}'.format(name))
        finally:
            # Always run cleanup, even if redirecting stdout or iterating parsers failed.
            self.__cleanup()
Exemple #12
0
def test_non_importable_module(make_sample_parser):
    """A parser module that fails to import must surface the ImportError."""
    registry.clear()

    parser_path, config_file = make_sample_parser()
    directory = str(parser_path.dirname)

    # Replace the module body with an import of a module that does not exist.
    parser_path.write('\nimport dummy\n', mode='w+')

    mwcp.register_parser_directory(
        directory, config_file_path=str(config_file), source_name='ACME')

    # The import failure only shows up once the parsers are resolved.
    with pytest.raises(ImportError) as raised:
        list(mwcp.iter_parsers('Sample'))
    error_text = str(raised.value)
    assert "No module named 'dummy'" in error_text
Exemple #13
0
    def test_cases(self):
        """
        Return the list of test cases, building and caching it on first use.

        Input file paths found in a results file are made absolute, resolved relative
        to the directory holding that results file.
        """
        if self._test_cases is not None:
            return self._test_cases

        self._test_cases = []
        for parser_name in self.parser_names:
            # parser_name may match parsers from several sources, so walk all of them.
            matched = False
            for source, parser in mwcp.iter_parsers(parser_name):
                matched = True
                full_parser_name = '{}:{}'.format(source.name, parser.name)
                results_file_path = self.get_results_filepath(full_parser_name)

                if not os.path.isfile(results_file_path):
                    # Warn user if they are missing a test file for a parser group.
                    logger.warning(
                        'Test case file not found: {}'.format(results_file_path))
                    continue

                base_dir = os.path.dirname(results_file_path)
                for expected_results in self.parse_results_file(results_file_path):
                    # NOTE: os.path.join drops base_dir when the stored path is already absolute.
                    relative_or_absolute = expected_results[INPUT_FILE_PATH]
                    expected_results[INPUT_FILE_PATH] = os.path.abspath(
                        os.path.join(base_dir, relative_or_absolute))

                    case = TestCase(
                        self.reporter,
                        full_parser_name,
                        expected_results,
                        field_names=self.field_names,
                        ignore_field_names=self.ignore_field_names,
                    )
                    self._test_cases.append(case)

            if parser_name and not matched:
                # A named parser that matched nothing is recorded as a failed result.
                orphan = TestResult(
                    parser=parser_name, passed=False, errors=['Parser not found.'])
                self._results.append(orphan)
        return self._test_cases
Exemple #14
0
    def run_parser(self, name, file_path=None, data=b"", **kwargs):
        """
        Run every parser matching ``name`` against a single input.

        The input is read from ``file_path`` when one is given, otherwise the raw ``data``
        bytes are used. A failure inside one parser is logged and the remaining parsers
        are still run.

        :param str name: name of parser module to run (use ":" notation to specify source if necessary e.g. "mwcp-acme:Foo")
        :param str file_path: file to parse
        :param bytes data: bytes to parse when no file_path is supplied
        :param kwargs: accepted for call compatibility; not used by this method
        """
        self.__reset()

        # TODO: Remove all traces of the input file in the reporter!!
        #  (kept around for now because tool.py uses it for pulling file info)
        if not file_path:
            self.input_file = mwcp.FileObject(data, self, output_file=False)
        else:
            with open(file_path, "rb") as stream:
                contents = stream.read()
                base_name = os.path.basename(file_path)
                self.input_file = mwcp.FileObject(
                    contents, self, file_name=base_name, output_file=False)
                self.input_file.file_path = file_path

        try:
            with self.__redirect_stdout():
                matched = False
                for source, parser in mwcp.iter_parsers(name):
                    matched = True
                    try:
                        parser.parse(self.input_file, self)
                    except (Exception, SystemExit):
                        # Identify the input by path when we have one, else by its md5.
                        message = "Error running parser {}:{} on {}".format(
                            source.name, parser.name, file_path or self.input_file.md5)
                        logger.exception(message)

                if not matched:
                    logger.error("Could not find parsers with name: {}".format(name))
        finally:
            # Always run cleanup, even if redirecting stdout or iterating parsers failed.
            self.__cleanup()
    def get_results_filepath(self, name, source=None):
        """
        Build the results file path for the first parser matching ``name``.

        ``self.results_dir`` is used when set; otherwise the file is placed in the
        "tests" folder of the parser's source.

        :raises ValueError: if no parser matches ``name``
        """
        for parser_source, parser in mwcp.iter_parsers(name, source=source):
            file_name = parser.name + FILE_EXTENSION

            # An explicitly requested results dir wins over everything else.
            if self.results_dir:
                return os.path.join(self.results_dir, file_name)

            # Package sources are resolved as a resource, directory sources as a plain subfolder.
            test_dir = (
                pkg_resources.resource_filename(parser_source.path, 'tests')
                if parser_source.is_pkg
                else os.path.join(parser_source.path, 'tests')
            )
            return os.path.normpath(os.path.join(test_dir, file_name))

        raise ValueError('Invalid parser: {}'.format(name))
Exemple #16
0
    def test_cases(self):
        """
        Return the list of test cases, building and caching it on first use.

        A parser whose results file is missing, or a name matching no parser at all,
        is recorded as a failed result in ``self._results``.
        """
        if self._test_cases is not None:
            return self._test_cases

        self._test_cases = []
        for parser_name in self.parser_names:
            # parser_name may match parsers from several sources, so walk all of them.
            matched = False
            for name, source, _ in mwcp.iter_parsers(parser_name):
                matched = True
                qualified_name = '{}:{}'.format(source, name)
                results_file_path = self.get_results_filepath(qualified_name)

                if not os.path.isfile(results_file_path):
                    # Add a failed result if the test case is missing.
                    missing = TestResult(
                        parser=qualified_name,
                        passed=False,
                        errors=[
                            'Test case file not found: {}'.format(results_file_path)
                        ],
                    )
                    self._results.append(missing)
                    continue

                for expected_results in self.parse_results_file(results_file_path):
                    case = TestCase(
                        self.reporter,
                        qualified_name,
                        expected_results,
                        field_names=self.field_names,
                        ignore_field_names=self.ignore_field_names,
                    )
                    self._test_cases.append(case)

            if parser_name and not matched:
                # A named parser that matched nothing is recorded as a failed result.
                orphan = TestResult(
                    parser=parser_name, passed=False, errors=['Parser not found.'])
                self._results.append(orphan)
        return self._test_cases
Exemple #17
0
    def get_results_filepath(self, name, source=None):
        """
        Build the results file path for the first parser matching ``name``.

        ``self.results_dir`` is used when set; otherwise the file is placed in the
        "tests" folder of the parser's source.

        :raises ValueError: if no parser matches ``name``
        """
        # Only the first match is ever used.
        first_match = next(iter(mwcp.iter_parsers(name, source=source)), None)
        if first_match is None:
            raise ValueError('Invalid parser: {}'.format(name))

        parser_source, parser = first_match
        file_name = parser.name + FILE_EXTENSION

        # An explicitly requested results dir wins over everything else.
        if self.results_dir:
            return os.path.join(self.results_dir, file_name)

        if parser_source.is_pkg:
            # Package source: resolve "tests" as a resource of the package.
            test_dir = pkg_resources.resource_filename(parser_source.path, 'tests')
        else:
            # Directory source: assume a "tests" folder inside it.
            test_dir = os.path.join(parser_source.path, 'tests')

        return os.path.normpath(os.path.join(test_dir, file_name))
Exemple #18
0
    def get_results_filepath(self, name, source=None):
        """
        Build the results file path for the first parser matching ``name``.

        The "TESTCASE_DIR" config value is used when set; otherwise the file is placed
        in the "tests" folder of the parser's source.

        :raises ValueError: if no parser matches ``name``
        """
        for parser_source, parser in mwcp.iter_parsers(name, source=source):
            file_name = parser.name + FILE_EXTENSION

            # A configured testcase directory wins over everything else.
            configured_dir = mwcp.config.get("TESTCASE_DIR")
            if configured_dir:
                return os.path.join(configured_dir, file_name)

            if not parser_source.is_pkg:
                # Directory source: assume a "tests" folder inside it.
                test_dir = os.path.join(parser_source.path, "tests")
            else:
                # Package source: resolve "tests" as a resource of the package.
                test_dir = pkg_resources.resource_filename(parser_source.path, "tests")

            candidate = os.path.join(test_dir, file_name)
            return os.path.normpath(candidate)

        raise ValueError("Invalid parser: {}".format(name))
Exemple #19
0
    def __init__(
        self,
        parserdir=None,
        outputdir=None,
        tempdir=None,
        outputfile_prefix=None,
        interpreter_path=None,
        disabledebug=False,
        disableoutputfiles=False,
        disabletempcleanup=False,
        disableautosubfieldparsing=False,
        disablevaluededup=False,
        disablemodulesearch=False,
        base64outputfiles=False,
    ):
        """
        Initializes the Reporter object

        Besides storing the options below, this registers the parser directory when
        needed (see the note in the body) and loads the field definitions from the
        fields.json file located next to ``mwcp.resources``.

        :param str parserdir: sets parser directory (defaults to ``self.DEFAULT_PARSERDIR``)
        :param str tempdir: sets path to temporary directory (defaults to ``tempfile.gettempdir()``)
        :param str outputdir:
            sets directory for output_file(). Should not be written to (or read from) by parsers
            directly (use tempdir)
        :param str outputfile_prefix:
            sets prefix for output files written to outputdir. Special value "md5" causes prefix
            by md5 of the input file.
        :param str interpreter_path: overrides value returned by interpreter_path()
        :param bool disabledebug: disable inclusion of debug messages in output
        :param bool disableoutputfiles: disable writing of files to filesystem
        :param bool disabletempcleanup: disable cleanup (deletion) of temp files
        :param bool disableautosubfieldparsing: disable parsing of metadata item of subfields
        :param bool disablevaluededup: disable deduplication of metadata items
        :param bool disablemodulesearch: disable search of modules for parsers, only look in parsers directory
        :param bool base64outputfiles:
            stored as-is on the instance; presumably controls base64 encoding of output
            files (TODO confirm against the code that reads ``_base64_output_files``)
        """

        # defaults
        self.tempdir = tempdir or tempfile.gettempdir()
        self.outputfiles = {}
        self._handle = None
        # Placeholder field definitions; replaced by the contents of fields.json
        # at the end of this method.
        self.fields = {
            "debug": {
                "description": "debug",
                "type": "listofstrings"
            }
        }
        self.metadata = {}
        self.errors = []
        self.input_file = None

        # Continue to allow use of deprecated resourcedir.
        # TODO: Remove this in a new release version.
        self._resourcedir = None
        self.resourcedir = os.path.dirname(resources.__file__)

        self.__managed_tempdir = None
        # Missing outputdir / prefix are normalised to empty strings.
        self.__outputdir = outputdir or ''
        self.__outputfile_prefix = outputfile_prefix or ''

        # Register parsers from given directory.
        # Only register if a custom parserdir was provided or MWCP's entry_points did not get registered because
        # the project was not installed with setuptools.
        # NOTE: This is all to keep backwards compatibility. mwcp.register_parser_directory() should be
        # called outside of this class in the future.
        self.parserdir = parserdir or self.DEFAULT_PARSERDIR
        if self.parserdir != self.DEFAULT_PARSERDIR or not any(
                mwcp.iter_parsers(source='mwcp')):
            mwcp.register_parser_directory(self.parserdir)

        # Keep the option flags on the instance for later use.
        self._interpreter_path = interpreter_path
        self._disable_debug = disabledebug
        self._disable_output_files = disableoutputfiles
        self._disable_temp_cleanup = disabletempcleanup
        self._disable_auto_subfield_parsing = disableautosubfieldparsing
        self._disable_value_dedup = disablevaluededup
        self._disable_module_search = disablemodulesearch
        self._base64_output_files = base64outputfiles

        # TODO: Move fields.json to shared data or config folder.
        fieldspath = os.path.join(os.path.dirname(mwcp.resources.__file__),
                                  "fields.json")

        # Load the real field definitions, overwriting the placeholder set above.
        with open(fieldspath, 'rb') as f:
            self.fields = json.load(f)
Exemple #20
0
    def run_tests(self, parser_names=None, field_names=None, ignore_field_names=DEFAULT_EXCLUDE_FIELDS):
        """
        Run tests and compare produced results to expected results.

        Arguments:
            parser_names (list):
                A list of parser names to run tests for. If the list is empty (default),
                then test cases for all parsers will be run.
            field_names (list):
                A restricted list of fields (metadata key values) that should be compared
                during testing. If the list is empty (default), then all fields, except those in
                ignore_field_names will be compared.
            ignore_field_names (list):
                Fields to leave out of the comparison; passed through unchanged to
                get_test_results() / multiproc_test_wrapper().

        Returns:
            list: the per-parser result lists flattened into a single list. Empty when no
            results file was found for any requested parser.
        """
        if not field_names:
            field_names = []

        # Determine files to test (this will be a list of JSON files). If no parser name(s) is specified, run
        # all tests.
        if not parser_names:
            parser_names = [None]

        test_case_file_paths = []
        for parser_name in parser_names:
            # We want to iterate parsers in case parser_name represents a set of parsers from different sources.
            found = False
            for name, source, _ in mwcp.iter_parsers(parser_name):
                found = True
                # Use a separate name so the requested parser_name is not overwritten mid-loop.
                full_parser_name = '{}:{}'.format(source, name)
                results_file_path = self.get_results_filepath(full_parser_name)
                if os.path.isfile(results_file_path):
                    test_case_file_paths.append((full_parser_name, results_file_path))
                else:
                    print("Results file not found for {} parser".format(full_parser_name).encode(**encode_params))
                    print("File(s) not found = {}".format(results_file_path).encode(**encode_params))

            if not found:
                print("Parser not found for: {}".format(parser_name).encode(**encode_params))

        if not test_case_file_paths:
            # Nothing to run; do not spin up a worker pool for no work.
            return []

        if len(test_case_file_paths) == 1:
            # A single results file is run in-process; a pool would only add overhead.
            parser_name, results_file_path = test_case_file_paths[0]
            res_list = [self.get_test_results(parser_name, results_file_path, field_names, ignore_field_names)]
        else:
            # Use at most 3/4 of available logical cores, but never fewer than one worker:
            # (3 * 1) // 4 == 0 on a single-core machine and mp.Pool rejects processes=0.
            # Adjust fraction as needed.
            procs = max(1, (3 * mp.cpu_count()) // 4)

            # When creating multiprocessing pool we need to re-register the parser_directory because
            # global variables don't stick with Windows processes.
            pool = mp.Pool(processes=procs,
                           initializer=mwcp.register_parser_directory,
                           initargs=(self.reporter.parserdir,))
            try:
                # Feed each parser's test case(s) into the process pool.
                multi_res = [
                    pool.apply_async(
                        multiproc_test_wrapper,
                        (self, parser_name, results_file_path, field_names, ignore_field_names))
                    for parser_name, results_file_path in test_case_file_paths
                ]

                # Very generous 1 hour timeout for each job.
                res_list = [res.get(timeout=3600) for res in multi_res]
            finally:
                # All results are collected (or a job failed / timed out): release the
                # workers instead of leaving them to the garbage collector.
                pool.terminate()
                pool.join()

        # Flatten the list of lists and return
        return list(itertools.chain.from_iterable(res_list))
Exemple #21
0
    def run_tests(self, parser_names=None, field_names=None, ignore_field_names=DEFAULT_EXCLUDE_FIELDS, nprocs=None):
        """
        Run tests and compare produced results to expected results.

        This is a generator: nothing runs until it is iterated.

        Arguments:
            parser_names (list):
                A list of parser names to run tests for. If the list is empty (default),
                then test cases for all parsers will be run.
            field_names (list):
                A restricted list of fields (metadata key values) that should be compared
                during testing. If the list is empty (default), then all fields, except those in
                ignore_field_names will be compared.
            ignore_field_names (list):
                Fields to leave out of the comparison; passed through unchanged to
                multiproc_test_wrapper().
            nprocs (int):
                Number of worker processes. When not given (or 0), 3/4 of the logical
                cores are used, with a minimum of one worker.

        Yields:
            tuple: ``(results, test_info)`` roughly in completion order, where ``test_info``
            is a dict with pre-padded 'finished', 'total', 'parser' and 'filename' strings
            plus the 'run_time' taken from the results.
        """
        if not field_names:
            field_names = []

        # Determine files to test (this will be a list of JSON files). If no parser name(s) is specified, run
        # all tests.
        if not parser_names:
            parser_names = [None]

        test_case_file_paths = []
        for parser_name in parser_names:
            # We want to iterate parsers in case parser_name represents a set of parsers from different sources.
            found = False
            for name, source, _ in mwcp.iter_parsers(parser_name):
                found = True
                # Use a separate name so the requested parser_name is not overwritten mid-loop.
                full_parser_name = '{}:{}'.format(source, name)
                results_file_path = self.get_results_filepath(full_parser_name)
                if os.path.isfile(results_file_path):
                    test_case_file_paths.append((full_parser_name, results_file_path))
                else:
                    print("Results file not found for {} parser".format(full_parser_name).encode(**encode_params))
                    print("File(s) not found = {}".format(results_file_path).encode(**encode_params))

            if not found:
                print("Parser not found for: {}".format(parser_name).encode(**encode_params))

        cores = mp.cpu_count()
        # (3 * 1) // 4 == 0 on a single-core machine, so clamp the default to one worker.
        procs = nprocs or max(1, (3 * cores) // 4)
        pool = multi_proc.TPool(
            processes=procs, initializer=mwcp.register_parser_directory,
            initargs=(self.reporter.parserdir,))

        tests = []
        # Just for nicer formatting...
        parser_len = 0
        filename_len = 0
        # Parse test case/results files, run tests, and compare expected results to produced results
        for parser_name, results_file_path in test_case_file_paths:
            results_data = self.parse_results_file(results_file_path)

            for result_data in results_data:
                parser_len = max(parser_len, len(os.path.basename(parser_name)))
                filename_len = max(filename_len, len(os.path.basename(result_data[self.INPUT_FILE_PATH])))
                tests.append((self, result_data, parser_name, field_names, ignore_field_names))

        # While the tests will start in the order they were added, they will be yielded roughly in the
        # order they complete.
        test_iter = pool.imap_unordered(multiproc_test_wrapper, tests)
        pool.close()

        finished_tests = 0
        digits = len(str(len(tests)))
        # The total never changes, so format it once instead of on every result.
        total = str(len(tests)).zfill(digits)

        try:
            for results in test_iter:
                # Add an info dict to the returned results
                # Built with formatting here since we have knowledge of all test cases
                finished_tests += 1
                test_info = {
                    'finished': str(finished_tests).zfill(digits),
                    'total': total,
                    'parser': os.path.basename(results.parser).ljust(parser_len),
                    'filename': os.path.basename(results.input_file_path).ljust(filename_len),
                    'run_time': results.run_time
                }
                yield results, test_info
        except KeyboardInterrupt:
            # Stop outstanding workers right away when the user interrupts the run.
            pool.terminate()
            raise
Exemple #22
0
def test_external_source(make_sample_parser):
    """
    Checks that a parser group can pull in parsers registered under another source.

    Two sample parser directories are registered under the source names "acme" and
    "acme2". The "acme2" config references "acme" parsers with the "source:Name"
    notation, and also defines a group named "Sample" that refers to "acme:Sample".
    """
    registry.clear()

    acme_parser_path, acme_config = make_sample_parser("acme")
    acme_dir = str(acme_parser_path.dirname)

    # Module code and parser config handed to the fixture for the second source.
    decoy_code = u'''
from mwcp import Parser

class Decoy(Parser):
    DESCRIPTION = "TestParser2 Decoy"
        '''
    acme2_config_text = r'''
Sample2:
    description: Another test parser
    author: Mrs. Tester
    parsers:
        - .Decoy
        - acme:Sample.Downloader  # imports individual component
        - acme:Sample             # imports parser group
      
Sample:
    description: Another test parser
    author: Mrs. Tester
    parsers:
        - Sample2.Decoy
        - acme:Sample
        
        '''
    acme2_parser_path, acme2_config = make_sample_parser(
        "acme2",
        parser_name="Sample2",
        parser_code=decoy_code,
        config_text=acme2_config_text,
    )
    acme2_dir = str(acme2_parser_path.dirname)

    # Register both directories, "acme" first and "acme2" second.
    registrations = (
        (acme_dir, acme_config, "acme"),
        (acme2_dir, acme2_config, "acme2"),
    )
    for directory, config_file, source_name in registrations:
        mwcp.register_parser_directory(directory,
                                       config_file_path=str(config_file),
                                       source_name=source_name)

    # "Sample2" should resolve to a single group whose sub-parsers are its own Decoy
    # followed by the two entries imported from "acme".
    sample2_matches = list(mwcp.iter_parsers("Sample2"))
    assert len(sample2_matches) == 1
    sample2_group = sample2_matches[0][1]
    assert len(sample2_group.parsers) == 3
    expected_sample2 = [
        ("Sample2.Decoy", "acme2"),
        ("Sample.Downloader", "acme"),
        ("Sample", "acme"),
    ]
    assert [(sub.name, sub.source) for sub in sample2_group.parsers] == expected_sample2

    # The "acme2" group named "Sample" references "acme:Sample"; looking it up must
    # not end in a recursion error even though both share the same name.
    sample_matches = list(mwcp.iter_parsers("Sample", source="acme2"))
    assert len(sample_matches) == 1
    sample_group = sample_matches[0][1]
    assert len(sample_group.parsers) == 2
    expected_sample = [
        ("Sample2.Decoy", "acme2"),
        ("Sample", "acme"),
    ]
    assert [(sub.name, sub.source) for sub in sample_group.parsers] == expected_sample