def test_invalid_combined(fullname, mimetype, version):
    """Integration test for all invalid files.

    Verifies that ``well_formed`` is falsy (False or None) and that the
    reported MIME type matches the expected one (or is (:unav) for the
    known special cases). Files on the invalid-ignore list, empty files
    (detected as inode/x-empty without any scraper), and files whose
    MIME type mismatches while no scraper was found are skipped.
    """
    if "empty" in fullname or fullname in IGNORE_INVALID:
        pytest.skip("[%s] has empty or in invalid ignore" % fullname)

    scraper = Scraper(
        fullname,
        mimetype=GIVEN_MIMETYPES.get(fullname, None),
        charset=GIVEN_CHARSETS.get(fullname, None))
    scraper.scrape()

    scraper_missing = any(info["class"] == "ScraperNotFound"
                          for info in scraper.info.values())
    if scraper_missing and scraper.mimetype != mimetype:
        pytest.skip(("[%s] mimetype mismatches with scraper "
                     "and scraper not found") % fullname)

    # well_formed must be either False or None for an invalid file.
    assert not scraper.well_formed
    assert scraper.mimetype == mimetype or (
        fullname in UNAV_MIMETYPE_INVALID and scraper.mimetype == UNAV)
def test_detect_filetype(filename, params, expected_results):
    """Test running only the filetype detection.

    Filetype detection must fill in mimetype and version (when the
    detectors can provide them) while leaving well_formed and streams as
    None. Info must contain some entries, but their contents are not
    inspected. The same results must be returned even if a full scrape
    was run first, since detection resets earlier results.

    :filename: Test file name
    :params: Parameters for Scraper
    :expected_results: Expected results, containing expected values of
                       Scraper attributes
    """
    def _detect_and_check(target):
        # Run detection and verify attributes; streams must stay None
        # and info must not be empty.
        target.detect_filetype()
        for field, value in expected_results.items():
            assert getattr(target, field) == value
        assert target.streams is None
        assert target.info

    # Filetype detection should work without scraping.
    scraper = Scraper(filename, **params)
    _detect_and_check(scraper)

    # Even if scraping has been done previously, detection should erase
    # all streams and other information.
    scraper.scrape()
    _detect_and_check(scraper)
def test_grading(fullname, mimetype, version):
    """Test grading for a valid test file.

    Test that file format is graded as recommended unless the file
    is explicitly listed as acceptable, bit-level or unacceptable.
    """
    if fullname in UNAV_VERSION:
        pytest.skip(
            "File format version of file {} can not be defined.".format(
                fullname))

    scraper = Scraper(fullname,
                      mimetype=mimetype,
                      version=version,
                      charset=GIVEN_CHARSETS.get(fullname, None))
    scraper.scrape()

    # Special-case file lists mapped to their grades; first match wins,
    # and the default is RECOMMENDED.
    special_cases = (
        (UNACCEPTABLE_FILES, UNACCEPTABLE),
        (BIT_LEVEL_FILES, BIT_LEVEL),
        (BIT_LEVEL_WITH_RECOMMENDED_FILES, BIT_LEVEL_WITH_RECOMMENDED),
        (ACCEPTABLE_FILES, ACCEPTABLE),
    )
    expected_grade = next(
        (grade for files, grade in special_cases if fullname in files),
        RECOMMENDED)

    assert scraper.grade() == expected_grade
Ejemplo n.º 4
0
def test_without_wellformed(fullname, mimetype):
    """Test the case where metadata is collected without well-formedness check.
    - Test that well-formed is always None.
    - Test that mimetype matches.
    - Test that there exists correct stream type for image, video, audio
      and text.
    - Test a random element existence for image, video, audio and text.
    """
    if fullname in IGNORE_FOR_METADATA:
        pytest.skip('[%s] in ignore' % fullname)

    scraper = Scraper(fullname)
    scraper.scrape(False)

    _assert_valid_scraper_result(scraper, fullname, mimetype, False)

    # The major MIME type must show up in the first stream's type for
    # the common stream categories.
    mimepart = mimetype.split("/")[0]
    if mimepart in ['image', 'video', 'text', 'audio']:
        assert mimepart in scraper.streams[0]['stream_type']

    # One representative metadata key expected for each stream type.
    representative_key = {
        'image': 'colorspace',
        'video': 'color',
        'videocontainer': 'codec_name',
        'text': 'charset',
        'audio': 'num_channels'
    }
    for stream in scraper.streams.values():
        assert stream['stream_type'] is not None
        expected_key = representative_key.get(stream['stream_type'])
        if expected_key is not None:
            assert expected_key in stream

    if 'text/csv' in mimetype:
        assert 'delimiter' in scraper.streams[0]
Ejemplo n.º 5
0
def test_without_wellformed(fullname, mimetype):
    """Test the case where metadata is collected without well-formedness check.
    - Test that well-formed is always None.
    - Test that mimetype matches.
    - Test that there exists correct stream type for image, video, audio
      and text.
    - Test a random element existence for image, video, audio and text.
    """
    if fullname in IGNORE_FOR_METADATA:
        pytest.skip("[%s] in ignore" % fullname)

    scraper = Scraper(fullname)
    scraper.scrape(False)

    _assert_valid_scraper_result(scraper, fullname, mimetype, False)

    # The major MIME type must appear in the first stream's type for the
    # common stream categories.
    mimepart = mimetype.split("/")[0]
    assert mimepart not in ["image", "video", "text", "audio"] or \
        mimepart in scraper.streams[0]["stream_type"]

    # Representative metadata key expected per stream type.
    expected_keys = {
        "image": "colorspace",
        "video": "color",
        "videocontainer": "codec_name",
        "text": "charset",
        "audio": "num_channels"
    }

    for stream in scraper.streams.values():
        stream_type = stream["stream_type"]
        assert stream_type is not None
        if stream_type in expected_keys:
            assert expected_keys[stream_type] in stream

    if "text/csv" in mimetype:
        assert "delimiter" in scraper.streams[0]
Ejemplo n.º 6
0
def _validate_file(file_, cache_path, errors):
    """Validate file using file-scraper.

    :param file_: file metadata dictionary; must contain "identifier"
                  and "file_characteristics"
    :param cache_path: Path to the file_cache
    :param errors: array to store identifiers of non-valid files
    :returns: None
    """
    identifier = file_["identifier"]
    characteristics = file_["file_characteristics"]

    scraper = Scraper(
        os.path.join(cache_path, identifier),
        mimetype=characteristics["file_format"],
        charset=characteristics.get("encoding", None),
        version=characteristics.get("format_version", None)
    )
    scraper.scrape(check_wellformed=True)
    if not scraper.well_formed:
        errors.append(identifier)

    # Drop the scraper reference right away to free its resources.
    del scraper
Ejemplo n.º 7
0
def scrape_file(filename, filerel=None, workspace=None):
    """Return already existing scraping result or create a new one, if
    missing.

    Looks up a previously pickled scraping result via the workspace's
    md-references.xml; falls back to scraping the file without the
    well-formedness check.
    """
    if filerel is None:
        filerel = filename

    # Locate the metadata reference file from the workspace, if any.
    ref = None
    if workspace is not None:
        candidate = os.path.join(workspace, 'md-references.xml')
        if os.path.isfile(candidate):
            ref = candidate

    if ref is not None:
        root = lxml.etree.parse(ref).getroot()
        amdref = root.xpath("/mdReferences/mdReference[not(@stream) "
                            "and @file='%s']" % fsdecode_path(filerel))
        if amdref:
            # Cached result: unpickle and return the stored streams.
            pkl_name = os.path.join(
                workspace, '{}-scraper.pkl'.format(amdref[0].text[1:]))
            if os.path.isfile(pkl_name):
                with open(pkl_name, 'rb') as pkl_file:
                    return pickle.load(pkl_file)

    scraper = Scraper(filename)
    scraper.scrape(False)
    return scraper.streams
Ejemplo n.º 8
0
def test_missing_scraper(fullname, mimetype):
    """Integration test with missing scraper.
    - Scraper is missing for the HTML files due to missing doctype.
    """
    scraper = Scraper(fullname)
    scraper.scrape()
    # The last info entry (info is keyed by consecutive integers) must
    # be the "not found" marker, and well-formedness stays undefined.
    last_info = scraper.info[len(scraper.info) - 1]
    assert last_info['class'] == 'ScraperNotFound'
    assert scraper.well_formed is None
Ejemplo n.º 9
0
def test_missing_file():
    """Test missing file."""
    # Neither a nonexistent path nor None may be reported well-formed.
    for target in ("missing_file", None):
        scraper = Scraper(target)
        scraper.scrape()
        assert not scraper.well_formed
Ejemplo n.º 10
0
def check_well_formed(metadata_info, catalog_path):
    """
    Check if file is well formed. If mets specifies an alternative format or
    scraper identifies the file as something else than what is given in mets,
    add a message specifying the alternative mimetype and version. Validate
    file as the mimetype given in mets.

    :param metadata_info: Dictionary containing metadata parsed from mets.
    :param catalog_path: Schema XML catalog path to pass to file-scraper.
    :returns: Tuple with 2 dicts: (result_dict, scraper.streams)
    """
    messages = []
    valid_only_messages = []
    mets_format = metadata_info['format']
    md_mimetype = mets_format['mimetype']
    md_version = mets_format['version']

    if 'alt-format' in mets_format:
        # METS declares an alternative format: always validate as the
        # METS-given type.
        messages.append(
            append_format_info('METS alternative ',
                               mets_format['alt-format']))
        force_mimetype = True
    else:
        # Run detection only; force the METS type if detection differs.
        detector = Scraper(metadata_info['filename'])
        (mime, version) = detector.detect_filetype()
        force_mimetype = mime != md_mimetype or version != md_version
        if force_mimetype:
            messages.append(append_format_info('Detected ', mime, version))

    scraper_mimetype = None
    scraper_version = None
    if force_mimetype:
        scraper_mimetype = md_mimetype
        scraper_version = md_version
        messages.append(append_format_info('METS ', md_mimetype, md_version))
        messages.append(
            append_format_info('Validating as ', md_mimetype, md_version))
        valid_only_messages.append(
            append_format_info('The digital object will be preserved as ',
                               md_mimetype, md_version))

    scraper = Scraper(metadata_info['filename'],
                      mimetype=scraper_mimetype,
                      version=scraper_version,
                      catalog_path=catalog_path,
                      **create_scraper_params(metadata_info))
    scraper.scrape()

    scraper_info = get_scraper_info(scraper)
    messages.extend(scraper_info['messages'])
    result = make_result_dict(is_valid=scraper.well_formed,
                              messages=messages,
                              errors=scraper_info['errors'],
                              extensions=scraper_info['extensions'],
                              valid_only_messages=valid_only_messages)
    return (result, scraper.streams)
def test_grade(file_path, expected_grade):
    """Test that scraper returns correct digital preservation grade."""
    scraper = Scraper(file_path)

    # Grading is not possible before scraping has been run.
    assert scraper.grade() == "(:unav)"

    scraper.scrape()
    # After scraping, the file must have the expected grade.
    assert scraper.grade() == expected_grade
Ejemplo n.º 12
0
def scrape_file(filepath,
                filerel=None,
                workspace=None,
                mimetype=None,
                version=None,
                charset=None,
                skip_well_check=False,
                skip_json=False):
    """
    Return already existing scraping result or create a new one, if
    missing.

    :filepath: Digital object path
    :filerel: Digital object path relative to base path
    :workspace: Workspace path
    :mimetype: MIME type of digital object
    :version: File format version of digital object
    :charset: Encoding of digital object (if text file)
    :skip_well_check: True skips well-formedness checking
    :skip_json: True does scraping and does not try to find JSON file
    :returns: Metadata dict of streams and scraper info as a tuple
    :raises: ValueError If metadata collecting fails.
             IOError If file does not exist.
    """
    filerel = filepath if filerel is None else filerel
    if not skip_json:
        streams = read_json_streams(filerel, workspace)
        if streams is not None:
            return (streams, None)

    scraper = Scraper(filepath,
                      mimetype=mimetype,
                      version=version,
                      charset=charset)
    scraper.scrape(not skip_well_check)

    # Report a missing file as IOError (as documented) before the
    # well-formedness check, which would otherwise mask it behind a
    # generic ValueError.
    if scraper.info[0]['class'] == 'FileExists' and scraper.info[0]['errors']:
        raise IOError(scraper.info[0]['errors'])

    if scraper.well_formed is False:  # Must not be None
        # Collect only non-empty error lists so the combined message
        # does not contain blank lines.
        errors = ["\n".join(info['errors'])
                  for _, info in six.iteritems(scraper.info)
                  if info['errors']]
        error_str = "\n".join(errors)
        if skip_well_check:
            error_head = "Metadata of file %s could not " \
                         "be collected due to errors.\n" % filepath
            error_str = error_head + error_str
        raise ValueError(six.text_type(error_str))

    for _, info in six.iteritems(scraper.info):
        if info['class'] == 'ScraperNotFound':
            raise ValueError('File format is not supported.')

    return (scraper.streams, scraper.info)
def test_missing_file():
    """Test missing file."""
    # Both a nonexistent path and None must fail with only the
    # existence check recorded in info.
    for path in ("missing_file", None):
        scraper = Scraper(path, mimetype="application/pdf")
        scraper.scrape()
        assert not scraper.well_formed
        assert len(scraper.info) == 1
        assert scraper.info[0]["class"] == "FileExists"
def test_without_wellformed(fullname, mimetype, version):
    """
    Test the case where metadata is collected without well-formedness check.
    - Test that well-formed is always None.
    - Test that mimetype and version matches.
    - Test that there exists correct stream type for image, video, audio
      and text.
    - Test a random element existence for image, video, audio and text.
    - Test that giving the resulted MIME type, version and charset
      produce the same results.
    """
    if fullname in IGNORE_FOR_METADATA:
        pytest.skip("[%s] in ignore" % fullname)

    scraper = Scraper(fullname,
                      mimetype=GIVEN_MIMETYPES.get(fullname, None),
                      charset=GIVEN_CHARSETS.get(fullname, None))
    scraper.scrape(False)

    _assert_valid_scraper_result(scraper, fullname, mimetype, version, None)

    # The major MIME type must show up in the first stream's type for
    # the common stream categories.
    mimepart = mimetype.split("/")[0]
    assert mimepart not in ["image", "video", "text", "audio"] or \
        mimepart in scraper.streams[0]["stream_type"]

    # Representative metadata element expected per stream type.
    representative_elements = {
        "image": "colorspace",
        "video": "color",
        "videocontainer": "codec_name",
        "text": "charset",
        "audio": "num_channels"
    }
    for stream in scraper.streams.values():
        stream_type = stream["stream_type"]
        assert stream_type not in [UNAV, None]
        if stream_type in representative_elements:
            assert representative_elements[stream_type] in stream

    # Scraping again with the determined MIME type, version and charset
    # must not change the output.
    given_scraper = Scraper(fullname,
                            mimetype=scraper.mimetype,
                            version=scraper.version,
                            charset=scraper.streams[0].get("charset", None))
    given_scraper.scrape(False)

    assert given_scraper.mimetype == scraper.mimetype
    assert given_scraper.version == scraper.version
    assert given_scraper.streams == scraper.streams
    assert given_scraper.well_formed == scraper.well_formed
Ejemplo n.º 15
0
def scrape_file(ctx, filename, check_wellformed, tool_info, mimetype, version):
    """
    Identify file type, collect metadata, and optionally check well-formedness.

    In addition to the given options, the user can provide any extra options
    that are passed onto the scraper. These options must be in the long form,
    e.g. "--charset=UTF-8" or "--charset UTF-8".
    \f

    :ctx: Context object
    :filename: Path to the file that should be scraped
    :check_wellformed: Flag whether the scraper checks wellformedness
    :tool_info: Flag whether the scraper includes messages from different 3rd
                party tools
    :mimetype: Specified mimetype for the scraped file
    :version: Specified version for the scraped file
    """
    scraper = Scraper(filename,
                      mimetype=mimetype,
                      version=version,
                      **_extra_options_to_dict(ctx.args))
    scraper.scrape(check_wellformed=check_wellformed)

    results = {
        "path": ensure_text(scraper.filename),
        "MIME type": ensure_text(scraper.mimetype),
        "version": ensure_text(scraper.version),
        "metadata": scraper.streams,
        "grade": scraper.grade()
    }
    if check_wellformed:
        results["well-formed"] = scraper.well_formed
    if tool_info:
        results["tool_info"] = scraper.info

    # A missing scraper means the file could not be analyzed at all.
    for item in scraper.info.values():
        if "ScraperNotFound" in item["class"]:
            raise click.ClickException("Proper scraper was not found. The "
                                       "file was not analyzed.")

    errors = {item["class"]: item["errors"]
              for item in scraper.info.values() if item["errors"]}
    if errors:
        results["errors"] = errors

    click.echo(json.dumps(results, indent=4))
def test_given_filetype(filepath, params, well_formed, expected_mimetype,
                        expected_version, expected_charset, meta_well_formed):
    """
    Test the scraping to be done as user given file type.

    MIME type and version results are checked both directly from the scraper
    and for well-formed files also from the first stream. In addition to this,
    well-formedness status of the file should be as expected.

    :filepath: Test file path
    :params: Parameters for Scraper
    :well_formed: Expected result of well-formedness
    :expected_mimetype: Expected MIME type result
    :expected_version: Expected file format version
    """
    def _scrape_and_check(check_wellformed, expected_wellformed):
        # Scrape and verify MIME type, version and charset both from the
        # scraper attributes and from the first stream.
        scraper = Scraper(filename=filepath, **params)
        scraper.scrape(check_wellformed)

        assert scraper.well_formed == expected_wellformed
        assert scraper.mimetype == expected_mimetype
        assert scraper.version == expected_version
        if expected_charset:
            assert scraper.streams[0]["charset"] == expected_charset
        else:
            assert "charset" not in scraper.streams[0]

        assert scraper.streams[0]["mimetype"] == expected_mimetype
        assert scraper.streams[0]["version"] == expected_version

    _scrape_and_check(True, well_formed)

    # Just collect metadata without well-formedness checking.
    # WARC files can not be scraped without well-formedness check.
    if expected_mimetype != "application/warc":
        _scrape_and_check(False, meta_well_formed)
Ejemplo n.º 17
0
def test_coded_filename(testpath, fullname, mimetype):
    """Integration test with unicode and utf-8 filename and with all scrapers.
    - Test that unicode filenames work with all mimetypes
    - Test that utf-8 encoded filenames work with all mimetypes
    """
    if fullname in IGNORE_VALID + ["tests/data/text_xml/valid_1.0_dtd.xml"]:
        pytest.skip("[%s] in ignore" % fullname)

    extension = fullname.rsplit(".", 1)[-1]
    unicode_name = os.path.join(testpath, "äöå.%s" % extension)
    shutil.copy(fullname, unicode_name)

    # Both the unicode path and its UTF-8 encoded form must scrape as
    # well-formed.
    for name in (unicode_name, unicode_name.encode("utf-8")):
        scraper = Scraper(name)
        scraper.scrape()
        assert scraper.well_formed
def test_charset(filepath, charset, well_formed):
    """
    Test charset parameter.

    We are able to give charset as a parameter. This tests the
    parameter with different mimetypes and charset inputs.

    :filepath: Test file path
    :charset: Given and expected character encoding of a test file
    :well_formed: Expected result of well-formedness
    """
    scraper = Scraper(filepath,
                      mimetype=GIVEN_MIMETYPES.get(filepath, None),
                      charset=charset)
    scraper.scrape()

    # The given charset must be reported back in the first stream and
    # well-formedness must match the expectation.
    assert scraper.well_formed == well_formed
    assert scraper.streams[0]["charset"] == charset
Ejemplo n.º 19
0
def test_valid_combined(fullname, mimetype):
    """Integration test for valid files.
    - Test that mimetype matches.
    - Test Find out all None elements.
    - Test that errors are not given.
    - Test that all files are well-formed.
    - Test that forcing the scraper to use the MIME type and version the file
      actually as does not affect scraping results.
    - Ignore few files because of required parameter or missing scraper.
    """
    if fullname in IGNORE_VALID:
        pytest.skip("[%s] in ignore" % fullname)

    scraper = Scraper(fullname)
    scraper.scrape()

    # No component may report errors for a valid file.
    assert all(not info["errors"] for info in scraper.info.values())

    _assert_valid_scraper_result(scraper, fullname, mimetype, True)

    # Test that output does not change if MIME type and version are forced
    # to be the ones scraper would determine them to be in any case.

    # This cannot be done with compressed arcs, as WarctoolsScraper reports
    # the MIME type of the compressed archive instead of application/gzip,
    # so for those types, all required testing is already done here.
    if (scraper.mimetype in ["application/x-internet-archive"]
            and fullname.endswith(".gz")):
        return

    # Forced version affects all frames within a gif or a tiff
    if scraper.mimetype in ["image/gif", "image/tiff"]:
        first_version = scraper.streams[0]["version"]
        for stream in scraper.streams.values():
            if "version" in stream:
                stream["version"] = first_version

    forced_scraper = Scraper(fullname,
                             mimetype=scraper.mimetype,
                             version=scraper.version)
    forced_scraper.scrape()

    assert forced_scraper.mimetype == scraper.mimetype
    assert forced_scraper.version == scraper.version
    assert forced_scraper.streams == scraper.streams
Ejemplo n.º 20
0
def test_valid_combined(fullname, mimetype):
    """Integration test for valid files.
    - Test that mimetype matches.
    - Test Find out all None elements.
    - Test that errors are not given.
    - Test that all files are well-formed.
    - Ignore few files because of required parameter or missing scraper.
    """
    if fullname in IGNORE_VALID:
        pytest.skip('[%s] in ignore' % fullname)

    scraper = Scraper(fullname)
    scraper.scrape()

    # A valid file must not produce errors from any component.
    for info in scraper.info.values():
        assert not info['errors']

    _assert_valid_scraper_result(scraper, fullname, mimetype, True)
Ejemplo n.º 21
0
def main(arguments=None):
    """Main loop.

    Parse command line options, scrape the given XML file as
    text/xml 1.0 in UTF-8, print collected messages to stdout and
    errors to stderr.

    :returns: 0 on success, 117 when errors occur or the file is not
              well-formed
    """
    catalog_dir = "/etc/xml/dpres-xml-schemas/schema_catalogs"
    schema_dir = "/etc/xml/dpres-xml-schemas/schema_catalogs/schemas"

    parser = optparse.OptionParser(
        usage="usage: %prog [options] xml-file-name")
    parser.add_option("-c", "--catalog", dest="catalogpath",
                      default=os.path.join(catalog_dir, "catalog_main.xml"),
                      help="Full path to XML catalog file",
                      metavar="FILE")
    parser.add_option("-s", "--schemapath", dest="schemapath",
                      default=os.path.join(schema_dir, "mets/mets.xsd"),
                      help="XML schema filename for validation",
                      metavar="PATH")
    (options, args) = parser.parse_args(arguments)

    if len(args) != 1:
        parser.error("Must give XML filename as argument")

    scraper = Scraper(args[0], schema=options.schemapath,
                      catalog_path=options.catalogpath,
                      mimetype="text/xml", version="1.0",
                      charset="UTF-8")
    scraper.scrape()

    info = get_scraper_info(scraper)
    messages = list(info['messages'])
    errors = list(info['errors'])

    if messages:
        print(ensure_text(concat(messages)), file=sys.stdout)
    if errors:
        print(ensure_text(concat(errors)), file=sys.stderr)

    # 117 signals a validation failure to the calling shell.
    return 117 if errors or not scraper.well_formed else 0
Ejemplo n.º 22
0
def test_forced_filetype(filepath, params, well_formed, expected_mimetype,
                         expected_version):
    """
    Test forcing the scraping to be done as specific file type.

    MIME type and version results are checked both directly from the scraper
    and for well-formed files also from the first stream. In addition to this,
    well-formedness status of the file should be as expected.
    """
    scraper = Scraper(filepath, **params)
    scraper.scrape()

    assert scraper.well_formed == well_formed
    assert (scraper.mimetype, scraper.version) == (expected_mimetype,
                                                   expected_version)

    if well_formed:
        first_stream = scraper.streams[0]
        assert first_stream["mimetype"] == expected_mimetype
        assert first_stream["version"] == expected_version
Ejemplo n.º 23
0
    def _scrape_file(self, filepath, skip_well_check):
        """Scrape file.

        :filepath: Path to file to be scraped
        :skip_well_check: True, if well-formed check is skipped
        :returns: scraper with result attributes
        :raises: ValueError if the well-formedness check fails
        """
        scraper = Scraper(filepath)
        if not skip_well_check:
            scraper.scrape(True)
            if not scraper.well_formed:
                errors = []
                for _, info in six.iteritems(scraper.info):
                    # info['errors'] is a list of strings: collect the
                    # individual messages. Appending the list itself
                    # (as before) made the join below raise TypeError.
                    if len(info['errors']) > 0:
                        errors.extend(info['errors'])
                error_str = "\n".join(errors)
                raise ValueError(error_str)
        else:
            scraper.scrape(False)

        return scraper
def test_valid_combined(fullname, mimetype, version):
    """
    Integration test for valid files.

    - Test that mimetype and version matches.
    - Test Find out all None elements.
    - Test that errors are not given.
    - Test that all files are well-formed.
    - Ignore few files because of required parameter or missing scraper.
    - Test that giving the resulted MIME type, version and charset
      produce the same results.
    """
    if fullname in IGNORE_VALID:
        pytest.skip("[%s] in ignore" % fullname)

    scraper = Scraper(fullname,
                      mimetype=GIVEN_MIMETYPES.get(fullname, None),
                      charset=GIVEN_CHARSETS.get(fullname, None))
    scraper.scrape()

    # A valid file must not produce errors from any component.
    for info in scraper.info.values():
        assert not info["errors"]

    _assert_valid_scraper_result(scraper, fullname, mimetype, version, True)

    # Scraping again with the determined MIME type, version and charset
    # must reproduce exactly the same results.
    given_scraper = Scraper(fullname,
                            mimetype=scraper.mimetype,
                            version=scraper.version,
                            charset=scraper.streams[0].get("charset", None))
    given_scraper.scrape()

    assert given_scraper.mimetype == scraper.mimetype
    assert given_scraper.version == scraper.version
    assert given_scraper.streams == scraper.streams
    assert given_scraper.well_formed == scraper.well_formed
Ejemplo n.º 25
0
    def _scrape_file(self,
                     filepath,
                     skip_well_check,
                     file_format=None,
                     charset=None):
        """Scrape file.

        :filepath: Path to file to be scraped
        :skip_well_check: True, if well-formed check is skipped
        :file_format: File format and version from the command line argument
                      parser, originally given as a value pair by the user.
                      The mimetype is in index 0 and version in index 1.
        :charset: Character encoding from arguments
        :returns: scraper with result attributes
        :raises: ValueError if the well-formedness check fails
        """
        # Unpack the user-given (mimetype, version) pair, if any.
        if file_format in [None, ()]:
            mimetype, version = None, None
        else:
            mimetype, version = file_format[0], file_format[1]

        scraper = Scraper(filepath,
                          mimetype=mimetype,
                          version=version,
                          charset=charset)
        scraper.scrape(not skip_well_check)

        if not skip_well_check and not scraper.well_formed:
            # Combine every individual error message from all components.
            errors = [error
                      for _, info in six.iteritems(scraper.info)
                      for error in info['errors']]
            raise ValueError("\n".join(errors))

        return scraper
Ejemplo n.º 26
0
def test_invalid_combined(fullname, mimetype):
    """Integration test for all invalid files.
    - Test that well_formed is False and mimetype is expected.
    - If well_formed is None, check that Scraper was not found.
    - Skip files that are known cases where it is identified
      differently (but yet correctly) than expected and would be
      well-formed.
    - Skip empty files, since those are detected as inode/x-empty
      and scraper is not found.
    """
    if 'empty' in fullname or fullname in IGNORE_INVALID:
        pytest.skip('[%s] has empty or in invalid ignore' % fullname)

    scraper = Scraper(fullname)
    scraper.scrape()

    scraper_missing = any(info['class'] == 'ScraperNotFound'
                          for info in scraper.info.values())
    if scraper_missing and scraper.mimetype != mimetype:
        pytest.skip(('[%s] mimetype mismatches with scraper '
                     'and scraper not found') % fullname)

    assert scraper.well_formed is False  # Could be also None (wrong)
    assert (scraper.mimetype == mimetype
            or fullname in DIFFERENT_MIMETYPE_INVALID)
def test_empty_file():
    """Test empty file."""
    empty_scraper = Scraper("test/data/text_plain/invalid__empty.txt")
    empty_scraper.scrape()
    # An empty file must never be reported as well-formed.
    assert not empty_scraper.well_formed