コード例 #1
0
def test_iter_scrapers(mimetype, version, scraper_classes):
    """
    Check which scraper classes are discovered for a file type.

    Discovery is exercised twice: once with the default arguments and once
    with ``False`` passed as the third argument to ``iter_scrapers``
    (presumably ``check_wellformed`` -- confirm against its signature).

    :mimetype: Detected mimetype
    :version: Detected file format version
    :scraper_classes: Names of the Scraper classes expected from the
                      default discovery
    """
    found = {cls.__name__ for cls in iter_scrapers(mimetype, version)}
    assert found == set(scraper_classes)

    # For the second discovery, two scrapers are expected under a
    # different class name.
    substitutes = {
        "TextEncodingScraper": "TextEncodingMetaScraper",
        "WarctoolsFullScraper": "WarctoolsScraper",
    }
    expected = [substitutes.get(name, name) for name in scraper_classes]

    pdfa_versions = [
        "A-1a", "A-1b", "A-2a", "A-2b", "A-2u", "A-3a", "A-3b", "A-3u",
    ]
    needs_detected_metadata = (
        mimetype in ["application/x-spss-por", "text/html", "text/xml"]
        or (mimetype == "application/pdf" and version in pdfa_versions)
    )
    if needs_detected_metadata:
        expected.append("DetectedMimeVersionMetadataScraper")

    discovered = iter_scrapers(mimetype, version, False)
    expected_names = set(expected) - set(WELLFORMED_SCRAPERS)
    # For these two MIME types only the ScraperNotFound class is expected.
    if mimetype in ("application/gzip", "image/x-dpx"):
        expected_names = {"ScraperNotFound"}
    assert {cls.__name__ for cls in discovered} == expected_names
コード例 #2
0
ファイル: scraper.py プロジェクト: rasek-sls/file-scraper
    def scrape(self, check_wellformed=True):
        """Scrape the file and gather its metadata.

        Results from any earlier run are reset first. Scraping stops early
        when the FileExists scraper reports the file as not well-formed.

        :check_wellformed: True, full scraping; False, skip well-formed check.
        """
        # Reset the state of a previous scrape.
        self.streams, self.info, self.well_formed = None, {}, None

        existence_check = FileExists(self.filename, None)
        self._scrape_file(existence_check)
        if existence_check.well_formed is False:
            # FileExists reported a failure; nothing more is scraped.
            return

        self._identify()
        matching_classes = iter_scrapers(
            mimetype=self.mimetype,
            version=self.version,
            check_wellformed=check_wellformed,
            params=self._params,
        )
        for cls in matching_classes:
            self._scrape_file(
                cls(self.filename, self.mimetype, check_wellformed,
                    self._params))

        self._check_utf8(check_wellformed)
        self._check_mimetype_version()
コード例 #3
0
    def scrape(self, check_wellformed=True):
        """Scrape the file and gather its metadata.

        The file type is detected first. Every scraper class found for the
        predefined MIME type and version is then run, the results are
        merged, and ``mimetype`` and ``version`` are taken from the first
        stream.

        :check_wellformed: True, full scraping; False, skip well-formed check.
        """
        self.detect_filetype()

        # Stop early when the MIME type could not be determined or an
        # error occurred during the detection process.
        detection_failed = (not self._predefined_mimetype
                            or self.well_formed is False)
        if detection_failed:
            self.streams = {}
            return

        matching_classes = iter_scrapers(
            mimetype=self._predefined_mimetype,
            version=self._predefined_version,
            check_wellformed=check_wellformed,
            params=self._params,
        )
        for cls in matching_classes:
            self._scrape_file(
                cls(filename=self.filename,
                    mimetype=self._predefined_mimetype,
                    version=self._predefined_version,
                    params=self._params),
                check_wellformed)
        self._params["scraper_results"] = self._scraper_results

        self._merge_results(check_wellformed)
        self._check_utf8(check_wellformed)

        # The first stream provides the final MIME type and version.
        self.mimetype = self.streams[0]["mimetype"]
        self.version = self.streams[0]["version"]

        self._check_mime(check_wellformed)
コード例 #4
0
def test_scrape_valid_file(filename, mimetype):
    """Test scraping for a well-formed odt file.

    Every scraper class discovered for the MIME type must report the file
    as well-formed after scraping it.

    :filename: Name of the test file, located in the BASEPATH subdirectory
               named after the MIME type with '/' replaced by '_'
    :mimetype: MIME type used to discover and construct the scrapers
    """
    path = os.path.join(BASEPATH, mimetype.replace('/', '_'), filename)
    for scraper_class in iter_scrapers(mimetype, None):
        scraper = scraper_class(path, mimetype)
        scraper.scrape_file()
        assert scraper.well_formed
コード例 #5
0
def test_scrape_invalid_file(filename, mimetype):
    """Test scraping for non well-formed odt file.

    Every scraper class discovered for the MIME type is run against the
    file, and at least one of them must not report it as well-formed.

    :filename: Name of the test file in the
               application_vnd.oasis.opendocument.text directory under
               BASEPATH
    :mimetype: MIME type used to discover and construct the scrapers
    """
    path = os.path.join(
        BASEPATH, "application_vnd.oasis.opendocument.text", filename)

    scraper_results = []
    for class_ in iter_scrapers(mimetype, None):
        scraper = class_(path, mimetype)
        # Run the scraper before reading the result; otherwise well_formed
        # is read from a freshly constructed, unscraped object and the
        # assertion below says nothing about the file.
        scraper.scrape_file()
        scraper_results.append(scraper.well_formed)

    # At least one scraper must have been discovered and run.
    assert scraper_results
    assert not all(scraper_results)
コード例 #6
0
    def scrape(self, check_wellformed=True):
        """Scrape the file and gather its metadata.

        The file type is detected first. Every scraper class found for the
        detected MIME type and version is then run, and the stream metadata
        is generated from the collected scraper results.

        :check_wellformed: True, full scraping; False, skip well-formed check.
        """
        self.detect_filetype()

        if not self.mimetype:
            # File not found or MIME type could not be determined
            self.streams = {}
            return

        # Store the detected MIME type in the shared scraper parameters.
        self._params["mimetype_guess"] = self.mimetype
        matching_classes = iter_scrapers(
            mimetype=self.mimetype,
            version=self.version,
            check_wellformed=check_wellformed,
            params=self._params,
        )
        for cls in matching_classes:
            self._scrape_file(
                cls(self.filename, check_wellformed, self._params))

        self.streams = generate_metadata_dict(self._scraper_results, LOSE)
        self._check_utf8(check_wellformed)
        self._check_mimetype_version()
コード例 #7
0
def test_iter_scrapers(mimetype, version, scraper_classes):
    """Check that scraper discovery yields the expected classes.

    :mimetype: MIME type passed to the discovery
    :version: File format version passed to the discovery
    :scraper_classes: Names of the scraper classes expected to be found
    """
    found = {cls.__name__ for cls in iter_scrapers(mimetype, version)}
    assert found == set(scraper_classes)