def test_iter_scrapers(mimetype, version, scraper_classes):
    """
    Test scraper discovery.

    Discovery is checked twice: once with the default arguments and once
    with ``False`` passed as the third argument of ``iter_scrapers``.

    :mimetype: Detected mimetype
    :version: Detected file format version
    :scraper_classes: Expected Scraper classes which are run
    """
    # First pass: the discovered class names must equal the expectation.
    discovered = iter_scrapers(mimetype, version)
    assert {cls.__name__ for cls in discovered} == set(scraper_classes)

    # For the second pass these two expected names are replaced by another
    # class name; all other names are kept unchanged.
    replacements = {
        "TextEncodingScraper": "TextEncodingMetaScraper",
        "WarctoolsFullScraper": "WarctoolsScraper",
    }
    expected = [replacements.get(name, name) for name in scraper_classes]

    # These file types additionally expect the
    # DetectedMimeVersionMetadataScraper in the second pass.
    simple_types = ("application/x-spss-por", "text/html", "text/xml")
    pdf_a_versions = (
        "A-1a", "A-1b", "A-2a", "A-2b", "A-2u", "A-3a", "A-3b", "A-3u",
    )
    is_pdf_a = mimetype == "application/pdf" and version in pdf_a_versions
    if mimetype in simple_types or is_pdf_a:
        expected.append("DetectedMimeVersionMetadataScraper")

    discovered = iter_scrapers(mimetype, version, False)

    # Names listed in WELLFORMED_SCRAPERS are not expected in this pass.
    expected_names = set(expected) - set(WELLFORMED_SCRAPERS)
    if mimetype in ("application/gzip", "image/x-dpx"):
        # For these two types only the ScraperNotFound class is expected.
        expected_names = {"ScraperNotFound"}

    assert {cls.__name__ for cls in discovered} == expected_names
def scrape(self, check_wellformed=True):
    """Scrape file and collect metadata.

    Resets the stored results, runs a ``FileExists`` scraper and stops if
    that one reports ``well_formed`` as ``False``. Otherwise the file is
    identified and every scraper class yielded by ``iter_scrapers`` is run.

    :check_wellformed: True, full scraping; False, skip well-formed check.
    """
    # Start from a clean state.
    self.streams = None
    self.info = {}
    self.well_formed = None

    # Run the FileExists scraper first; nothing more is done when it
    # explicitly reports False.
    existence_check = FileExists(self.filename, None)
    self._scrape_file(existence_check)
    if existence_check.well_formed is False:
        return

    self._identify()

    applicable = iter_scrapers(
        mimetype=self.mimetype,
        version=self.version,
        check_wellformed=check_wellformed,
        params=self._params,
    )
    for scraper_cls in applicable:
        self._scrape_file(
            scraper_cls(
                self.filename,
                self.mimetype,
                check_wellformed,
                self._params,
            )
        )

    # Final checks once all scrapers have been run.
    self._check_utf8(check_wellformed)
    self._check_mimetype_version()
def scrape(self, check_wellformed=True):
    """Scrape file and collect metadata.

    Detects the file type, runs every scraper class yielded by
    ``iter_scrapers`` for the predefined MIME type and version, merges the
    results and takes the final MIME type and version from ``streams[0]``.

    :check_wellformed: True, full scraping; False, skip well-formed check.
    """
    self.detect_filetype()

    # MIME type could not be determined
    # or an error occurred during the detection process
    detection_failed = (
        not self._predefined_mimetype or self.well_formed is False
    )
    if detection_failed:
        self.streams = {}
        return

    applicable = iter_scrapers(
        mimetype=self._predefined_mimetype,
        version=self._predefined_version,
        check_wellformed=check_wellformed,
        params=self._params,
    )
    for scraper_cls in applicable:
        self._scrape_file(
            scraper_cls(
                filename=self.filename,
                mimetype=self._predefined_mimetype,
                version=self._predefined_version,
                params=self._params,
            ),
            check_wellformed,
        )

    # Store the collected scraper results in the parameters before the
    # merging and checking steps below.
    self._params["scraper_results"] = self._scraper_results

    self._merge_results(check_wellformed)
    self._check_utf8(check_wellformed)

    # The first stream provides the final MIME type and version.
    first_stream = self.streams[0]
    self.mimetype = first_stream["mimetype"]
    self.version = self.streams[0]["version"]

    self._check_mime(check_wellformed)
def test_scrape_valid_file(filename, mimetype):
    """Test that every discovered scraper finds the file well-formed.

    The file is read from the test data directory named after the MIME
    type, with ``/`` replaced by ``_``.

    :filename: Name of the test file
    :mimetype: MIME type used for scraper discovery and for the data path
    """
    for scraper_cls in iter_scrapers(mimetype, None):
        data_dir = mimetype.replace('/', '_')
        path = os.path.join(BASEPATH, data_dir, filename)
        scraper = scraper_cls(path, mimetype)
        scraper.scrape_file()
        assert scraper.well_formed
def test_scrape_invalid_file(filename, mimetype):
    """Test scraping for non well-formed odt file.

    Every scraper class yielded by ``iter_scrapers`` is run on the file.
    At least one scraper must be found, and at least one of them must not
    report the file as well-formed.

    :filename: Name of the file in the opendocument text test data directory
    :mimetype: MIME type used for scraper discovery and given to scrapers
    """
    # The path does not depend on the scraper class, so build it once.
    path = os.path.join(
        BASEPATH, "application_vnd.oasis.opendocument.text", filename)

    scraper_results = []
    for class_ in iter_scrapers(mimetype, None):
        scraper = class_(path, mimetype)
        # Run the scraper before reading the result, as
        # test_scrape_valid_file does. Without this call well_formed only
        # holds its pre-scrape value and the assertions pass vacuously.
        scraper.scrape_file()
        scraper_results.append(scraper.well_formed)

    # An empty result list means no scraper was discovered at all.
    assert scraper_results
    assert not all(scraper_results)
def scrape(self, check_wellformed=True):
    """Scrape file and collect metadata.

    Detects the file type, runs every scraper class yielded by
    ``iter_scrapers`` and builds ``streams`` from the collected scraper
    results with ``generate_metadata_dict``.

    :check_wellformed: True, full scraping; False, skip well-formed check.
    """
    self.detect_filetype()

    # File not found or MIME type could not be determined
    if not self.mimetype:
        self.streams = {}
        return

    # Pass the detected MIME type on to the scrapers via the parameters.
    self._params["mimetype_guess"] = self.mimetype

    applicable = iter_scrapers(
        mimetype=self.mimetype,
        version=self.version,
        check_wellformed=check_wellformed,
        params=self._params,
    )
    for scraper_cls in applicable:
        self._scrape_file(
            scraper_cls(self.filename, check_wellformed, self._params)
        )

    self.streams = generate_metadata_dict(self._scraper_results, LOSE)

    # Final checks once the metadata has been combined.
    self._check_utf8(check_wellformed)
    self._check_mimetype_version()
def test_iter_scrapers(mimetype, version, scraper_classes):
    """Test that scraper discovery yields exactly the expected classes.

    :mimetype: MIME type given to iter_scrapers
    :version: File format version given to iter_scrapers
    :scraper_classes: Names of the scraper classes expected to be found
    """
    discovered = iter_scrapers(mimetype, version)
    assert {cls.__name__ for cls in discovered} == set(scraper_classes)