def test_corrupted_ocr(self):
    """Check that text conversion yields None for each "corrupted" file.

    Every handle found under the "corrupted" subdirectory of
    test_data_path is followed and converted to OutputType.Text; the
    conversion result is expected to compare equal to None.
    """
    corrupted_dir = os.path.join(test_data_path, "corrupted")
    source = FilesystemSource(corrupted_dir)
    with SourceManager() as manager:
        for handle in source.handles(manager):
            outcome = convert(handle.follow(manager), OutputType.Text)
            # Name the offending handle in the failure message.
            failure_note = "{0}: error handling failed".format(handle)
            self.assertEqual(outcome, None, failure_note)
def test_ocr_conversions(self):
    """Check that each "good" file converts to text equal to expected_result.

    Every handle found under the "good" subdirectory of test_data_path is
    followed and converted to OutputType.Text; the value of the resulting
    representation is compared against expected_result.
    """
    good_dir = os.path.join(test_data_path, "good")
    source = FilesystemSource(good_dir)
    with SourceManager() as manager:
        for handle in source.handles(manager):
            representation = convert(handle.follow(manager), OutputType.Text)
            extracted = representation.value
            self.assertEqual(
                extracted,
                expected_result,
                "{0}: content failed".format(handle),
            )
def try_apply(sm, source):
    """Return the text of the first convertible handle found under source.

    The handles of source are visited in order. A handle that
    Source.from_handle can reinterpret as a nested source is searched
    recursively; any other handle is followed and converted to
    OutputType.Text.

    Args:
        sm: the source manager handed to handles(), Source.from_handle()
            and follow().
        source: the source whose handles are explored.

    Returns:
        The value of the first text representation found, or None when no
        handle (at any depth) produces one.
    """
    for handle in source.handles(sm):
        derived = Source.from_handle(handle, sm)
        if derived:
            # Propagate a hit from the nested source; previously the result
            # of the recursive call was computed and then thrown away.
            found = try_apply(sm, derived)
            if found is not None:
                return found
            continue
        representation = convert(handle.follow(sm), OutputType.Text)
        # A conversion that produced nothing has no .value to read; keep
        # looking at the remaining handles instead of raising.
        if representation is not None:
            return representation.value
    return None
def try_apply(sm, source, rule):
    """Yield whatever rule.match produces for the content under source.

    The handles of source are visited in order. A handle that
    Source.from_handle can reinterpret as a nested source is explored
    recursively; any other handle is followed and converted to the output
    type named by rule.operates_on, and a truthy representation has its
    value passed to rule.match.

    Args:
        sm: the source manager handed to handles(), Source.from_handle()
            and follow().
        source: the source whose handles are explored.
        rule: object providing operates_on and match().

    Yields:
        The items produced by rule.match, in traversal order.
    """
    for handle in source.handles(sm):
        nested = Source.from_handle(handle, sm)
        if nested:
            yield from try_apply(sm, nested, rule)
            continue
        representation = convert(handle.follow(sm), rule.operates_on)
        if not representation:
            # Nothing to match against for this handle.
            continue
        yield from rule.match(representation.value)
def test_size_computation(self):
    """Check that each file under test_data_path reports expected_size.

    Every handle is followed and converted to OutputType.ImageDimensions.
    When the conversion yields nothing for a handle whose relative path
    contains "rgba32", the test is skipped; otherwise the converted value
    (or the falsy conversion result itself) is compared to expected_size.
    """
    fs = FilesystemSource(test_data_path)
    with SourceManager() as sm:
        for h in fs.handles(sm):
            resource = h.follow(sm)
            size = convert(resource, OutputType.ImageDimensions)
            if not size:
                if "rgba32" in h.relative_path:
                    self.skipTest("Pillow RGBA bug detected -- skipping")
                # Any other failed conversion falls through so that the
                # assertion below reports it.
            else:
                size = size.value
            # Fill in the handle so a failure says which file was wrong;
            # the placeholder was previously left unformatted.
            self.assertEqual(
                size, expected_size, "{0}: size failed".format(h))
while True: h_generator = s.handles(sm) h = next(h_generator) r = h.follow(sm) #if h.guess_type() == "text/plain": print(f"handle\t{h}") print(f"resource\t{r}") print("raw content:") with r.make_stream() as fp: print("\t\t{0}".format(fp.read())) # should succed for text -> text conversion try: rep = convert(r, OutputType.Text) print(f"Conveted\t{rep.value}") break except KeyError as e: # lets try to reinterpret the handle as a new Source s = Source.from_handle(h) # sz = Source.from_handle(h) # hz = next(sz.handles(sm)) # rz = hz.follow(sm) # with rz.make_stream() as fp: # print("\t\t{0}".format(fp.read())) ## Lets try manual hd = DataHandle(DataSource(content=b64encode(gzip_content), mime="text/plain",
else: break return h converters = registry.__converters # pprint(f"converters {converters}") sm = SourceManager() site = WebSource("http://localhost:64346/") page = WebHandle(source=site, path="side.html") resource = page.follow(sm) resource.check() mime_type = resource.compute_type() print(f"mime_type of resource {mime_type}") link_list = convert(resource, OutputType.Links).value rule = LinksFollowRule(sensitivity=Sensitivity.INFORMATION) matches = list(rule.match(link_list)) msg = messages.MatchFragment(rule, matches or []) with contextlib.closing(site.handles(sm)) as handles: first_thing = next(handles) second_thing = next(handles) h = base_referrer(second_thing) print(f"{second_thing.presentation} have {h.presentation} as base referrer")