def test_single_element_filter(self):
    """Verify the effectiveness of single-valued matchers (filename, author, id).

    NOTE(review): the same FilterBuilder instance accumulates constraints across
    the three sections below — the later filters are built on top of the earlier
    constraints. Confirm that is intentional before reordering anything.
    """
    # Verify the effectiveness of single-valued matchers
    id1, filename1, author1, universe1 = uuid4(), URI("01.png"), "bdhnd", "fotwf"
    id2, filename2, author2, universe2 = uuid4(), URI("02.png"), "shndl", None
    id3, filename3, author3, universe3 = uuid4(), URI("03.png"), "okn", "ph"
    el1 = ImageMetadata(id1, filename1, author1, universe1, None, None)
    el2 = ImageMetadata(id2, filename2, author2, universe2, None, None)
    el3 = ImageMetadata(id3, filename3, author3, universe3, None, None)
    filter_builder = FilterBuilder()
    # Test constraints satisfied: each element's filename is among the allowed ones
    filter_builder.filename_constraint(filename1.path.name) \
        .filename_constraint(filename2.path.name) \
        .filename_constraint(filename3.path.name)
    filename_filter = filter_builder.get_filename_filter()
    self.assertTrue(filename_filter(el1))
    self.assertTrue(filename_filter(el2))
    self.assertTrue(filename_filter(el3))
    # Test implicit exclusion: constraining on author1 rejects the other authors
    filter_builder.author_constraint(author1)
    author_filter = filter_builder.get_author_filter()
    self.assertTrue(author_filter(el1))
    self.assertFalse(author_filter(el2))
    self.assertFalse(author_filter(el3))
    # Test explicit exclusion: the True flag negates the id2 constraint
    filter_builder.id_constraint(str(id2), True)
    id_filter = filter_builder.get_id_filter()
    self.assertTrue(id_filter(el1))
    self.assertFalse(id_filter(el2))
    self.assertTrue(id_filter(el3))
def test_collective_disjunctive_filter(self):
    """Verify the effectiveness of multi-valued matchers evaluated in disjunction.

    NOTE(review): the first builder keeps accumulating constraints — the second
    filter below combines the earlier inclusions with the "jack" exclusion, so
    el1 still passes via its "al" match. Confirm intended before refactoring.
    """
    # Verify the effectiveness of multi-valued matchers, when evaluated in disjunction
    chars1 = ["al", "john", "jack"]
    chars2 = ["jm", "jr"]
    chars3 = ["jr"]
    el1 = ImageMetadata(uuid4(), URI("a.png"), "ghi", None, chars1, None)
    el2 = ImageMetadata(uuid4(), URI("b.png"), "nsh", None, chars2, None)
    el3 = ImageMetadata(uuid4(), URI("c.png"), "ShT", None, chars3, None)
    filter_builder = FilterBuilder()
    # Test disjunctive filtering with inclusion: any one match is enough
    f = filter_builder.character_constraint("jm").character_constraint("al").characters_as_disjunctive(True) \
        .get_character_filter()
    self.assertTrue(f(el1))
    self.assertTrue(f(el2))
    self.assertFalse(f(el3))
    # Test disjunctive filtering with exclusion
    f = filter_builder.character_constraint("jack", True).get_character_filter()
    self.assertTrue(f(el1))
    self.assertTrue(f(el2))
    self.assertTrue(f(el3))
    # Fresh builder: exclusion-only disjunction, el1 matches both excluded names
    filter_builder = FilterBuilder()
    f = filter_builder.characters_as_disjunctive(True).character_constraint("john", True) \
        .character_constraint("jack", True).get_character_filter()
    self.assertFalse(f(el1))
    self.assertTrue(f(el2))
    self.assertTrue(f(el3))
def load_meta(img_file: Path) -> ImageMetadata:
    """
    Load the metadata tuple for a given image file.

    If no metadata file is present, or it is currently inaccessible, return a blank metadata tuple.

    :arg img_file: a path pointing to a managed image for which we want to load metadata
    :return: the associated metadata as a tuple, or a blank metadata tuple
    """
    meta_file = _construct_metadata_path(img_file)
    if meta_file.exists():
        try:
            with meta_file.open() as mf:
                metadata = parse_xml(mf.read())
                # Check if 'file' is a valid URI, otherwise make it so (for retro-compatibility with older schema)
                if metadata.file.scheme is None:
                    metadata = _old_to_new_schema(img_file, metadata)
        except (OSError, ParseError):
            # Unreadable or malformed metadata: fall back to a blank tuple with a
            # deterministic id derived from the image's URI (uuid3 is stable per path).
            metadata = ImageMetadata(uuid3(NAMESPACE_URL, str(URI(img_file))),
                                     URI(img_file), None, None, None, None)
    else:
        # No metadata file at all: same deterministic blank tuple as above.
        metadata = ImageMetadata(uuid3(NAMESPACE_URL, str(URI(img_file))),
                                 URI(img_file), None, None, None, None)
    return metadata
def test_none_match_collective(self):
    """A None constraint on a multi-valued matcher accepts only elements without values."""
    untagged = ImageMetadata(uuid4(), URI('aaa'), None, None, None, None)
    tagged = ImageMetadata(uuid4(), URI('yyy'), None, None, None, ["fta"])
    matcher = FilterBuilder().tag_constraint(None).get_tag_filter()
    self.assertTrue(matcher(untagged))
    self.assertFalse(matcher(tagged))
def test_none_match_single(self):
    """A None constraint on a single-valued matcher accepts only elements without a value."""
    with_universe = ImageMetadata(uuid4(), URI('fff'), None, 'u', None, None)
    without_universe = ImageMetadata(uuid4(), URI('zzz'), None, None, None, None)
    matcher = FilterBuilder().universe_constraint(None).get_universe_filter()
    self.assertFalse(matcher(with_universe))
    self.assertTrue(matcher(without_universe))
def test_empty_filter_collective(self):
    """With no constraints at all, a multi-valued filter accepts everything."""
    tagged = ImageMetadata(uuid4(), URI('xxx'), None, None, None, ["nl", "ll"])
    untagged = ImageMetadata(uuid4(), URI('kkk'), None, None, None, None)
    accept_all = FilterBuilder().get_tag_filter()
    for element in (tagged, untagged):
        self.assertTrue(accept_all(element))
def test_empty_filter_single(self):
    """With no constraints at all, a single-valued filter accepts everything."""
    first = ImageMetadata(uuid4(), URI('xxx'), None, None, None, None)
    second = ImageMetadata(uuid4(), URI('kkk'), None, None, None, None)
    accept_all = FilterBuilder().get_id_filter()
    for element in (first, second):
        self.assertTrue(accept_all(element))
def test_issue_003_path_like_division_trailing():
    """Issue #3: `/` resolution differs for bases with and without a trailing slash."""
    with_slash = URI("http://example.com/foo/")
    assert str(with_slash) == "http://example.com/foo/"
    # Trailing slash: the new segment is appended under /foo/.
    assert str(with_slash / "bar.html") == "http://example.com/foo/bar.html"
    without_slash = URI("http://example.com/foo")
    assert str(without_slash) == "http://example.com/foo"
    # No trailing slash: the last segment is replaced, RFC 3986 style.
    assert str(without_slash / "bar.html") == "http://example.com/bar.html"
def test_issue_003_path_on_path_division():
    """Issue #3: dividing a base URI by a relative URI resolves the reference."""
    listing = URI("http://ats.example.com/job/listing")
    # A relative reference, e.g. scraped from the listing page.
    relative_target = URI("detail/sample-job")
    assert str(listing / relative_target) == "http://ats.example.com/job/detail/sample-job"
def test_mixin_add_link() -> None:
    """Can add a new entry."""
    mixin = LinksMixin()
    mixin.add_link(key="link1", link=Link(href=URI("/test1")))
    mixin.add_link(key="link2", link=Link(href=URI("/test2")))
    assert mixin.links is not None
    # Each added entry is retrievable by key and keeps its href.
    for key, href in (("link1", "/test1"), ("link2", "/test2")):
        assert mixin.links[key] is not None
        assert mixin.links[key].href == URI(href)
def test_link_init() -> None:
    """Can init a new link."""
    # Minimal construction: href only.
    plain = Link(href=URI("/test"))
    assert plain is not None
    assert plain.href == URI("/test")
    # Construction with an explicit rel list.
    related = Link(href=URI("/test"), rel=["self"])
    assert related is not None
    assert related.href == URI("/test")
    assert related.rel == ["self"]
def to_uri(set_uri: str, scheme=None):
    """Build a URI from *set_uri*, percent-escaping spaces.

    When *scheme* is given, force it and make the path absolute. Otherwise parse
    the string as-is; if no scheme can be derived, fall back to scheme "string".

    :param set_uri: the raw URI text (must not be None)
    :param scheme: optional scheme to force onto the result
    :return: the constructed URI
    """
    AssertUtils.assert_not_null("to_uri", set_uri)
    escaped = set_uri.replace(" ", "%20")
    if scheme:
        # An explicit scheme requires an absolute path.
        path = escaped if escaped.startswith("/") else "/" + escaped
        return URI(scheme=scheme, path=path)
    candidate = URI(escaped)
    if candidate.scheme:
        return candidate
    # No scheme could be derived: use the "string" placeholder scheme.
    return URI(scheme="string", path=escaped)
def _old_to_new_schema(img_path: Path, old_meta: ImageMetadata):
    """Upgrade legacy metadata: keep every field but store the file as a URI.

    :param img_path: path of the image the metadata belongs to
    :param old_meta: metadata parsed from the old schema
    :return: an equivalent ImageMetadata with ``file`` set to ``URI(img_path)``
    """
    return ImageMetadata(
        img_id=old_meta.img_id,
        file=URI(img_path),
        author=old_meta.author,
        universe=old_meta.universe,
        characters=old_meta.characters,
        tags=old_meta.tags,
    )
def parse_xml(data: str) -> ImageMetadata:
    """Parse an XML containing image metadata.

    :param data: a string containing valid image metadata
    :return: an image metadata object
    :raises xml.etree.ElementTree.ParseError: if *data* is not well-formed XML
    :raises ValueError: if the 'id' attribute is not a valid UUID
    """
    image_elem = ElTree.fromstring(data)
    img_id = image_elem.get('id')
    file = image_elem.get('file')
    # If we were presented with a legacy XML not containing 'file', use the legacy name 'filename'
    if file is None:
        file = image_elem.get('filename')
    author = image_elem.find("./author")
    universe = image_elem.find("./universe")
    characters = [
        char.text for char in image_elem.findall("./characters/character")
    ]
    tags = [tag.text for tag in image_elem.findall("./tags/tag")]
    # Empty lists and missing elements are normalized to None.
    return ImageMetadata(
        img_id=UUID(img_id),
        file=URI(file),
        author=author.text if author is not None else None,
        universe=universe.text if universe is not None else None,
        characters=characters if len(characters) != 0 else None,
        tags=tags if len(tags) != 0 else None)
def test_issue_003_path_like_division_operators():
    """Issue #3: the `/` and `//` operators on URI behave like path resolution."""
    page = URI("http://example.com/foo/bar.html")
    # Sibling document: replaces the last path segment.
    assert str(page / "baz.html") == 'http://example.com/foo/baz.html'
    # `//` swaps the authority, then `/` resolves against it.
    assert str(page // "cdn.example.com" / "baz.html") == 'http://cdn.example.com/baz.html'
    # Absolute path: replaces the whole path.
    assert str(page / "/diz") == 'http://example.com/diz'
    # Fragment only: keeps the document, adds the fragment.
    assert str(page / "#diz") == 'http://example.com/foo/bar.html#diz'
    # Full URL: replaces everything.
    assert str(page / "https://example.com") == 'https://example.com/'
class CloudAppClient:
    """HTTP client for the CloudApp API, wrapping a shared requests Session.

    Credentials are picked up from the CLOUDAPP_USER / CLOUDAPP_PASSWORD
    environment variables when present.
    """

    base: ClassVar[URI] = URI('https://my.cl.ly')  # API root all calls resolve against.
    serialization: ClassVar[
        str] = "application/json"  # Used for Accept and Content-Type headers.
    session: Session  # Shared session carrying headers and auth for all calls.

    def __init__(self) -> None:
        """Initialize the client interface."""
        super().__init__()
        self.session = Session()
        # NOTE(review): impersonates the reference Ruby client's User-Agent —
        # presumably required by the service; confirm before changing.
        self.session.headers['User-Agent'] = 'Ruby.CloudApp.API'
        self.session.headers['Accept'] = self.serialization
        self.session.headers['Content-Type'] = self.serialization
        if 'CLOUDAPP_USER' in environ:
            # Raises KeyError if CLOUDAPP_PASSWORD is unset while CLOUDAPP_USER is.
            self.authenticate(environ['CLOUDAPP_USER'],
                              environ['CLOUDAPP_PASSWORD'])

    def authenticate(self, email: str, password: str):  # -> CloudAppClient:
        """Preserve authentication credentials for later use by RPC calls."""
        # Drop any stale Authorization header before switching to digest auth.
        self.session.headers.pop('Authorization', None)
        self.session.auth = HTTPDigestAuth(email, password)
        return self  # Fluent: allows client.authenticate(...).some_call().

    # Internal Mechanisms

    def __call__(self, path: str, method='get', **params):
        """Issue an API call against base/path; **params become query parameters."""
        uri: URI = self.base / path
        # Third positional argument of Session.request is `params` (the query string).
        return self.session.request(method, uri, params)

    def __getitem__(self, slug: str) -> Drop:
        """Retrieve a Drop by slug."""
        return Drop(slug, self)

    def __iter__(self) -> Iterable[Drop]:
        """Iterate all known drops."""
        # NOTE(review): relies on Drop.__class_getitem__ accepting this client
        # as its argument and returning a generator — confirm against Drop.
        return Drop[self]

    def _parse_errors(self, result):
        # Normalize the API's error payload (mapping, string, or collection)
        # into a list of human-readable messages.
        if isinstance(result, Mapping):
            return [f"{k}: {v}" for k, v in result.items()]
        if isinstance(result, str):
            return [result]
        if isinstance(result, Collection):
            return result
        return []
def meta_extractor(v: View) -> ImageMetadata:
    """Snapshot the view's current fields into an ImageMetadata tuple.

    Comma-separated text fields (characters, tags) become lists; fields the
    view reports as None stay None.
    """
    raw_characters = v.get_characters()
    raw_tags = v.get_tags()
    characters = None if raw_characters is None else raw_characters.split(', ')
    tags = None if raw_tags is None else raw_tags.split(', ')
    # NOTE(review): reaches into the private v._image_path — consider a public accessor.
    return ImageMetadata(v.image_id, URI(v._image_path), v.get_author(),
                         v.get_universe(), characters, tags)
def test_mixin_init() -> None:
    """Can init a new mixin."""
    bare = LinksMixin()
    assert bare is not None
    # Construction with an initial links mapping preserves the entry.
    self_link = Link(href=URI("/test"))
    populated = LinksMixin(links={"self": self_link})
    assert populated is not None
    assert populated.links is not None
    assert populated.links["self"] == self_link
def write(self) -> None:
    """
    Persist the updated metadata.

    :raise OSError: when the metadata file couldn't be opened
    """
    snapshot = ImageMetadata(self._id, URI(self._image_path), self.author,
                             self.universe, self.characters, self.tags)
    write_meta(snapshot, self._image_path)
def test_legacy_load(self):
    """Loading a legacy XML (with 'filename' instead of 'file') yields upgraded metadata."""
    expected_uri = URI(self.test_path / "test.png")
    with (self.test_path / "test.xml").open('w') as meta_file:
        meta_file.write(
            "<image id=\"97ed6183-73a0-46ea-b51d-0721b0fbd357\" filename=\"test.png\"></image>"
        )
    loaded = load_meta(Path(expected_uri.path))
    expected = ImageMetadata(UUID('97ed6183-73a0-46ea-b51d-0721b0fbd357'),
                             expected_uri, None, None, None, None)
    self.assertEqual(expected, loaded)
def test_collective_conjunctive_filter(self):
    """Multi-valued matchers evaluated in conjunction (the builder's default mode)."""
    el1 = ImageMetadata(uuid4(), URI("a.png"), "ghi", None, None, ["y", "an", "hry"])
    el2 = ImageMetadata(uuid4(), URI("b.png"), "nsh", None, None, ["an", "mb", "sty", "rp"])
    el3 = ImageMetadata(uuid4(), URI("c.png"), "ShT", None, None, ["ll", "vnl"])
    builder = FilterBuilder()
    # Inclusion: only elements whose tags contain "an" pass.
    conjunctive = builder.tag_constraint("an").get_tag_filter()
    self.assertTrue(conjunctive(el1))
    self.assertTrue(conjunctive(el2))
    self.assertFalse(conjunctive(el3))
    # Exclusion stacked on the same builder: "an" must be present AND "y" absent.
    conjunctive = builder.tag_constraint("y", True).get_tag_filter()
    self.assertFalse(conjunctive(el1))
    self.assertTrue(conjunctive(el2))
    self.assertFalse(conjunctive(el3))
def test_load_actual(self):
    """A current-schema XML loads into a fully populated metadata tuple."""
    image_uri = URI(self.test_path / "test.png")
    xml = ("<image id=\"97ed6183-73a0-46ea-b51d-0721b0fbd357\" file=\"" +
           str(image_uri) + "\">" +
           "<author>a</author><universe>u</universe>" +
           "<characters><character>x</character><character>y</character></characters>" +
           "<tags><tag>f</tag><tag>a</tag></tags></image>")
    with (self.test_path / "test.xml").open('w') as meta_file:
        meta_file.write(xml)
    expected = ImageMetadata(UUID('97ed6183-73a0-46ea-b51d-0721b0fbd357'),
                             image_uri, "a", "u", ["x", "y"], ["f", "a"])
    self.assertEqual(expected,
                     load_meta(Path(self.test_dir.name) / "test.png"))
def test_metadata_read(self):
    """The view exposes stored metadata, and blank metadata for files without any."""
    # Write some metadata for one of the images.
    meta1 = ImageMetadata(
        img_id=UUID('f32ed6ad-1162-4ea6-b243-1e6c91fb7eda'),
        file=URI(self.test_path / '01.png'),
        author="a",
        universe="p",
        characters=["x", "y"],
        tags=["t", "f"])
    write_meta(meta1, (self.test_path / '01.png'))
    specimen = GtkView(self.test_path)
    # Collect metadata for both images from the specimen.
    results = {}
    for _ in range(2):
        specimen.load_next()
        results[specimen.filename] = TestView.meta_extractor(specimen)
    self.assertEqual(meta1, results["01.png"])
    # The second image has no stored metadata, so it comes back blank.
    blank = ImageMetadata(results["02.png"].img_id,
                          URI(self.test_path / "02.png"),
                          None, None, None, None)
    self.assertEqual(blank, results["02.png"])
def test_store(self):
    """write_meta serializes metadata to the sibling XML file with the expected content."""
    image_uri = URI(self.test_path / "test.png")
    write_meta(
        ImageMetadata(UUID('97ed6183-73a0-46ea-b51d-0721b0fbd357'), image_uri,
                      "a", "u", ["x", "y"], ["f", "a"]),
        Path(self.test_dir.name) / "test.png")
    # Fix: the original used `with (self.test_path / "test.xml") as f:` —
    # a pathlib.Path is not a context manager (support was deprecated in 3.9
    # and removed in 3.13), and the `with` added nothing. Read directly.
    result = (self.test_path / "test.xml").read_text()
    self.assertEqual(
        "<image id=\"97ed6183-73a0-46ea-b51d-0721b0fbd357\" file=\"" +
        str(image_uri) + "\">" +
        "<author>a</author><universe>u</universe>" +
        "<characters><character>x</character><character>y</character></characters>" +
        "<tags><tag>f</tag><tag>a</tag></tags></image>", result)
def split_uri(uri_str):
    """Get the scheme (+namespace if not a URL), and value from URI.

    :param uri_str: the raw URI string, e.g. an http(s) URL or 'doi:10.x/y'
    :return: a two-element list [scheme, value], both lowercased

    NOTE(review): 'heirarchical' below is the `uri` package's actual
    (historically misspelled) attribute name — do not "correct" it.
    """
    uri = URI(uri_str)
    if uri.scheme.name in ['http', 'https']:
        scheme = uri.scheme.name
        # we're replacing the scheme instead of using heirarchical
        # to preserve query strings
        value = uri_str.replace(uri.scheme.name + '://', '', 1)
    else:
        # e.g. uri.heirarchical = 'doi:10.11647/obp.0130';
        # we are assuming the path only contains one colon
        namespace, value = uri.heirarchical.split(':', 1)
        scheme = ''.join([uri.scheme.name, ':', namespace])
        if namespace == "isbn":
            # we store hyphenless isbn numbers - remove hyphens from input
            value = value.replace("-", "")
    # we store lowercased URIs - let's lower input
    return [scheme.lower(), value.lower()]
def inner(environ: WSGIEnvironment, start_response: WSGIStartResponse):
    """WSGI entry point: reject blacklisted clients, run heuristics, then call the app.

    A heuristic (or the wrapped app) raising HTTPClose gets the client address
    blacklisted, unless it is in the exempt set.
    """
    try:
        request: Request = Request(environ)  # This will be remembered and re-used as a singleton later.
        uri: URI = URI(request.url)
    except Exception as e:  # Protect against de-serialization errors.
        return HTTPBadRequest(f"Encountered error de-serializing the request: {e!r}")(environ, start_response)
    # https://docs.pylonsproject.org/projects/webob/en/stable/api/request.html#webob.request.BaseRequest.client_addr
    # Ref: https://www.nginx.com/resources/wiki/start/topics/examples/forwarded/
    # NOTE(review): `client` is assigned but never read below — request.client_addr
    # is used directly instead; confirm whether this local is vestigial.
    client: str = request.client_addr
    try:
        # Immediately reject known bad actors.
        if inet_aton(request.client_addr) in self.blacklist:
            return HTTPClose()(environ, start_response)  # No need to re-blacklist.
        # Validate the heuristic rules.
        for heuristic in self.heuristics:
            try:
                heuristic(environ, uri)
            except HTTPClose as e:
                log.error(f"{heuristic} {e.args[0].lower()}")
                raise
        # Invoke the wrapped application if everything seems OK. Note that this pattern of wrapping permits
        # your application to raise HTTPClose if wishing to blacklist the active connection for any reason.
        return app(environ, start_response)
    except HTTPClose as e:
        if request.client_addr not in self.exempt:
            log.warning(f"Blacklisting: {request.client_addr}")
            self.blacklist.add(inet_aton(request.client_addr))
        if not __debug__:
            e = HTTPClose()  # Do not disclose the reason in production environments.
        elif ': ' in e.args[0]:  # XXX: Not currently effective.
            left, _, right = e.args[0].partition(': ')
            e.args = (f"<strong>{left}:</strong> <tt>{escape(right)}</tt>", )
        return e(environ, start_response)
def test_metadata_update(self):
    """Setters plus write() persist normalized metadata to disk."""
    target_filename = '02.png'
    specimen = GtkView(self.test_path)
    # Advance the view until the target image is shown.
    specimen.load_next()
    while specimen.filename != target_filename:
        specimen.load_next()
    # The target starts out with no metadata at all.
    self.assertIsNone(specimen.get_tags())
    # Set metadata (with messy whitespace) and verify it round-trips normalized.
    specimen.set_author(":DD")
    specimen.set_universe("\tu")
    specimen.set_characters("3, f,p\n")
    specimen.set_tags("fa,s \vjo,\u200dl")
    specimen.write()
    expected = ImageMetadata(specimen.image_id,
                             URI(self.test_path / target_filename),
                             ":DD", "u", ["3", "f", "p"], ["fa", "s jo", "l"])
    self.assertEqual(expected, load_meta(self.test_path / target_filename))
def __init__(self, uri: URI, expected_outputs: List[str]):
    """Remember the source URI (re-wrapped to normalize it) and the outputs expected from it."""
    self.uri, self.expected_outputs = URI(uri), expected_outputs
    super().__init__()
def test_http_get(self):
    """A live HTTP GET returns status 200 with the expected content type."""
    # NOTE: depends on network access to an external host.
    contents = self.sucker.get_contents(URI("https://www.qq.com"),
                                        {"method": "get"})
    first = contents[0]
    self.assertEqual(first.get_header("content-type"),
                     "text/html; charset=GB2312")
    self.assertEqual(first.get_status_code(), 200)
class Drop:
    """A single CloudApp "drop" (uploaded file or short-link), lazily fetched over the API."""

    base: ClassVar[URI] = URI(
        'https://cl.ly/'
    )  # The ID is appended to this as the first path element.
    # Name (format string) to use when saving locally.
    _storage: str = '{self.uploaded.year}/{self.uploaded.month}/{self.uploaded.day}/{self.id}--{self.slug}--{self.type}--{self.original}'
    id: int  # The internal integer identifier.
    type: str  # The meta-type (bulk grouping) of the uploaded drop.
    slug: str  # The URL slug used to access this drop.
    name: str  # Current file name.
    original: str  # Original uploaded file name.
    target: Optional[URI]  # Target URI if a "short link" redirection.
    content: Optional[URI]  # Content URI otherwise.
    size: int  # Total file size.
    views: int  # View counter.
    uploaded: datetime  # The date and time of initial upload.
    favourite: bool = False  # Has this been marked as a favourite?
    index: Optional[int]  # Position within an iteration (set by __class_getitem__).
    total: Optional[int]  # Total count reported by that iteration.
    _data: Dict  # Raw JSON metadata as last retrieved.
    _stats: URI  # Statistics endpoint URL for this drop.
    # JSON-key -> attribute mapping; tuple keys are fallback chains tried in order.
    _json_map: Dict[str, str] = {
        'id': 'id',
        'slug': 'slug',
        'created_at': 'uploaded',
        'item_type': 'type',
        'name': 'name',
        'redirect_url': 'target',
        ('file_name', 'name'): 'original',
        'view_counter': 'views',
        ('source_url', 'remote_url'): 'content',
        'stats_url': '_stats',
        'content_length': 'size',
        'favourite': 'favourite',
    }

    def __repr__(self) -> str:
        return f"Drop({'⚠️ ' if self.favourite else ''}{self.slug}, {self.type}, '{self.original}', size={self.size}, uploaded={self.uploaded.isoformat()})"

    def __init__(self, slug, api, json: bool = False) -> None:
        """Construct from a slug (fetched via *api*) or, when json=True, from a raw record.

        :raises ValueError: when the metadata fetch does not return HTTP 200
        """
        self._api = proxy(api)  # weakref proxy: do not keep the client alive.
        if json:
            # `slug` is actually an already-fetched JSON record in this mode.
            self._apply(slug)
            return
        self.slug = slug
        result = api.session.get(self.uri)
        if result.status_code != codes.ok:
            raise ValueError(
                f"Received {result.status_code!s} attempting to retrieve drop metadata."
            )
        self._apply(result.json())

    def __class_getitem__(Drop, api) -> Generator:
        """Fetch an iterator of all available drops over the given authenticated API instance.

        Args: page, per_page, type (image, bookmark, text, archive, audio, video, unknown), deleted
        """
        result = api('/v3/items').json()
        #__import__('pudb').set_trace()
        counter = count()
        # Follow the paginated listing until no next_url link remains.
        while result.get('links', {}).get('next_url', {}).get('href', None):
            for record in result['data']:
                try:
                    drop = Drop(record['slug'], api)
                except ValueError:
                    # Could not hydrate this drop: yield the raw record instead.
                    yield record
                    continue
                drop.index = next(counter)
                drop.total = result['meta']['count']
                yield drop
            result = api.session.get(
                result['links']['next_url']['href']).json()

    @property
    def uri(self) -> URI:
        # Public URL for this drop: base with the slug appended.
        return self.base / self.slug

    def save(self, path: Optional[Path] = None):
        """Save the drop locally: its .info.json sidecar plus its content (or a .webloc link)."""
        if not path:
            path = self._storage.format(self=self)
        target = Path(path).absolute()
        target.parent.mkdir(parents=True, exist_ok=True)
        # First, write out the .info.json for this drop.
        with target.with_suffix('.info.json').open('w', encoding='utf-8') as out:
            out.write(dumps(self._data, indent=4, sort_keys=True))
        if self.type == 'bookmark':
            # Short links become macOS .webloc files.
            target = target.with_suffix('.webloc')
            target.write_bytes(plist({'URL': self.target}, fmt=FMT_BINARY))
        else:
            # Skip the download when a complete copy already exists.
            if not target.exists() or (self.size and getsize(target) != self.size):
                with target.open('wb', buffering=8192) as out:
                    with self._api.session.get(self.content, stream=True) as req:
                        stream(req.raw, out)
        # Stamp the file with the original upload time.
        # NOTE(review): reconstructed from a whitespace-mangled source — confirm
        # this runs for both branches and not only after a download.
        uploaded = mktime(self.uploaded.timetuple())
        utime(target, (uploaded, uploaded))

    def _apply(self, metadata) -> None:
        """Populate attributes from a JSON record according to _json_map."""
        self._process(metadata)
        self._data = metadata
        for origin, destination in self._json_map.items():
            if isinstance(origin, tuple):
                origins = origin
            else:
                origins = (origin, )
            # Take the first non-None source key; default the attribute to None.
            for origin in origins:
                if metadata.get(origin, None) is not None:
                    setattr(self, destination, metadata.get(origin))
                    break
            else:
                setattr(self, destination, None)

    def _process(self, data: dict) -> None:
        """Perform minor additional typecasting or cleanup work after retrieval of a drop's metadata."""
        for key, value in data.items():
            if not isinstance(value, str):
                continue
            if key in ('file_name', ):
                value = unquote(value)
            elif key.endswith('_at') and value:
                try:
                    value = datetime.strptime(
                        value.rstrip('Z'),
                        '%Y-%m-%dT%H:%M:%S')  # Try with optional trailing Z.
                except ValueError:
                    value = datetime.strptime(
                        value, '%Y-%m-%d')  # Attempt without time component.
            # Safe in-place update: only existing keys are reassigned.
            data[key] = value