class ResponseTypes(object):
    """Choose a Response class from a MIME type, headers, file name or body.

    ``CLASSES`` maps MIME types to dotted paths of Response classes; the
    paths are resolved once, in ``__init__``, and stored in ``self.classes``.
    Every lookup falls back to the plain ``Response`` class when nothing
    more specific is known.
    """

    # MIME type -> dotted path of the Response class used for it.
    # 'text/*' is the wildcard entry reached through the "<major>/*"
    # fallback in from_mimetype().
    CLASSES = {
        'text/html': 'scrapy.http.HtmlResponse',
        'application/atom+xml': 'scrapy.http.XmlResponse',
        'application/rdf+xml': 'scrapy.http.XmlResponse',
        'application/rss+xml': 'scrapy.http.XmlResponse',
        'application/xhtml+xml': 'scrapy.http.HtmlResponse',
        'application/vnd.wap.xhtml+xml': 'scrapy.http.HtmlResponse',
        'application/xml': 'scrapy.http.XmlResponse',
        'application/json': 'scrapy.http.TextResponse',
        'application/x-json': 'scrapy.http.TextResponse',
        'application/javascript': 'scrapy.http.TextResponse',
        'application/x-javascript': 'scrapy.http.TextResponse',
        'text/xml': 'scrapy.http.XmlResponse',
        'text/*': 'scrapy.http.TextResponse',
    }

    def __init__(self):
        """Load the bundled ``mime.types`` data and resolve ``CLASSES``."""
        self.classes = {}
        self.mimetypes = MimeTypes()
        # NOTE(review): get_data is presumably pkgutil.get_data (returns
        # bytes, hence the decode) -- confirm against the file's imports.
        mimedata = get_data('scrapy', 'mime.types').decode('utf8')
        self.mimetypes.readfp(StringIO(mimedata))
        # NOTE(review): load_object presumably imports the object named by
        # the dotted path -- confirm against its definition.
        for mimetype, cls in six.iteritems(self.CLASSES):
            self.classes[mimetype] = load_object(cls)

    def from_mimetype(self, mimetype):
        """Return the most appropriate Response class for the given mimetype"""
        if mimetype is None:
            return Response
        elif mimetype in self.classes:
            return self.classes[mimetype]
        else:
            # No exact entry: try the "<major type>/*" wildcard.
            basetype = "%s/*" % mimetype.split('/')[0]
            return self.classes.get(basetype, Response)

    def from_content_type(self, content_type, content_encoding=None):
        """Return the most appropriate Response class from an HTTP
        Content-Type header.

        Any truthy ``content_encoding`` short-circuits to ``Response``.
        Parameters after the first ``;`` (e.g. charset) are ignored and the
        MIME type is compared case-insensitively.
        """
        if content_encoding:
            return Response
        mimetype = content_type.split(';')[0].strip().lower()
        return self.from_mimetype(mimetype)

    def from_content_disposition(self, content_disposition):
        """Return the Response class for the file name carried by a
        Content-Disposition header.

        The file name is read from the second ``;``-separated field, after
        its first ``=``, with surrounding quotes removed. ``Response`` is
        returned when the header does not have that shape.
        """
        try:
            filename = content_disposition.split(';')[1].split('=')[1]
            filename = filename.strip('"\'')
            return self.from_filename(filename)
        except IndexError:
            return Response

    def from_headers(self, headers):
        """Return the most appropriate Response class by looking at the HTTP
        headers.

        Content-Type (together with Content-Encoding) is consulted first;
        Content-Disposition is only used when that yields plain ``Response``.
        """
        cls = Response
        if 'Content-Type' in headers:
            # Read the header with the same key spelling that was tested
            # above, so case-sensitive mappings do not raise KeyError.
            cls = self.from_content_type(
                headers['Content-Type'],
                headers.get('Content-Encoding'),
            )
        if cls is Response and 'Content-Disposition' in headers:
            cls = self.from_content_disposition(headers['Content-Disposition'])
        return cls

    def from_filename(self, filename):
        """Return the most appropriate Response class from a file name.

        File names for which ``guess_type`` reports an encoding (or no MIME
        type at all) map to plain ``Response``.
        """
        mimetype, encoding = self.mimetypes.guess_type(filename)
        if mimetype and not encoding:
            return self.from_mimetype(mimetype)
        else:
            return Response

    def from_body(self, body):
        """Try to guess the appropriate response based on the body content.

        This method is a bit magic and could be improved in the future, but
        it's not meant to be used except for special cases where response
        types cannot be guessed using more straightforward methods.

        Only the first 5000 items of ``body`` are inspected.
        """
        chunk = body[:5000]
        if isbinarytext(chunk):
            return self.from_mimetype('application/octet-stream')
        # Lowercase once; both markers below are matched case-insensitively.
        lowered = chunk.lower()
        if "<html>" in lowered:
            return self.from_mimetype('text/html')
        elif "<?xml" in lowered:
            return self.from_mimetype('text/xml')
        else:
            # 'text' has no subtype, so this resolves through 'text/*'.
            return self.from_mimetype('text')

    def from_args(self, headers=None, url=None, filename=None, body=None):
        """Guess the most appropriate Response class based on the given
        arguments.

        Sources are tried in order -- headers, url, filename, body -- and a
        later source is only consulted while the result is still plain
        ``Response``.
        """
        cls = Response
        if headers is not None:
            cls = self.from_headers(headers)
        if cls is Response and url is not None:
            cls = self.from_filename(url)
        if cls is Response and filename is not None:
            cls = self.from_filename(filename)
        if cls is Response and body is not None:
            cls = self.from_body(body)
        return cls
class ResponseTypes(object):
    """Choose a Response class from a MIME type, headers, file name or body.

    ``CLASSES`` maps MIME types to dotted paths of Response classes; the
    paths are resolved once, in ``__init__``, and stored in ``self.classes``.
    Every lookup falls back to the plain ``Response`` class when nothing
    more specific is known.
    """

    # MIME type -> dotted path of the Response class used for it.
    # 'text/*' is the wildcard entry reached through the "<major>/*"
    # fallback in from_mimetype().
    CLASSES = {
        'text/html': 'scrapy.http.HtmlResponse',
        'application/atom+xml': 'scrapy.http.XmlResponse',
        'application/rdf+xml': 'scrapy.http.XmlResponse',
        'application/rss+xml': 'scrapy.http.XmlResponse',
        'application/xhtml+xml': 'scrapy.http.HtmlResponse',
        'application/vnd.wap.xhtml+xml': 'scrapy.http.HtmlResponse',
        'application/xml': 'scrapy.http.XmlResponse',
        'application/json': 'scrapy.http.TextResponse',
        'application/javascript': 'scrapy.http.TextResponse',
        'application/x-javascript': 'scrapy.http.TextResponse',
        'text/xml': 'scrapy.http.XmlResponse',
        'text/*': 'scrapy.http.TextResponse',
    }

    def __init__(self):
        """Load the bundled ``mime.types`` data and resolve ``CLASSES``."""
        self.classes = {}
        self.mimetypes = MimeTypes()
        # NOTE(review): the data is handed to StringIO undecoded; that only
        # works if get_data returns a type this StringIO accepts -- confirm
        # against the file's imports.
        mimedata = get_data('scrapy', 'mime.types')
        self.mimetypes.readfp(StringIO(mimedata))
        # .items() exists on both Python 2 and 3 (iteritems is 2-only).
        # NOTE(review): load_object presumably imports the object named by
        # the dotted path -- confirm against its definition.
        for mimetype, cls in self.CLASSES.items():
            self.classes[mimetype] = load_object(cls)

    def from_mimetype(self, mimetype):
        """Return the most appropriate Response class for the given mimetype"""
        if mimetype is None:
            return Response
        elif mimetype in self.classes:
            return self.classes[mimetype]
        else:
            # No exact entry: try the "<major type>/*" wildcard.
            basetype = "%s/*" % mimetype.split('/')[0]
            return self.classes.get(basetype, Response)

    def from_content_type(self, content_type, content_encoding=None):
        """Return the most appropriate Response class from an HTTP
        Content-Type header.

        Any truthy ``content_encoding`` short-circuits to ``Response``.
        Parameters after the first ``;`` (e.g. charset) are ignored and the
        MIME type is compared case-insensitively.
        """
        if content_encoding:
            return Response
        mimetype = content_type.split(';')[0].strip().lower()
        return self.from_mimetype(mimetype)

    def from_content_disposition(self, content_disposition):
        """Return the Response class for the file name carried by a
        Content-Disposition header.

        The file name is read from the second ``;``-separated field, after
        its first ``=``, with surrounding quotes removed. ``Response`` is
        returned when the header does not have that shape.
        """
        try:
            filename = content_disposition.split(';')[1].split('=')[1]
            filename = filename.strip('"\'')
            return self.from_filename(filename)
        except IndexError:
            return Response

    def from_headers(self, headers):
        """Return the most appropriate Response class by looking at the HTTP
        headers.

        Content-Type (together with Content-Encoding) is consulted first;
        Content-Disposition is only used when that yields plain ``Response``.
        """
        cls = Response
        if 'Content-Type' in headers:
            # Read the header with the same key spelling that was tested
            # above, so case-sensitive mappings do not raise KeyError.
            cls = self.from_content_type(
                headers['Content-Type'],
                headers.get('Content-Encoding'),
            )
        if cls is Response and 'Content-Disposition' in headers:
            cls = self.from_content_disposition(headers['Content-Disposition'])
        return cls

    def from_filename(self, filename):
        """Return the most appropriate Response class from a file name.

        File names for which ``guess_type`` reports an encoding (or no MIME
        type at all) map to plain ``Response``.
        """
        mimetype, encoding = self.mimetypes.guess_type(filename)
        if mimetype and not encoding:
            return self.from_mimetype(mimetype)
        else:
            return Response

    def from_body(self, body):
        """Try to guess the appropriate response based on the body content.

        This method is a bit magic and could be improved in the future, but
        it's not meant to be used except for special cases where response
        types cannot be guessed using more straightforward methods.

        Only the first 5000 items of ``body`` are inspected.
        """
        chunk = body[:5000]
        if isbinarytext(chunk):
            return self.from_mimetype('application/octet-stream')
        # Lowercase once; both markers below are matched case-insensitively.
        lowered = chunk.lower()
        if "<html>" in lowered:
            return self.from_mimetype('text/html')
        elif "<?xml" in lowered:
            return self.from_mimetype('text/xml')
        else:
            # 'text' has no subtype, so this resolves through 'text/*'.
            return self.from_mimetype('text')

    def from_args(self, headers=None, url=None, filename=None, body=None):
        """Guess the most appropriate Response class based on the given
        arguments.

        Sources are tried in order -- headers, url, filename, body -- and a
        later source is only consulted while the result is still plain
        ``Response``.
        """
        cls = Response
        if headers is not None:
            cls = self.from_headers(headers)
        if cls is Response and url is not None:
            cls = self.from_filename(url)
        if cls is Response and filename is not None:
            cls = self.from_filename(filename)
        if cls is Response and body is not None:
            cls = self.from_body(body)
        return cls
class ResponseTypes:
    """Choose a Response class from a MIME type or from HTTP headers.

    ``CLASSES`` maps MIME types to dotted paths of Response classes; the
    paths are resolved once, in ``__init__``, and stored in ``self.classes``.
    Several branches only print a trace message instead of doing a lookup.
    """

    # MIME type -> dotted path of the Response class used for it.
    CLASSES = {
        'text/html': 'Jcrapy.https.HtmlResponse',
    }

    def __init__(self):
        """Load the bundled ``mime.types`` data and resolve ``CLASSES``."""
        self.classes = {}
        self.mimetypes = MimeTypes()
        # NOTE(review): get_data is presumably pkgutil.get_data (returns
        # bytes, hence the decode) -- confirm against the file's imports.
        mimedata = get_data('Jcrapy', 'mime.types').decode('utf8')
        self.mimetypes.readfp(StringIO(mimedata))
        # NOTE(review): load_object presumably imports the object named by
        # the dotted path -- confirm against its definition.
        for mimetype, cls in self.CLASSES.items():
            self.classes[mimetype] = load_object(cls)

    def from_mimetype(self, mimetype):
        """Return the most appropriate Response class for the given mimetype.

        MIME types without an entry in ``self.classes`` print a trace line
        and resolve to the plain ``Response`` class.
        """
        if mimetype is None:
            return Response
        elif mimetype in self.classes:
            return self.classes[mimetype]
        else:
            print('ResponseTypes.from_mimetype')
            # Always hand back a class: an implicit None here would defeat
            # the `cls is Response` checks in from_headers()/from_args().
            return Response

    def from_content_type(self, content_type, content_encoding=None):
        """Return the most appropriate Response class from an HTTP
        Content-Type header.

        Any truthy ``content_encoding`` short-circuits to ``Response``.
        Parameters after the first ``;`` are ignored and the MIME type is
        compared case-insensitively.
        """
        if content_encoding:
            return Response
        mimetype = to_unicode(content_type).split(';')[0].strip().lower()
        return self.from_mimetype(mimetype)

    def from_headers(self, headers):
        """Return the most appropriate Response class by looking at the HTTP
        headers.

        Only Content-Type / Content-Encoding influence the result; the
        Content-Disposition branch just prints a trace line.
        """
        cls = Response
        if b'Content-Type' in headers:
            cls = self.from_content_type(
                content_type=headers[b'Content-Type'],
                content_encoding=headers.get(b'Content-Encoding')
            )
        if cls is Response and b'Content-Disposition' in headers:
            print('from_headers')
        return cls

    def from_args(self, headers=None, url=None, filename=None, body=None):
        """Guess the most appropriate Response class based on the given
        arguments.

        Only ``headers`` influences the result. When the headers give no
        specific class, a trace line naming the first of ``url``,
        ``filename`` and ``body`` that is not None is printed.
        """
        cls = Response
        if headers is not None:
            cls = self.from_headers(headers)
        if cls is Response:
            if url is not None:
                print('url is not None')
            elif filename is not None:
                print('filename is not None')
            elif body is not None:
                print('body is not None')
        return cls
from mimetypes import MimeTypes
from os import path

from crawlmi.http import Response, TextResponse, XmlResponse, HtmlResponse
from crawlmi.utils.python import is_binary


# Module-level MIME database, filled from the `mime.types` file located in
# the same directory as this module.
_mime_types = MimeTypes()
_mt_filename = path.join(path.dirname(path.abspath(__file__)), 'mime.types')
with open(_mt_filename, 'rb') as f:
    _mime_types.readfp(f)

# MIME type -> Response class.
# NOTE(review): 'text/*' is presumably a wildcard fallback for any text
# subtype -- the lookup code that would use it is not visible here.
_response_classes = {
    'text/html': HtmlResponse,
    'application/atom+xml': XmlResponse,
    'application/rdf+xml': XmlResponse,
    'application/rss+xml': XmlResponse,
    'application/xhtml+xml': HtmlResponse,
    'application/vnd.wap.xhtml+xml': HtmlResponse,
    'application/xml': XmlResponse,
    'application/json': TextResponse,
    'application/javascript': TextResponse,
    'application/x-javascript': TextResponse,
    'text/xml': XmlResponse,
    'text/*': TextResponse,
}


def from_mime_type(mime_type):
    '''Return the most appropiate Response class for the given mime type.'''
    # NOTE(review): only the None case is visible in this view; the rest of
    # the function appears to be cut off -- confirm against the full file.
    if mime_type is None:
        return Response
from mimetypes import MimeTypes
from os import path

from crawlmi.http import Response, TextResponse, XmlResponse, HtmlResponse
from crawlmi.utils.python import is_binary


# Module-level MIME database, filled from the `mime.types` file located in
# the same directory as this module.
_mime_types = MimeTypes()
_mt_filename = path.join(path.dirname(path.abspath(__file__)), 'mime.types')
with open(_mt_filename, 'rb') as f:
    _mime_types.readfp(f)

# MIME type -> Response class.
# NOTE(review): 'text/*' is presumably a wildcard fallback for any text
# subtype -- the lookup code that would use it is not visible here.
_response_classes = {
    'text/html': HtmlResponse,
    'application/atom+xml': XmlResponse,
    'application/rdf+xml': XmlResponse,
    'application/rss+xml': XmlResponse,
    'application/xhtml+xml': HtmlResponse,
    'application/vnd.wap.xhtml+xml': HtmlResponse,
    'application/xml': XmlResponse,
    'application/json': TextResponse,
    'application/javascript': TextResponse,
    'application/x-javascript': TextResponse,
    'text/xml': XmlResponse,
    'text/*': TextResponse,
}


def from_mime_type(mime_type):
    '''Return the most appropiate Response class for the given mime type.'''
    if mime_type is None:
        # NOTE(review): the body of this branch and the rest of the function
        # are cut off in this view -- confirm against the full file.
class ResponseTypes:
    """Choose a Response class from a MIME type, headers, file name or body.

    ``CLASSES`` maps MIME types to dotted paths of Response classes; the
    paths are resolved once, in ``__init__``, and stored in ``self.classes``.
    Every lookup falls back to the plain ``Response`` class when nothing
    more specific is known.
    """

    # MIME type -> dotted path of the Response class used for it.
    # "text/*" is the wildcard entry reached through the "<major>/*"
    # fallback in from_mimetype().
    CLASSES = {
        "text/html": "scrapy.http.HtmlResponse",
        "application/atom+xml": "scrapy.http.XmlResponse",
        "application/rdf+xml": "scrapy.http.XmlResponse",
        "application/rss+xml": "scrapy.http.XmlResponse",
        "application/xhtml+xml": "scrapy.http.HtmlResponse",
        "application/vnd.wap.xhtml+xml": "scrapy.http.HtmlResponse",
        "application/xml": "scrapy.http.XmlResponse",
        "application/json": "scrapy.http.TextResponse",
        "application/x-json": "scrapy.http.TextResponse",
        "application/json-amazonui-streaming": "scrapy.http.TextResponse",
        "application/javascript": "scrapy.http.TextResponse",
        "application/x-javascript": "scrapy.http.TextResponse",
        "text/xml": "scrapy.http.XmlResponse",
        "text/*": "scrapy.http.TextResponse",
    }

    def __init__(self):
        """Load the bundled ``mime.types`` data and resolve ``CLASSES``."""
        self.classes = {}
        self.mimetypes = MimeTypes()
        # NOTE(review): get_data is presumably pkgutil.get_data (returns
        # bytes, hence the decode) -- confirm against the file's imports.
        mimedata = get_data("scrapy", "mime.types").decode("utf8")
        self.mimetypes.readfp(StringIO(mimedata))
        # NOTE(review): load_object presumably imports the object named by
        # the dotted path -- confirm against its definition.
        for mimetype, cls in self.CLASSES.items():
            self.classes[mimetype] = load_object(cls)

    def from_mimetype(self, mimetype):
        """Return the most appropriate Response class for the given mimetype"""
        if mimetype is None:
            return Response
        elif mimetype in self.classes:
            return self.classes[mimetype]
        else:
            # No exact entry: try the "<major type>/*" wildcard.
            basetype = f"{mimetype.split('/')[0]}/*"
            return self.classes.get(basetype, Response)

    def from_content_type(self, content_type, content_encoding=None):
        """Return the most appropriate Response class from an HTTP
        Content-Type header.

        Any truthy ``content_encoding`` short-circuits to ``Response``.
        Parameters after the first ``;`` (e.g. charset) are ignored and the
        MIME type is compared case-insensitively.
        """
        if content_encoding:
            return Response
        mimetype = to_unicode(content_type).split(";")[0].strip().lower()
        return self.from_mimetype(mimetype)

    def from_content_disposition(self, content_disposition):
        """Return the Response class for the file name carried by a
        Content-Disposition header.

        The header is converted with latin-1 (undecodable input replaced),
        then the file name is read from the second ``;``-separated field,
        after its first ``=``, with surrounding quotes removed. ``Response``
        is returned when the header does not have that shape.
        """
        try:
            filename = (
                to_unicode(
                    content_disposition, encoding="latin-1", errors="replace"
                )
                .split(";")[1]
                .split("=")[1]
                .strip("\"'")
            )
            return self.from_filename(filename)
        except IndexError:
            return Response

    def from_headers(self, headers):
        """Return the most appropriate Response class by looking at the HTTP
        headers.

        Content-Type (together with Content-Encoding) is consulted first;
        Content-Disposition is only used when that yields plain ``Response``.
        """
        cls = Response
        if b"Content-Type" in headers:
            cls = self.from_content_type(
                content_type=headers[b"Content-Type"],
                content_encoding=headers.get(b"Content-Encoding"),
            )
        if cls is Response and b"Content-Disposition" in headers:
            cls = self.from_content_disposition(
                headers[b"Content-Disposition"])
        return cls

    def from_filename(self, filename):
        """Return the most appropriate Response class from a file name.

        File names for which ``guess_type`` reports an encoding (or no MIME
        type at all) map to plain ``Response``.
        """
        mimetype, encoding = self.mimetypes.guess_type(filename)
        if mimetype and not encoding:
            return self.from_mimetype(mimetype)
        else:
            return Response

    def from_body(self, body):
        """Try to guess the appropriate response based on the body content.

        This method is a bit magic and could be improved in the future, but
        it's not meant to be used except for special cases where response
        types cannot be guessed using more straightforward methods.

        Only the first 5000 items of ``body`` are inspected. Markers are
        matched case-insensitively, in this order: ``<html>``, ``<?xml``,
        ``<!doctype html``.
        """
        chunk = body[:5000]
        chunk = to_bytes(chunk)
        if not binary_is_text(chunk):
            return self.from_mimetype("application/octet-stream")
        # Lowercase once; every marker below is matched case-insensitively.
        lowered = chunk.lower()
        if b"<html>" in lowered:
            return self.from_mimetype("text/html")
        if b"<?xml" in lowered:
            return self.from_mimetype("text/xml")
        # Pages whose <html> tag carries attributes are missed by the bare
        # "<html>" probe; the doctype still identifies them. Checked after
        # "<?xml" so documents with an XML prolog keep resolving as XML.
        if b"<!doctype html" in lowered:
            return self.from_mimetype("text/html")
        # "text" has no subtype, so this resolves through "text/*".
        return self.from_mimetype("text")

    def from_args(self, headers=None, url=None, filename=None, body=None):
        """Guess the most appropriate Response class based on the given
        arguments.

        Sources are tried in order -- headers, url, filename, body -- and a
        later source is only consulted while the result is still plain
        ``Response``.
        """
        cls = Response
        if headers is not None:
            cls = self.from_headers(headers)
        if cls is Response and url is not None:
            cls = self.from_filename(url)
        if cls is Response and filename is not None:
            cls = self.from_filename(filename)
        if cls is Response and body is not None:
            cls = self.from_body(body)
        return cls