def get_extension(content):
    """Guess a trustworthy file extension for a buffer of file content.

    The content is sniffed with libmagic, a few known-bad descriptions are
    overridden, the resulting MIME type is mapped to an extension with
    ``mimetypes.guess_extension``, and finally some misleading extensions
    are corrected.

    :param content: The raw file content, as ``bytes`` or ``str``.
    :return: The extension including its leading dot (e.g. ``'.pdf'``), or
        ``None`` when ``mimetypes`` has no extension for the MIME type.
    """
    file_str = magic.from_buffer(content)
    if file_str.startswith('Composite Document File V2 Document'):
        # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed
        # in libmagic 5.11-2.
        mime = 'application/msword'
    elif file_str == '(Corel/WP)':
        mime = 'application/vnd.wordperfect'
    elif file_str == 'C source, ASCII text':
        mime = 'text/plain'
    else:
        # No workaround necessary
        mime = magic.from_buffer(content, mime=True)

    extension = mimetypes.guess_extension(mime)
    if extension == '.obj':
        # It could be a wpd, if it's not a PDF. Does 'PDF' appear in the
        # beginning of the content?
        head = content[0:40]
        # Use a probe of the same type as the buffer: testing a str against
        # bytes raises TypeError on Python 3.
        marker = b'PDF' if isinstance(head, (bytes, bytearray)) else 'PDF'
        extension = '.pdf' if marker in head else '.wpd'

    remap = {
        # It's probably an HTML file, like those from Resource.org
        '.wsdl': '.html',
        '.ksh': '.txt',
        '.asf': '.wma',
    }
    return remap.get(extension, extension)
def get_extension(content):
    """Guess a trustworthy, lower-cased file extension for file content.

    The content is sniffed with libmagic, a few known-bad descriptions are
    overridden, the resulting MIME type is mapped to an extension with
    ``mimetypes.guess_extension``, and finally some misleading extensions
    are replaced via a lookup table.

    :param content: The raw file content, as ``bytes`` or ``str``.
    :return: The lower-cased extension including its leading dot (e.g.
        ``".pdf"``), or ``None`` when ``mimetypes`` has no extension for the
        detected MIME type.
    """
    file_str = magic.from_buffer(content)
    if file_str.startswith("Composite Document File V2 Document"):
        # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed
        # in libmagic 5.11-2.
        mime = "application/msword"
    elif file_str == "(Corel/WP)":
        mime = "application/vnd.wordperfect"
    elif file_str == "C source, ASCII text":
        mime = "text/plain"
    else:
        # No workaround necessary
        mime = magic.from_buffer(content, mime=True)

    extension = mimetypes.guess_extension(mime)
    if extension is None:
        # Unknown MIME type: there is nothing to fix up, and calling
        # .lower() on None would raise AttributeError.
        return None

    if extension == ".obj":
        # It could be a wpd, if it's not a PDF. Does 'PDF' appear in the
        # beginning of the content?
        head = content[0:40]
        # Use a probe of the same type as the buffer: testing a str against
        # bytes raises TypeError on Python 3.
        marker = b"PDF" if isinstance(head, (bytes, bytearray)) else "PDF"
        extension = ".pdf" if marker in head else ".wpd"

    # Extensions that mimetypes reports but that are wrong or non-canonical
    # for our purposes, mapped to the one we actually want.
    fixes = {
        ".htm": ".html",
        ".xml": ".html",
        ".wsdl": ".html",
        ".ksh": ".txt",
        ".asf": ".wma",
        ".dot": ".doc",
    }
    return fixes.get(extension, extension).lower()
def test_for_meta_redirections(r):
    """Check whether an HTML response contains a meta-refresh redirect.

    :param r: A response object exposing ``content`` (raw body), ``text``
        (decoded body) and ``url`` (the URL that was fetched). Presumably a
        ``requests.Response`` -- confirm against callers.
    :return: A ``(found, url)`` tuple. ``found`` is True when a
        ``<meta http-equiv="refresh">`` tag with a ``url=`` target exists,
        in which case ``url`` is the absolute redirect target. Otherwise
        ``(False, None)`` is returned.
    """
    mime = magic.from_buffer(r.content, mime=True)
    extension = mimetypes.guess_extension(mime)
    # Older Python versions may report ".htm" for text/html.
    if extension not in (".html", ".htm"):
        return False, None

    html_tree = html.fromstring(r.text)
    # translate() lower-cases the letters of "refresh" so the http-equiv
    # comparison is case-insensitive.
    path = (
        "//meta[translate(@http-equiv, 'REFSH', 'refsh') = "
        "'refresh']/@content"
    )
    matches = html_tree.xpath(path)
    if not matches:
        return False, None

    # The attribute looks like "<seconds>; url=<target>". Split on the first
    # ";" only: the delay may stand alone (no ";") and the target itself may
    # contain semicolons, either of which breaks a two-way str.split().
    _wait, _sep, text = matches[0].partition(";")
    text = text.strip()
    if not text.lower().startswith("url="):
        # A plain refresh without a target is not a redirect.
        return False, None

    # Drop optional whitespace and quotes around the target.
    url = text[4:].strip().strip("'\"")
    if not url.startswith("http"):
        # Relative URL, adapt
        url = urljoin(r.url, url)
    return True, url