def analyze(cls, something):
    """
    Accept a given input (e.g. a URL, file path, or file handle) and
    determine how to normalize it into an ``Ingestor`` while generating
    metadata.

    :param something: an existing ``cls`` instance, a string holding an
        ``http``/``https``/``file`` URL or a plain filesystem path, a
        ``requests.Response``, an ``HTTPResponse``, a urllib-style response
        (any object exposing ``geturl`` and ``info``), or any object with
        a ``read`` attribute.
    :returns: an iterable of ``cls`` instances: a one-element tuple for a
        single source, a generator producing one instance per entry of
        ``directory_files`` when the string names a directory, or an empty
        list when the input is not recognised.
    """
    if isinstance(something, cls):
        return (something, )

    if isinstance(something, basestring):
        # Treat strings as paths or URLs
        url = urlparse(something)
        scheme = url.scheme.lower()
        if scheme in ('http', 'https'):
            # Fetch now; the response is wrapped by the requests branch
            # further down.
            something = requests.get(something)
        elif scheme in ('file', ''):
            finalpath = url.path
            # A Windows file URL such as file:///C:/dir parses to the path
            # "/C:/dir"; drop that leading slash only when it is actually
            # there, so schemeless relative paths keep their first
            # character.
            if osname == 'nt' and finalpath.startswith('/'):
                finalpath = finalpath[1:]
            upath = fullpath(finalpath)
            if path.isdir(upath):
                return (cls(file_name=f) for f in directory_files(upath))
            return (cls(file_name=upath), )

    # Python requests
    if isinstance(something, requests.Response):
        fd = StringIO(something.content)
        return (cls(file_obj=fd, meta={
            'http_status': something.status_code,
            'http_headers': clean_headers(something.headers),
            'source_url': something.url
        }), )

    if isinstance(something, HTTPResponse):
        # Can't reliably tell the URL for HTTPResponses: record it when the
        # object happens to carry one, otherwise store None instead of
        # failing with AttributeError.
        return (cls(file_obj=something, meta={
            'http_status': something.status,
            'http_headers': clean_headers(something.getheaders()),
            'source_url': getattr(something, 'url', None)
        }), )
    elif hasattr(something, 'geturl') and hasattr(something, 'info'):
        # assume urllib or urllib2; read through the same accessors that
        # were just probed for
        return (cls(file_obj=something, meta={
            'http_status': something.getcode(),
            'http_headers': clean_headers(something.info()),
            'source_url': something.geturl()
        }), )
    elif hasattr(something, 'read'):
        # Fileobj will be a bit bland
        return (cls(file_obj=something), )

    # Unsupported input (including strings with an unhandled URL scheme)
    return []
def analyze(cls, something):
    """
    Accept a given input (e.g. a URL, file path, or file handle) and
    determine how to normalize it into an ``Ingestor`` while generating
    metadata.

    :param something: an existing ``cls`` instance, a string holding an
        ``http``/``https``/``file`` URL or a plain filesystem path, a
        ``requests.Response``, an ``HTTPResponse``, a urllib-style response
        (any object exposing ``geturl`` and ``info``), or any object with
        a ``read`` attribute.
    :returns: an iterable of ``cls`` instances: a one-element tuple for a
        single source, a generator producing one instance per entry of
        ``directory_files`` when the string names a directory, or an empty
        list when the input is not recognised.
    """
    if isinstance(something, cls):
        return (something, )

    if isinstance(something, basestring):
        # Treat strings as paths or URLs
        url = urlparse(something)
        scheme = url.scheme.lower()
        if scheme in ('http', 'https'):
            # Fetch now; the response is wrapped by the requests branch
            # further down.
            something = requests.get(something)
        elif scheme in ('file', ''):
            finalpath = url.path
            # A Windows file URL such as file:///C:/dir parses to the path
            # "/C:/dir"; drop that leading slash only when it is actually
            # there, so schemeless relative paths keep their first
            # character.
            if osname == 'nt' and finalpath.startswith('/'):
                finalpath = finalpath[1:]
            upath = fullpath(finalpath)
            if path.isdir(upath):
                return (cls(file_name=f) for f in directory_files(upath))
            return (cls(file_name=upath), )

    # Python requests
    if isinstance(something, requests.Response):
        fd = StringIO(something.content)
        return (cls(file_obj=fd, meta={
            'http_status': something.status_code,
            'http_headers': clean_headers(something.headers),
            'source_url': something.url
        }), )

    if isinstance(something, HTTPResponse):
        # Can't reliably tell the URL for HTTPResponses: record it when the
        # object happens to carry one, otherwise store None instead of
        # failing with AttributeError.
        return (cls(file_obj=something, meta={
            'http_status': something.status,
            'http_headers': clean_headers(something.getheaders()),
            'source_url': getattr(something, 'url', None)
        }), )
    elif hasattr(something, 'geturl') and hasattr(something, 'info'):
        # assume urllib or urllib2; read through the same accessors that
        # were just probed for
        return (cls(file_obj=something, meta={
            'http_status': something.getcode(),
            'http_headers': clean_headers(something.info()),
            'source_url': something.geturl()
        }), )
    elif hasattr(something, 'read'):
        # Fileobj will be a bit bland
        return (cls(file_obj=something), )

    # Unsupported input (including strings with an unhandled URL scheme)
    return []
def __init__(self, path=None, **kwargs):
    """
    Store the ``fullpath``-processed ``path`` and reject non-directories.

    :param path: location to record; the result of ``fullpath(path)`` is
        kept on ``self.path``.
    :param kwargs: accepted but not used by this method.
    :raises ValueError: if ``path`` exists and is not a directory.
    """
    self.path = fullpath(path)

    # The checks run against the ``path`` argument as given, not against
    # ``self.path``. A path that does not exist is accepted as-is.
    if os.path.exists(path):
        if not os.path.isdir(path):
            raise ValueError('Not a directory: %s' % path)