def test_relaxed():
    """In relaxed mode a trailing ';' must not invalidate the header."""
    bare = parse_headers('attachment;', relaxed=True)
    assert bare.disposition == 'attachment'
    keyed = parse_headers('attachment; key=val;', relaxed=True)
    assert keyed.disposition == 'attachment'
    quoted = parse_headers('attachment; filename="spa ced";', relaxed=True)
    assert quoted.filename_unsafe == u'spa ced'
def test_relaxed():
    """Relaxed parsing tolerates a trailing semicolon after the header."""
    for header in ('attachment;', 'attachment; key=val;'):
        assert parse_headers(header, relaxed=True).disposition == 'attachment'
    cd = parse_headers('attachment; filename="spa ced";', relaxed=True)
    assert cd.filename_unsafe == u'spa ced'
def test_location_fallback():
    """With no Content-Disposition, the filename falls back to the URL path,
    decoding percent-encoded UTF-8 along the way."""
    cases = (
        ('https://foo/bar%c3%a9.py', u'baré.py'),
        ('https://foo/', u''),
        ('https://foo/%C3%A9toil%C3%A9/', u'étoilé'),
    )
    for location, expected in cases:
        assert parse_headers(None, location=location).filename_unsafe == expected
def test_strict():
    """In strict mode a trailing ';' makes the whole header invalid."""
    # Invalid headers degrade to the default 'inline' disposition.
    assert parse_headers('attachment;').disposition == 'inline'
    assert parse_headers('attachment; key=val;').disposition == 'inline'
    try:
        cd = parse_headers('attachment; filename="spa ced";')
    except ValueError:
        pass  # expected: strict parsing rejects the trailing semicolon
    else:
        assert False, cd
def test_strict():
    # Trailing ; means the header is rejected in strict mode.
    for header in ('attachment;', 'attachment; key=val;'):
        assert parse_headers(header).disposition == 'inline'
    raised = False
    cd = None
    try:
        cd = parse_headers('attachment; filename="spa ced";')
    except ValueError:
        raised = True
    assert raised, cd
def test_location_fallback():
    """Percent-encoded UTF-8 in the URL path decodes into the fallback name."""
    got = parse_headers(None, location='https://foo/bar%c3%a9.py').filename_unsafe
    assert got == u'baré.py'
    got = parse_headers(None, location='https://foo/').filename_unsafe
    assert got == u''
    got = parse_headers(None, location='https://foo/%C3%A9toil%C3%A9/').filename_unsafe
    assert got == u'étoilé'
def test_relaxed():
    """Relaxed mode: trailing ';' is tolerated; a malformed quoted
    filename* parameter yields no usable filename."""
    assert parse_headers('attachment;', relaxed=True).disposition == 'attachment'
    assert parse_headers('attachment; key=val;', relaxed=True).disposition == 'attachment'
    spaced = parse_headers('attachment; filename="spa ced";', relaxed=True)
    assert spaced.filename_unsafe == u'spa ced'
    dup = parse_headers(
        'attachment; filename="medium_SIEMEAE06658_1_PE_TAP2.png";'
        'filename*=UTF-8\'\'"medium_SIEMEAE06658_1_PE_TAP2.png"',
        relaxed=True)
    assert dup.filename_unsafe is None
def _filename(url, headers):
    """Given the URL and the HTTP headers received while fetching it,
    generate a reasonable name for the file.  If no suitable name can be
    found, return None. (Either uses the Content-Disposition explicit
    filename or a filename from the URL.)
    """
    filename = None
    # Try to get the filename from the (last) Content-Disposition header.
    heads = re.findall(r'^Content-Disposition:\s*(.*?)\r\n',
                       headers, re.I | re.M)
    if heads:
        # Use relaxed parsing: many real-world servers emit headers that
        # strict RFC 6266 parsing rejects -- and strict parse_headers can
        # raise ValueError on them, crashing the caller.
        filename = rfc6266.parse_headers(heads[-1],
                                         relaxed=True).filename_unsafe
    # Fall back to the last path segment of the URL.
    if not filename:
        parts = urlparse.urlparse(url).path.split('/')
        if parts:
            filename = parts[-1]
    # Strip unsafe characters from the name.
    if filename:
        filename = filename.strip()
        for sep in (os.sep, os.altsep):
            if sep:
                filename = filename.replace(sep, '_')
        for pat in FILENAME_REPLACE:
            filename = pat.sub('_', filename)
    if filename:
        return filename
def test_location_fallback():
    """URL-path fallback decodes percent-encoded UTF-8, including
    mixed-case escapes, trailing slashes and multi-byte characters."""
    cases = (
        ('https://foo/bar%c3%a9.py', u'baré.py'),
        ('https://foo/', u''),
        ('https://foo/%C3%A9toil%C3%A9/', u'étoilé'),
        ('http://vtv.vn/Content/Uploads/image/Trung%20khanh/Olympic%202012/SV%C4%90%20Olympic%202012%208.jpg',
         u"SVĐ Olympic 2012 8.jpg"),
    )
    for location, expected in cases:
        assert parse_headers(None, location=location).filename_unsafe == expected
def download_to_local(url, dir_name):
    """Downloads remote resource given its URL.

    Args:
        url: the URL to the resource.
        dir_name: the directory on the local filesystem to save the resource.

    Returns:
        filename: the filename (relative to the dir_name) of the downloaded
            resource. It may be different from the name of the remote
            resource because of sanitization.
    """
    # TODO: be able to verify SSL certificates from some publishers
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # Pick a name: Content-Disposition first, then the URL tail.
        filename = ""
        if "content-disposition" in r.headers:
            cd = rfc6266.parse_headers(r.headers["content-disposition"],
                                       relaxed=True)
            filename = cd.filename_unsafe
        if filename == "":
            filename = url.rsplit("/", 1)[-1]
        # Sanitization also covers the empty-filename case.
        filename = get_safe_filename(filename)
        # Stream the body straight to disk.
        with open(os.path.join(dir_name, filename), "wb") as out:
            shutil.copyfileobj(r.raw, out)
        return filename
def _find_attachments(self):
    """Retrieve attachments from the parsed body structure.

    We try to find and decode a file name for each attachment. If we
    failed, a generic name will be used (ie. part_1, part_2, ...).

    Results are stored in ``self.attachments``, keyed by part number.
    """
    for att in self.bs.attachments:
        # Default generic name; overwritten below when a real one is found.
        attname = "part_%s" % att["pnum"]
        if "params" in att and att["params"] != "NIL":
            # assumes params[1] holds the (possibly MIME-encoded) name
            # -- TODO confirm against the bodystructure parser
            attname = u2u_decode.u2u_decode(att["params"][1]) \
                .strip("\r\t\n")
        elif "disposition" in att and len(att["disposition"]) > 1:
            # disposition is (type, [key1, val1, key2, val2, ...]):
            # values sit at pos + 1 relative to their key.
            for pos, value in enumerate(att["disposition"][1]):
                if not value.startswith("filename"):
                    continue
                # Rebuild a Content-Disposition header so rfc6266 can
                # decode filename / filename* variants uniformly.
                header = "%s; %s=%s" \
                    % (att['disposition'][0], value,
                       att["disposition"][1][pos + 1].strip("\r\t\n"))
                attname = parse_headers(header).filename_unsafe
                if attname is None:
                    # rfc6266 gave nothing usable; fall back to raw
                    # MIME-word decoding of the value.
                    attname = u2u_decode.u2u_decode(
                        att["disposition"][1][pos + 1]
                    ).strip("\r\t\n")
                break
        self.attachments[att["pnum"]] = attname
def _filename(url, headers):
    """Derive a reasonable file name from *url* and the raw HTTP *headers*.

    Prefers the Content-Disposition filename, then the last URL path
    segment; returns None when nothing suitable is found.
    """
    name = None
    # Content-Disposition header; the last occurrence wins.
    matches = re.findall(r'^Content-Disposition:\s*(.*?)\r\n',
                         headers, re.I | re.M)
    if matches:
        name = rfc6266.parse_headers(matches[-1],
                                     relaxed=True).filename_unsafe
    # Fallback: last segment of the URL path.
    if not name:
        segments = urlparse.urlparse(url).path.split('/')
        if segments:
            name = segments[-1]
    # Strip unsafe characters from the candidate name.
    if name:
        name = name.strip()
        for sep in (os.sep, os.altsep):
            if sep:
                name = name.replace(sep, '_')
        for pattern in FILENAME_REPLACE:
            name = pattern.sub('_', name)
    if name:
        return name
def download(url): print "Downloading %s" % url response = requests.get(url) fname = rfc6266.parse_headers(response.headers['content-disposition']).filename_unsafe with open(fname, "wb") as f: f.write(response.content) return fname
def _find_attachments(self):
    """Retrieve attachments from the parsed body structure.

    We try to find and decode a file name for each attachment. If we
    failed, a generic name will be used (ie. part_1, part_2, ...).

    Fills ``self.attachments`` keyed by part number.
    """
    for att in self.bs.attachments:
        # Generic default, kept when no decodable name is found.
        attname = "part_%s" % att["pnum"]
        if "params" in att and att["params"] != "NIL":
            # assumes params[1] is the encoded name -- TODO confirm
            attname = u2u_decode.u2u_decode(att["params"][1]) \
                .strip("\r\t\n")
        elif "disposition" in att and len(att["disposition"]) > 1:
            # disposition[1] is a flat [key, value, ...] list; a value
            # lives at pos + 1 relative to its key.
            for pos, value in enumerate(att["disposition"][1]):
                if not value.startswith("filename"):
                    continue
                # Rebuild a header so rfc6266 handles filename/filename*.
                header = "%s; %s=%s" \
                    % (att['disposition'][0], value,
                       att["disposition"][1][pos + 1].strip("\r\t\n"))
                attname = parse_headers(header).filename_unsafe
                if attname is None:
                    # Fall back to plain MIME-word decoding.
                    attname = u2u_decode.u2u_decode(
                        att["disposition"][1][pos + 1]).strip("\r\t\n")
                break
        self.attachments[att["pnum"]] = attname
def download_file(self, file_id):
    """Download the file.

    Args:
        file_id (:obj:`int`): File id

    Returns:
        class:`models.responses.DownloadResponse` object on success,
        otherwise a ``BaseResponse`` carrying an ``error_code``.

    Raises:
        TypeError: if *file_id* is not an int.
    """
    if not isinstance(file_id, int):
        raise TypeError('file_id must be an instance of int')
    url = self._download_file_base_url + str(file_id)
    response = self._perform_get_file_request(url)
    if response.status_code == 200:
        try:
            filename = rfc6266.parse_headers(
                response.headers['Content-Disposition']).filename_unsafe
        except Exception:
            # Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt.  Fall back to a naive parse of the header.
            filename = re.findall('filename=(.+)',
                                  response.headers['Content-Disposition'])
        return resp.DownloadResponse(filename, response.content)
    if response.status_code == 401:
        return resp.BaseResponse(**{'error_code': 'authorization_error'})
    if response.status_code == 403 or response.status_code == 404:
        return resp.BaseResponse(**{'error_code': 'access_denied_file'})
    return resp.BaseResponse(**{'error_code': 'ServerError'})
def get_filename(url, header):
    """Best-effort filename: Content-Disposition first, URL as fallback."""
    filename = parse_headers(header.get("content-disposition")).filename_unsafe
    # The original used ``assert``/``except AssertionError`` as control
    # flow, which silently breaks under ``python -O`` (asserts stripped).
    if filename is None:
        filename = get_filename_from_url(url)
    return filename
def get_filename(entry):
    """Find the 'content-disposition' filename of a WARC entry."""
    mime = entry["mime"].lower()
    url_path = urlparse(entry["url"]).path.lower()
    for ext in DOC_EXT:
        if not (mime.endswith(ext) or url_path.endswith(".%s" % (ext))):
            continue
        for name, value in entry.record.status_headers.headers:
            if name.lower() == "content-disposition":
                return rfc6266.parse_headers(value).filename_unsafe
def get_filename(entry):
    """Find the 'content-disposition' filename of a WARC entry."""
    for ext in DOC_EXT:
        is_doc = (entry["mime"].lower().endswith(ext)
                  or urlparse(entry["url"]).path.lower().endswith(".%s" % (ext)))
        if not is_doc:
            continue
        for header, value in entry.record.status_headers.headers:
            if header.lower() == "content-disposition":
                cd = rfc6266.parse_headers(value)
                return cd.filename_unsafe
def filename(self):
    """Name for the downloaded resource: the Content-Disposition filename
    when the rfc6266 module is importable, else the URL's basename."""
    import urllib.parse
    headers = self.response.headers
    if "content-disposition" in headers:
        try:
            import rfc6266
        except ImportError as e:
            print(e)
            print("Couldn't import rfc6266; not using content-disposition header")
        else:
            return rfc6266.parse_headers(
                headers["content-disposition"]).filename_unsafe
    path = urllib.parse.urlparse(self.url).path
    return os.path.basename(path) or "index.html"
def download(url): print "Downloading %s" % url response = requests.get(url) if "Content-Disposition" in response.headers.keys(): fname = rfc6266.parse_headers( response.headers['Content-Disposition']).filename_unsafe else: fname = url.split("/")[-1] with open(fname, "wb") as f: f.write(response.content) return fname
def _digest_binary(self, data, headers):
    """Wrap binary payloads with filename/disposition metadata taken from
    the first Content-Disposition header; pass data through when absent."""
    disposition_values = [
        value for name, value in headers
        if name.lower() == 'content-disposition'
    ]
    if not disposition_values:
        return data
    cd = rfc6266.parse_headers(disposition_values[0], relaxed=True)
    return {
        'filename': cd.filename_unsafe,
        'disposition': cd.disposition,
        'content': data,
    }
def download_url(url, counter):
    """Download *url* into download_files/, naming the file from the
    Content-Disposition header or, failing that, ``<counter>.zip``.

    Best-effort: any failure is printed and swallowed.
    """
    try:
        res = requests.get(url)
        filename = rfc6266.parse_headers(
            res.headers['Content-Disposition']).filename_unsafe
        if not filename:
            filename = f'{counter}.zip'
        print(f'[INFO] Get filename {filename}')
        # Bug fix: the computed name was never used -- the body was written
        # to a fixed path, so every download overwrote the same file.
        with open(f'download_files/{filename}', 'wb+') as f:
            f.write(res.content)
    except Exception as e:
        print(e)
def test_parsing():
    """Strict-mode parsing: disposition, parameters, RFC 5987 encoding,
    and an empty quoted filename."""
    assert parse_headers(None).disposition == 'inline'
    assert parse_headers('attachment').disposition == 'attachment'
    assert parse_headers('attachment; key=val').assocs['key'] == 'val'
    assert parse_headers(
        'attachment; filename=simple').filename_unsafe == 'simple'

    # test ISO-8859-1
    fname = parse_headers(u'attachment; filename="oyé"').filename_unsafe
    assert fname == u'oyé', repr(fname)

    cd = parse_headers(
        'attachment; filename="EURO rates";'
        ' filename*=utf-8\'\'%e2%82%ac%20rates')
    assert cd.filename_unsafe == u'€ rates'
    # PEP 8: compare to None with ``is``, not ``==``.
    assert parse_headers('attachment; filename=""').filename_unsafe is None
def test_parsing():
    """Basic strict-mode parsing: disposition, parameters, RFC 5987."""
    assert parse_headers(None).disposition == 'inline'
    assert parse_headers('attachment').disposition == 'attachment'
    assert parse_headers('attachment; key=val').assocs['key'] == 'val'
    simple = parse_headers('attachment; filename=simple')
    assert simple.filename_unsafe == 'simple'
    # test ISO-8859-1
    fname = parse_headers(u'attachment; filename="oyé"').filename_unsafe
    assert fname == u'oyé', repr(fname)
    euro = parse_headers('attachment; filename="EURO rates";'
                         ' filename*=utf-8\'\'%e2%82%ac%20rates')
    assert euro.filename_unsafe == u'€ rates'
def parse_attachment(message_part, attachments=None):
    """Extract an attachment from an email part as a StringIO with
    metadata attributes, or None when the part is not an attachment.

    NOTE(review): uses deprecated ``dict.has_key`` and, when a payload is
    empty and not a list, falls through past the diagnostic prints into
    ``StringIO(file_data)`` with falsy data -- the sibling variant in this
    file marks that fall-through as a fixed bug.  Confirm before relying
    on this version.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if content_disposition:
        try:
            # Relaxed parsing copes with the malformed headers common in
            # real-world mail.
            cd = parse_headers(content_disposition, relaxed=True)
            if cd.disposition.lower() == "attachment":
                if not cd.assocs.has_key("filename"):
                    #print error or warning?
                    return None
                else:
                    file_data = message_part.get_payload(decode=True)
                    if not file_data:
                        payload = message_part.get_payload()
                        if isinstance(payload, list):
                            # Multipart: recurse into sub-parts instead.
                            for msgobj in payload:
                                parse2(msgobj, attachments)
                            return None
                        # Diagnostic dump of the undecodable payload.
                        print >>sys.stderr, message_part.get_payload()
                        print >>sys.stderr, message_part.get_content_charset()
                    attachment = StringIO(file_data)
                    attachment.content_type = message_part.get_content_type()
                    attachment.size = len(file_data)
                    attachment.name = cd.assocs['filename']
                    attachment.create_date = None
                    attachment.mod_date = None
                    attachment.read_date = None
                    # Copy any RFC 2183 date parameters onto the object.
                    for name, value in cd.assocs.iteritems():
                        if name == "create-date":
                            attachment.create_date = value  #TODO: datetime
                        elif name == "modification-date":
                            attachment.mod_date = value  #TODO: datetime
                        elif name == "read-date":
                            attachment.read_date = value  #TODO: datetime
                    return attachment
        except:
            # Log the offending header before propagating the error.
            print >>sys.stderr, "content_disposition:", content_disposition
            raise
    return None
def parse_attachment(message_part, attachments=None):
    """Extract an attachment from an email part as a StringIO with
    metadata attributes, or None when the part is not an attachment.

    Multipart payloads are recursed into via ``_parse2``.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if content_disposition:
        try:
            # Relaxed parsing copes with malformed real-world headers.
            cd = parse_headers(content_disposition, relaxed=True)
            if cd.disposition.lower() == "attachment":
                if not "filename" in cd.assocs:
                    #print error or warning?
                    return None
                else:
                    file_data = message_part.get_payload(decode=True)
                    if not file_data:
                        payload = message_part.get_payload()
                        if isinstance(payload, list):
                            for msgobj in payload:
                                _parse2(msgobj, attachments)
                        # PSIPHON: fixed conditional return -- always bail
                        # out when there is no decodable payload.
                        return None
                    attachment = StringIO(file_data)
                    attachment.content_type = message_part.get_content_type()
                    attachment.size = len(file_data)
                    attachment.name = cd.assocs['filename']
                    attachment.create_date = None
                    attachment.mod_date = None
                    attachment.read_date = None
                    # Copy any RFC 2183 date parameters onto the object.
                    for name, value in cd.assocs.iteritems():
                        if name == "create-date":
                            attachment.create_date = value  # TODO: datetime
                        elif name == "modification-date":
                            attachment.mod_date = value  # TODO: datetime
                        elif name == "read-date":
                            attachment.read_date = value  # TODO: datetime
                    return attachment
        except:
            # Log the offending header before re-raising.
            print >> sys.stderr, "content_disposition:", content_disposition
            raise
    return None
def download(target_dir, url):
    """Stream *url* into *target_dir* with a progress bar.

    Returns the path of the downloaded file.

    Raises:
        DownloadError: when the server does not answer with a 2xx status.
    """
    response = requests.get(url, stream=True)
    if not response.ok:
        raise DownloadError('Can\'t download %s: response status: %i' %
                            (url, response.status_code))
    fname = None
    cd = response.headers.get('Content-Disposition')
    if cd:
        fname = rfc6266.parse_headers(cd).filename_unsafe
    if not fname:
        fname = os.path.basename(url)
    log.info('Downloading %s' % fname)
    # Bug fix: ``.strip()`` was called on the raw header lookup, which is
    # None when the server omits Content-Length -> AttributeError.
    total = response.headers.get('content-length')
    total = int(total.strip()) if total else None
    path = os.path.join(target_dir, fname)
    with open(path, 'wb') as f:
        widgets = [
            progressbar.Percentage(), ' ',
            progressbar.Bar(), ' ',
            progressbar.ETA(), ' ',
            progressbar.FileTransferSpeed()
        ]
        pbar = progressbar.ProgressBar(widgets=widgets,
                                       max_value=total).start()
        size = 0
        for block in response.iter_content(1024):
            size += len(block)
            f.write(block)
            pbar.update(size)
        pbar.finish()
    return path
def _find_attachments(self):
    """Decode a file name for each attachment in the body structure,
    falling back to a generic ``part_<n>`` name, and record the result
    in ``self.attachments`` keyed by part number.
    """
    for att in self.bs.attachments:
        # Generic default, kept when no decodable name is found.
        attname = "part_%s" % att["pnum"]
        params = None
        key = None
        # Prefer the body-parameter "name"; fall back to the
        # Content-Disposition "filename".
        if "params" in att and att["params"] != "NIL":
            params = att["params"]
            key = "name"
        if key is None and "disposition" in att and len(att["disposition"]) > 1:
            params = att["disposition"][1]
            key = "filename"
        if key and params:
            # params is a flat [key, value, ...] list; the value sits at
            # pos + 1 relative to its key.
            for pos, value in enumerate(params):
                if not value.startswith(key):
                    continue
                # Rebuild a Content-Disposition header so rfc6266 handles
                # both filename and filename* variants.
                # NOTE(review): this reads att['disposition'][0] even on
                # the "params"/"name" path -- confirm it is always present.
                header = "%s; %s=%s" % (
                    att['disposition'][0], value,
                    u2u_decode.u2u_decode(params[pos + 1]).strip("\r\t\n"))
                attname = parse_headers(header).filename_unsafe
                if attname is None:
                    # Fall back to plain MIME-word decoding.
                    attname = u2u_decode.u2u_decode(
                        params[pos + 1]).strip("\r\t\n")
                break
        self.attachments[att["pnum"]] = attname
def get_info(self):
    """HEAD the URL and return ``(size, filename, content_type)``.

    Raises:
        Exception: when the server reports no usable Content-Length --
            a segmented download needs a known total size.
    """
    logger.info('Getting piece config from url %r' % (self.url, ))
    r = requests.head(self.url.geturl(), verify=False)
    size = r.headers.get('content-length')
    try:
        size = int(size)
    except (TypeError, ValueError):
        # TypeError covers a missing header (size is None); ValueError a
        # malformed value.  The old code only caught ValueError, so a
        # missing header crashed with an uncaught TypeError.
        raise Exception(
            'Size is invalid (%r), unable to segmented download.' % (size, ))
        #raise InvalidInputException('Size is invalid (%r), unable to segmented download.' % size)
    filename = None
    if r.headers.get('content-disposition'):
        filename = rfc6266.parse_headers(
            r.headers['content-disposition']).filename_unsafe
    if not filename:
        # Fall back to the last path segment (query string stripped).
        url_filename = self.url.path.split('?')[0].split('/')[-1]
        if url_filename:
            filename = url_filename
    return int(size), filename, r.headers.get('content-type')
def __init__(self, fp=None, headers=None, outerboundary="",
             environ=os.environ, keep_blank_values=0, strict_parsing=0):
    """Constructor.  Read multipart/* until last part.

    Arguments, all optional:

    fp              : file pointer; default: sys.stdin
        (not used when the request method is GET)

    headers         : header dictionary-like object; default:
        taken from environ as per CGI spec

    outerboundary   : terminating multipart boundary
        (for internal use only)

    environ         : environment dictionary; default: os.environ

    keep_blank_values: flag indicating whether blank values in
        percent-encoded forms should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    method = 'GET'
    self.keep_blank_values = keep_blank_values
    self.strict_parsing = strict_parsing
    if 'REQUEST_METHOD' in environ:
        method = environ['REQUEST_METHOD'].upper()
    self.qs_on_post = None
    if method == 'GET' or method == 'HEAD':
        # GET/HEAD: the "body" is the query string (or argv for CLI use).
        if 'QUERY_STRING' in environ:
            qs = environ['QUERY_STRING']
        elif sys.argv[1:]:
            qs = sys.argv[1]
        else:
            qs = ""
        fp = StringIO(qs)
        if headers is None:
            headers = {'content-type':
                       "application/x-www-form-urlencoded"}
    if headers is None:
        # Synthesize headers from the CGI environment (POST et al.).
        headers = {}
        if method == 'POST':
            # Set default content-type for POST to what's traditional
            headers['content-type'] = "application/x-www-form-urlencoded"
        if 'CONTENT_TYPE' in environ:
            headers['content-type'] = environ['CONTENT_TYPE']
        if 'QUERY_STRING' in environ:
            self.qs_on_post = environ['QUERY_STRING']
        if 'CONTENT_LENGTH' in environ:
            headers['content-length'] = environ['CONTENT_LENGTH']
    self.fp = fp or sys.stdin
    self.headers = headers
    self.outerboundary = outerboundary

    # Process content-disposition header.  Prefer the RFC 6266 parser
    # when the (optional) rfc6266 module is available; ``rfc6266`` is
    # presumably a module-or-None global -- TODO confirm at module scope.
    cdisp, pdict = "", {}
    if 'content-disposition' in self.headers and rfc6266:
        cd = rfc6266.parse_headers(self.headers['content-disposition'],
                                   relaxed=True)
        cdisp, pdict = cd.disposition, cd.assocs
    elif 'content-disposition' in self.headers:
        # Fallback: the simpler cgi.parse_header splitter.
        cdisp, pdict = parse_header(self.headers['content-disposition'])
    self.disposition = cdisp
    self.disposition_options = pdict
    self.name = None
    if 'name' in pdict:
        self.name = pdict['name']
    self.filename = None
    if 'filename' in pdict:
        self.filename = pdict['filename']
    if 'filename*' in pdict:
        # RFC 5987 extended value takes precedence over plain filename.
        self.filename = pdict['filename*'].string
    if self.filename and '&' in self.filename:
        # Undo HTML entity escaping some clients apply to the name.
        from HTMLParser import HTMLParser
        self.filename = HTMLParser().unescape(self.filename)
    if isinstance(self.filename, unicode):
        self.filename = self.filename.encode('utf8')

    # Process content-type header
    #
    # Honor any existing content-type header.  But if there is no
    # content-type header, use some sensible defaults.  Assume
    # outerboundary is "" at the outer level, but something non-false
    # inside a multi-part.  The default for an inner part is text/plain,
    # but for an outer part it should be urlencoded.  This should catch
    # bogus clients which erroneously forget to include a content-type
    # header.
    #
    # See below for what we do if there does exist a content-type header,
    # but it happens to be something we don't understand.
    if 'content-type' in self.headers:
        ctype, pdict = parse_header(self.headers['content-type'])
    elif self.outerboundary or method != 'POST':
        ctype, pdict = "text/plain", {}
    else:
        ctype, pdict = 'application/x-www-form-urlencoded', {}
    self.type = ctype
    self.type_options = pdict
    self.innerboundary = ""
    if 'boundary' in pdict:
        self.innerboundary = pdict['boundary']
    clen = -1
    if 'content-length' in self.headers:
        try:
            clen = int(self.headers['content-length'])
        except ValueError:
            pass
        if maxlen and clen > maxlen:
            raise ValueError, 'Maximum content length exceeded'
    self.length = clen

    self.list = self.file = None
    self.done = 0
    # Dispatch on the (possibly defaulted) content type.
    if ctype == 'application/x-www-form-urlencoded':
        self.read_urlencoded()
    elif ctype[:10] == 'multipart/':
        self.read_multi(environ, keep_blank_values, strict_parsing)
    else:
        self.read_single()
def shapefiles(base='.'):
    """
    Update any out-of-date shapefiles.

    Walks the registry, compares each source's last-modified date (FTP
    MDTM or HTTP Last-Modified) against the recorded ``last_updated``,
    downloads newer data and normalizes it (unzip, KMZ->KML,
    KML->shapefile via ogr2ogr, 3D->2D, PRJ fix-ups).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
    }

    def process(slug, config, url, data_file_path):
        # Normalize one downloaded file in place; only KML/KMZ/ZIP inputs
        # are supported.
        extension = os.path.splitext(data_file_path)[1]
        if extension in ('.kml', '.kmz', '.zip'):
            # Walk up to the enclosing git repository root.
            repo_path = os.path.dirname(data_file_path)
            while not os.path.exists(os.path.join(
                    repo_path, '.git')) and not repo_path == '/':
                repo_path = os.path.join(repo_path, '..')
            repo_path = os.path.realpath(repo_path)
            directory = dirname(config['file'])
            # Remove old files.
            for basename in os.listdir(directory):
                if basename not in ('.DS_Store', '__pycache__',
                                    'definition.py', 'LICENSE.txt',
                                    'data.kml', 'data.kmz', 'data.zip'):
                    os.unlink(os.path.join(directory, basename))
            files_to_add = []
            # Unzip any zip file.
            error_thrown = False
            if extension == '.zip':
                try:
                    zip_file = ZipFile(data_file_path)
                    for name in zip_file.namelist():
                        # Don't extract directories.
                        if name[-1] == '/':
                            continue
                        # Flatten the zip file hierarchy.
                        extension = os.path.splitext(name)[1]
                        if extension in ('.kml', '.kmz'):
                            basename = 'data%s' % extension  # assumes one KML or KMZ file per archive
                        else:
                            basename = os.path.basename(
                                name)  # assumes no collisions across hierarchy
                        # Extract only matching shapefiles.
                        if 'basename' in config and basename.split(
                                os.extsep, 1)[0] != config['basename']:
                            continue
                        with open(os.path.join(directory, basename), 'wb') as f:
                            with zip_file.open(name, 'r') as fp:
                                if 'skip_crc32' in config:
                                    # Disable CRC checking for known-bad archives.
                                    fp._expected_crc = None
                                f.write(fp.read())
                        if extension not in ('.kml', '.kmz'):
                            files_to_add.append(
                                os.path.join(directory, basename))
                except BadZipfile as e:
                    error_thrown = True
                    print('Bad ZIP file %s %s\n' % (e, url))
                finally:
                    os.unlink(data_file_path)
            # Unzip any KMZ file.
            kmz_file_path = os.path.join(directory, 'data.kmz')
            if not error_thrown and os.path.exists(kmz_file_path):
                try:
                    zip_file = ZipFile(kmz_file_path)
                    for name in zip_file.namelist():
                        # A KMZ file contains a single KML file and other supporting files.
                        # @see https://developers.google.com/kml/documentation/kmzarchives
                        if os.path.splitext(name)[1] == '.kml':
                            with open(os.path.join(directory, 'data.kml'),
                                      'wb') as f:
                                f.write(zip_file.read(name))
                except BadZipfile:
                    error_thrown = True
                    print('Bad KMZ file %s\n' % url)
                finally:
                    os.unlink(kmz_file_path)
            if not error_thrown:
                shp_file_path = glob(os.path.join(directory, '*.shp'))
                # Convert any KML to shapefile.
                if not shp_file_path:
                    kml_file_path = os.path.join(directory, 'data.kml')
                    if os.path.exists(kml_file_path):
                        result = run('ogrinfo -q %s | grep -v "3D Point"' %
                                     kml_file_path, hide='out').stdout
                        if result.count('\n') > 1:
                            print('Too many layers %s' % url)
                        else:
                            layer = re.search(r'\A\d+: (.+?) \(',
                                              result).group(1)
                            run('ogr2ogr -f "ESRI Shapefile" %s %s -nlt POLYGON "%s"'
                                % (directory, kml_file_path, layer), echo=True)
                            for name in glob(
                                    os.path.join(directory,
                                                 '*.[dps][bhr][fjpx]')):
                                files_to_add.append(name)
                            os.unlink(kml_file_path)
                # Merge multiple shapefiles into one.
                if len(shp_file_path) > 1:
                    for name in shp_file_path:
                        run('ogr2ogr -f "ESRI Shapefile" %s %s -update -append -nln Boundaries'
                            % (directory, name), echo=True)
                        basename = os.path.splitext(os.path.basename(name))[0]
                        for name in glob(
                                os.path.join(directory,
                                             '%s.[dps][bhr][fjnpx]' % basename)):
                            files_to_add.remove(name)
                            os.unlink(name)
                    shp_file_path = glob(os.path.join(directory, '*.shp'))
                if shp_file_path:
                    shp_file_path = shp_file_path[0]
                if shp_file_path and os.path.exists(shp_file_path):
                    # Convert any 3D shapefile into 2D.
                    result = run('ogrinfo -q %s' % shp_file_path,
                                 hide='out').stdout
                    if result.count('\n') > 1:
                        print('Too many layers %s' % url)
                    elif re.search('3D Polygon', result):
                        run('ogr2ogr -f "ESRI Shapefile" -overwrite %s %s -nlt POLYGON'
                            % (directory, shp_file_path), echo=True)
                # Drop entries whose files no longer exist on disk.
                for name in list(files_to_add):
                    if not os.path.exists(name):
                        files_to_add.remove(name)
                # Replace "Double_Stereographic" with "Oblique_Stereographic".
                # NOTE(review): shp_file_path may still be an empty list
                # here, in which case splitext would raise -- confirm
                # upstream guarantees at least one .shp at this point.
                prj_file_path = os.path.splitext(shp_file_path)[0] + '.prj'
                if prj_file_path and os.path.exists(prj_file_path):
                    with open(prj_file_path) as f:
                        prj = f.read()
                    if 'Double_Stereographic' in prj:
                        with open(prj_file_path, 'w') as f:
                            f.write(
                                prj.replace('Double_Stereographic',
                                            'Oblique_Stereographic'))
                elif 'prj' in config:
                    # No PRJ shipped: fetch the one named in the config.
                    with open(prj_file_path, 'w') as f:
                        f.write(requests.get(config['prj']).text)
                    files_to_add.append(prj_file_path)
                else:
                    print('No PRJ file %s' % url)
                # Update last updated timestamp.
                definition_path = os.path.join(directory, 'definition.py')
                with open(definition_path) as f:
                    definition = f.read()
                with open(definition_path, 'w') as f:
                    f.write(
                        re.sub(r'(?<=last_updated=date\()[\d, ]+',
                               last_updated.strftime('%Y, %-m, %-d'),
                               definition))
            # Print notes.
            if 'notes' in config:
                print('%s\n%s\n' % (config['file'], config['notes']))
        else:
            print('Unrecognized extension %s\n' % url)

    # Retrieve shapefiles.
    processed = set()
    for slug, config in registry(base).items():
        if config['file'] not in processed and 'data_url' in config:
            processed.add(config['file'])
            url = config['data_url']
            result = urlparse(url)
            if result.scheme == 'ftp':
                # Get the last modified timestamp.
                ftp = FTP(result.hostname)
                ftp.login(result.username, result.password)
                last_modified = ftp.sendcmd('MDTM %s' % result.path)
                # Parse the timestamp as a date.
                last_updated = datetime.strptime(last_modified[4:],
                                                 '%Y%m%d%H%M%S').date()
                if config['last_updated'] < last_updated:
                    # Determine the file extension.
                    extension = os.path.splitext(url)[1]
                    # Set the new file's name.
                    data_file_path = os.path.join(dirname(config['file']),
                                                  'data%s' % extension)
                    # Download new file.
                    ftp.retrbinary('RETR %s' % result.path,
                                   open(data_file_path, 'wb').write)
                    ftp.quit()
                    process(slug, config, url, data_file_path)
            else:
                # Get the last modified timestamp.
                arguments = {}
                if result.username:
                    # Strip the credentials out of the URL and pass them
                    # as HTTP basic auth instead.
                    url = '%s://%s%s' % (result.scheme, result.hostname,
                                         result.path)
                    arguments['auth'] = (result.username, result.password)
                try:
                    response = requests.head(url, headers=headers, **arguments)
                except requests.exceptions.SSLError:
                    response = requests.head(url, headers=headers,
                                             verify=False, **arguments)
                # If HEAD requests are not properly supported.
                if response.status_code in (204, 403, 405, 500) or (
                        response.status_code == 302
                        and '404' in response.headers['Location']):
                    response = requests.get(url, headers=headers, stream=True,
                                            **arguments)
                last_modified = response.headers.get('last-modified')
                # Parse the timestamp as a date.
                if last_modified:
                    last_updated = datetime.strptime(
                        last_modified, '%a, %d %b %Y %H:%M:%S GMT')
                else:
                    last_updated = datetime.now()
                last_updated = last_updated.date()
                if config['last_updated'] > last_updated:
                    print('%s are more recent than the source (%s > %s)\n' %
                          (slug, config['last_updated'], last_updated))
                elif config['last_updated'] < last_updated:
                    # Determine the file extension.
                    if 'content-disposition' in response.headers:
                        filename = parse_headers(
                            response.headers['content-disposition']
                        ).filename_unsafe
                    else:
                        filename = url
                    extension = os.path.splitext(filename)[1].lower()
                    if not extension:
                        if response.headers[
                                'content-type'] == 'application/vnd.google-earth.kml+xml; charset=utf-8':
                            extension = '.kml'
                    # Set the new file's name.
                    data_file_path = os.path.join(dirname(config['file']),
                                                  'data%s' % extension)
                    # Download new file.
                    arguments['stream'] = True
                    try:
                        response = requests.get(url, headers=headers,
                                                **arguments)
                    except requests.exceptions.SSLError:
                        response = requests.get(url, headers=headers,
                                                verify=False, **arguments)
                    with open(data_file_path, 'wb') as f:
                        for chunk in response.iter_content():
                            f.write(chunk)
                    process(slug, config, url, data_file_path)
def roundtrip(filename):
    """Build a Content-Disposition header for *filename* and parse the
    filename back out of it."""
    header = build_header(filename)
    return parse_headers(header).filename_unsafe
def download(src, dst):
    """Download *src* to *dst* unless *dst* already exists.

    Downloads go to ``<dst>.tmp`` first and are renamed into place only on
    success.  The server-provided name (Content-Disposition, relaxed
    parsing, else the URL basename) is recorded in ``<dst>.filename``.
    Failures are logged and swallowed.
    """
    do_download = True
    if os.path.exists(dst):
        logger.info("- %s: local file exists (%s)", src, dst)
        do_download = False
    # NOTE: the triple-quoted block below is disabled code kept for
    # reference (mime-type based validation of an existing download).
    """
    import xdg.Mime
    path = dst
    try:
        filemime = xdg.Mime.get_type2(path)
    except AttributeError:
        filemime = xdg.Mime.get_type(path)
    filetype = str(filemime)
    logger.info("  file type: %s", filetype)
    if filetype == "application/gzip":
        cmd = ["gunzip", "--stdout", dst]
        proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL)
        res = proc.wait()
        if res == 0:
            do_download = False
    elif filetype == "application/zip":
        import zipfile
        try:
            z = zipfile.ZipFile(dst)
            l = z.infolist()
            do_download = False
        except Exception as e:
            logger.exception("NG zip file: %s->%s (%s)", src, dst, e)
            pass
    else:
        logger.error("  Unsupported file type: %s", filetype)
    """
    if do_download:
        logger.debug("Downloading %s", src)
        try:
            x, headers = urllib.request.urlretrieve(src, dst + ".tmp")
            filename = None
            if "Content-Disposition" in headers:
                disp = headers["Content-Disposition"]
                filename = rfc6266.parse_headers(
                    disp, relaxed=True).filename_unsafe
            if filename is not None:
                # Strip any path components the server may have sent.
                filename = os.path.basename(filename)
            if filename is None:
                filename = os.path.basename(src)
            # Remember the server-side name next to the download.
            with io.open(dst + ".filename", "w", encoding="utf-8") as fo:
                fo.write(filename)
            os.rename(dst + ".tmp", dst)
            logger.debug("Downloaded %s", filename)
            #logger.debug("Headers: %s", headers)
        except Exception as e:
            logger.exception("NG download: %s", src)
def shapefiles(base='.'):
    """
    Update any out-of-date shapefiles.
    """
    # Some servers refuse default Python user agents, hence the browser UA.
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'}

    def process(slug, config, url, data_file_path):
        # Convert a freshly downloaded KML/KMZ/ZIP into an ESRI Shapefile,
        # stage the results in git, and bump the definition's timestamp.
        # NOTE(review): reads `last_updated` as a closure variable set by the
        # retrieval loop below before each call — confirm call order is kept.
        # We can only process KML, KMZ and ZIP files.
        extension = os.path.splitext(data_file_path)[1]
        if extension in ('.kml', '.kmz', '.zip'):
            # Walk up from the data file until a .git directory (or /) is found.
            repo_path = os.path.dirname(data_file_path)
            while not os.path.exists(os.path.join(repo_path, '.git')) and not repo_path == '/':
                repo_path = os.path.join(repo_path, '..')
                repo_path = os.path.realpath(repo_path)
            repo = Repo(repo_path)
            index = repo.index
            directory = dirname(config['file'])

            # Remove old files.
            for basename in os.listdir(directory):
                if basename not in ('.DS_Store', 'definition.py', 'LICENSE.txt', 'data.kml', 'data.kmz', 'data.zip'):
                    os.unlink(os.path.join(directory, basename))
                    # Also unstage the removed file from the git index.
                    index.remove([os.path.relpath(os.path.join(directory, basename), repo_path)])

            files_to_add = []

            # Unzip any zip file.
            error_thrown = False
            if extension == '.zip':
                try:
                    zip_file = ZipFile(data_file_path)
                    for name in zip_file.namelist():
                        # Flatten the zip file hierarchy.
                        extension = os.path.splitext(name)[1]
                        if extension in ('.kml', '.kmz'):
                            basename = 'data%s' % extension  # assumes one KML or KMZ file per archive
                        else:
                            basename = os.path.basename(name)  # assumes no collisions across hierarchy
                        with open(os.path.join(directory, basename), 'wb') as f:
                            f.write(zip_file.read(name))
                        if extension not in ('.kml', '.kmz'):
                            files_to_add.append(os.path.join(directory, basename))
                except BadZipfile:
                    error_thrown = True
                    print('Bad ZIP file %s\n' % url)
                finally:
                    # The archive itself is never kept, good or bad.
                    os.unlink(data_file_path)

            # Unzip any KMZ file.
            kmz_file_path = os.path.join(directory, 'data.kmz')
            if not error_thrown and os.path.exists(kmz_file_path):
                try:
                    zip_file = ZipFile(kmz_file_path)
                    for name in zip_file.namelist():
                        # A KMZ file contains a single KML file and other supporting files.
                        # @see https://developers.google.com/kml/documentation/kmzarchives
                        if os.path.splitext(name)[1] == '.kml':
                            with open(os.path.join(directory, 'data.kml'), 'wb') as f:
                                f.write(zip_file.read(name))
                except BadZipfile:
                    error_thrown = True
                    print('Bad KMZ file %s\n' % url)
                finally:
                    os.unlink(kmz_file_path)

            if not error_thrown:
                # Convert any KML to shapefile.
                kml_file_path = os.path.join(directory, 'data.kml')
                if os.path.exists(kml_file_path):
                    # "3D Point" layers are noise here; ogrinfo output minus
                    # those lines should name exactly one layer.
                    result = run('ogrinfo -q %s | grep -v "3D Point"' % kml_file_path, hide='out').stdout
                    if result.count('\n') > 1:
                        print('Too many layers %s' % url)
                    else:
                        layer = re.search('\A\d+: (\S+)', result).group(1)
                        run('ogr2ogr -f "ESRI Shapefile" %s %s -nlt POLYGON %s' % (directory, kml_file_path, layer), echo=True)
                        # Pick up the generated .dbf/.prj/.shp/.shx files.
                        for name in glob(os.path.join(directory, '*.[dps][bhr][fjpx]')):
                            files_to_add.append(name)
                        os.unlink(kml_file_path)

                # Merge multiple shapefiles into one.
                names = glob(os.path.join(directory, '*.shp'))
                if len(names) > 1:
                    for name in names:
                        run('ogr2ogr -f "ESRI Shapefile" %s %s -update -append -nln Boundaries' % (directory, name), echo=True)
                        basename = os.path.splitext(os.path.basename(name))[0]
                        # Drop the per-source files now merged into Boundaries.
                        for name in glob(os.path.join(directory, '%s.[dps][bhr][fjnpx]' % basename)):
                            files_to_add.remove(name)
                            os.unlink(name)

                shp_file_path = glob(os.path.join(directory, '*.shp'))
                if shp_file_path:
                    shp_file_path = shp_file_path[0]

                if shp_file_path and os.path.exists(shp_file_path):
                    # Convert any 3D shapefile into 2D.
                    result = run('ogrinfo -q %s' % shp_file_path, hide='out').stdout
                    if result.count('\n') > 1:
                        print('Too many layers %s' % url)
                    elif re.search('3D Polygon', result):
                        run('ogr2ogr -f "ESRI Shapefile" -overwrite %s %s -nlt POLYGON' % (directory, shp_file_path), echo=True)
                        # The overwrite may have replaced files; re-check.
                        for name in list(files_to_add):
                            if not os.path.exists(name):
                                files_to_add.remove(name)

                    # Replace "Double_Stereographic" with "Oblique_Stereographic".
                    prj_file_path = os.path.splitext(shp_file_path)[0] + '.prj'
                    if prj_file_path and os.path.exists(prj_file_path):
                        with open(prj_file_path) as f:
                            prj = f.read()
                        # ogr2ogr understands "Oblique_Stereographic" but not
                        # ESRI's "Double_Stereographic" alias.
                        if 'Double_Stereographic' in prj:
                            with open(prj_file_path, 'w') as f:
                                f.write(prj.replace('Double_Stereographic', 'Oblique_Stereographic'))
                    elif config.get('prj'):
                        # No .prj produced: fetch one from the configured URL.
                        # NOTE(review): response .content is bytes but the file
                        # is opened in text mode — breaks on Python 3; confirm.
                        with open(prj_file_path, 'w') as f:
                            f.write(requests.get(config['prj']).content)
                        files_to_add.append(prj_file_path)
                    else:
                        print('No PRJ file %s' % url)

                    # Run any additional commands on the shapefile.
                    if config.get('ogr2ogr'):
                        run('ogr2ogr -f "ESRI Shapefile" -overwrite %s %s %s' % (directory, shp_file_path, config['ogr2ogr']), echo=True)
                        for name in list(files_to_add):
                            if not os.path.exists(name):
                                files_to_add.remove(name)

                # Add files to git.
                index.add([os.path.relpath(name, repo_path) for name in files_to_add])

                # Update last updated timestamp.
                definition_path = os.path.join(directory, 'definition.py')
                with open(definition_path) as f:
                    definition = f.read()
                with open(definition_path, 'w') as f:
                    # Rewrite the digits inside "last_updated=date(...)" in place.
                    f.write(re.sub('(?<=last_updated=date\()[\d, ]+', last_updated.strftime('%Y, %-m, %-d'), definition))

                # Print notes.
                notes = []
                if config.get('notes'):
                    notes.append(config['notes'])
                if notes:
                    print('%s\n%s\n' % (slug, '\n'.join(notes)))
        else:
            print('Unrecognized extension %s\n' % url)

    # Retrieve shapefiles.
    for slug, config in registry(base).items():
        if config.get('data_url'):
            url = config['data_url']
            result = urlparse(url)
            if result.scheme == 'ftp':
                # Get the last modified timestamp.
                ftp = FTP(result.hostname)
                ftp.login(result.username, result.password)
                last_modified = ftp.sendcmd('MDTM %s' % result.path)

                # Parse the timestamp as a date.
                # MDTM replies look like "213 YYYYMMDDHHMMSS"; skip the code.
                last_updated = datetime.strptime(last_modified[4:], '%Y%m%d%H%M%S').date()

                if config['last_updated'] < last_updated:
                    # Determine the file extension.
                    extension = os.path.splitext(url)[1]

                    # Set the new file's name.
                    data_file_path = os.path.join(dirname(config['file']), 'data%s' % extension)

                    # Download new file.
                    ftp.retrbinary('RETR %s' % result.path, open(data_file_path, 'wb').write)
                    ftp.quit()

                    process(slug, config, url, data_file_path)
            else:
                # Get the last modified timestamp.
                arguments = {'allow_redirects': True}
                if result.username:
                    # Strip credentials from the URL; pass them via basic auth.
                    url = '%s://%s%s' % (result.scheme, result.hostname, result.path)
                    arguments['auth'] = (result.username, result.password)
                response = requests.head(url, headers=headers, **arguments)
                if response.status_code == 405:  # if HEAD requests are not allowed
                    response = requests.get(url, headers=headers, **arguments)
                last_modified = response.headers.get('last-modified')

                # Parse the timestamp as a date.
                if last_modified:
                    last_updated = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S GMT')
                else:
                    # No Last-Modified header: treat the source as fresh now.
                    last_updated = datetime.now()
                last_updated = last_updated.date()

                if config['last_updated'] > last_updated:
                    print('%s are more recent than the source (%s > %s)\n' % (slug, config['last_updated'], last_updated))
                elif config['last_updated'] < last_updated:
                    # Determine the file extension.
                    if response.headers.get('content-disposition'):
                        filename = parse_headers(response.headers['content-disposition']).filename_unsafe
                    else:
                        filename = url
                    extension = os.path.splitext(filename)[1].lower()

                    # Set the new file's name.
                    data_file_path = os.path.join(dirname(config['file']), 'data%s' % extension)

                    # Download new file.
                    arguments['stream'] = True
                    response = requests.get(url, headers=headers, **arguments)
                    with open(data_file_path, 'wb') as f:
                        for chunk in response.iter_content():
                            f.write(chunk)

                    process(slug, config, url, data_file_path)