def run(self):
    if self.outfile and not self.use_stdout:
        with self.outfile.open("w"):
            pass  # create or truncate file, but don't write anything to it yet
    with sqlite3.connect(self.mbtiles) as conn:
        results = []
        if self.show_size:
            sql = "SELECT cnt, dups.tile_id, LENGTH(tile_data) FROM (" \
                  " SELECT tile_id, COUNT(*) AS cnt FROM map"
            sql_opts = []
            if self.zoom:
                # The zoom filter has to go inside the subquery -- the outer
                # query that joins `images` has no zoom_level column to filter on.
                sql += " WHERE zoom_level=?"
                sql_opts.append(self.zoom)
            sql += " GROUP BY tile_id HAVING cnt > ?" \
                   ") dups JOIN images ON images.tile_id = dups.tile_id"
            sql_opts.append(self.min_dup_count)
        else:
            sql_opts = []
            sql = "SELECT COUNT(*) cnt, tile_id FROM map"
            if self.zoom:
                sql += " WHERE zoom_level=?"
                sql_opts.append(self.zoom)
            sql += " GROUP BY tile_id HAVING cnt > ?"
            sql_opts.append(self.min_dup_count)
        for vals in query(conn, sql, sql_opts):
            results.append(vals)
        results.sort(reverse=True)
        size = None
        examples = None
        for vals in results:
            if len(vals) == 3:
                count, tile_id, size = vals
            else:
                count, tile_id = vals
            if self.show_examples:
                example_sql = "SELECT zoom_level, tile_column, tile_row FROM map " \
                              "WHERE tile_id = ? LIMIT 5"
                examples = [f'{z}/{x}/{y}'
                            for z, x, y in query(conn, example_sql, [tile_id])]
            if self.verbose:
                res = f"{tile_id} x {count:,}"
                if self.show_size:
                    res += f', {size:,} bytes'
                if self.show_examples:
                    res += ', examples: ' + ', '.join(examples)
                print_err(res)
        results = [v[1] for v in results]
        if self.use_stdout:
            for v in results:
                print(v)
        elif self.outfile:
            with self.outfile.open("a") as f:
                f.writelines([str(v) + '\n' for v in results])
        return results
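# `query` is used above but is not defined in this section. A minimal sketch of
# what it is assumed to be (the actual helper may differ): a generator that
# executes a parameterized statement and yields result rows, so callers can
# iterate without managing cursors themselves.
def query(conn, sql, params):
    cursor = conn.cursor()
    try:
        cursor.execute(sql, params)
        for row in cursor:
            yield row
    finally:
        cursor.close()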
def get_value(self, name):
    with sqlite3.connect(self.mbtiles) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT value FROM metadata WHERE name=?", [name])
        row = cursor.fetchone()
        if row is None:
            print_err(f"Metadata field '{name}' is not found")
            exit(1)
        print(row[0])
async def load_hash(self, session: ClientSession, verbose: bool):
    if not self.url_hash:
        return
    try:
        if verbose:
            print(f"Getting md5 checksum from {self.url_hash}")
        hsh = (await fetch(session, self.url_hash)).strip().split(' ')[0]
        if not re.match(r'^[a-fA-F0-9]{32}$', hsh):
            raise ValueError(f"Invalid md5 hash '{hsh}'")
        self.hash = hsh
    except Exception as ex:
        print_err(f"Unable to load md5 hash for {self.to_str(True)}: {ex}")
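# `fetch` is referenced above but not shown here. A plausible sketch, assuming
# it performs a GET request and returns the response body as text, raising on
# HTTP error statuses (the real helper may differ):
async def fetch(session, url):
    async with session.get(url) as resp:
        resp.raise_for_status()
        return await resp.text()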
async def load_metadata(self, session: ClientSession, verbose: bool):
    if not self.url:
        return
    try:
        if verbose:
            print(f"Getting content length for {self.url}")
        async with session.head(self.url) as resp:
            if resp.status >= 400:
                raise ValueError(f"Status={resp.status} for HEAD request")
            if 'Content-Length' in resp.headers:
                self.file_len = int(resp.headers['Content-Length'])
    except Exception as ex:
        print_err(f"Unable to load metadata for {self}: {ex}")
async def init(self, session: ClientSession, verbose: bool):
    """Initialize self.sources with the relevant Source objects by parsing
    the mirror's HTML page and getting all <a> tags."""
    try:
        sources = await self.get_sources(session, verbose)
        if not sources:
            raise ValueError("No sources found")
        await load_sources(sources, session, verbose)
        if len(sources) > 1 and sources[0].hash == sources[1].hash:
            del sources[0]  # latest is the same as the last one
        self.sources = sources
    except Exception as ex:
        print_err(f"Unable to use {self.country} source {self.url}: {ex}")
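# `load_sources` is referenced above but defined elsewhere. A hypothetical
# sketch, assuming it simply loads the metadata and md5 hashes of all sources
# concurrently (the real helper may batch or throttle requests differently):
import asyncio

async def load_sources(sources, session, verbose):
    await asyncio.gather(*(s.load_metadata(session, verbose) for s in sources))
    await asyncio.gather(*(s.load_hash(session, verbose) for s in sources))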
def parse_hrefs(self, items: List[tuple], verbose) -> List['Source']:
    """Convert a list of (name, href) tuples to a list of valid sources,
    including only the two most recent ones, plus the 'latest' if available."""
    all_sources: Dict[str, Source] = {}
    for name, href in sorted(items):
        m = self.re_name.match(name)
        if not m:
            if verbose:
                print(f"Ignoring unexpected name '{name}' from {self.url}")
            continue
        try:
            url = href if '/' in href else (self.url + href)
            date = m.group(1)
            is_md5 = bool(m.group(2))
            dt = None if date == 'latest' else datetime.strptime(date, '%y%m%d')
            if not is_md5:
                if date in all_sources:
                    raise ValueError(f"{date} already exists")
                all_sources[date] = Source(name, url, dt, self)
            else:
                if date not in all_sources:
                    raise ValueError("md5 file exists, but data file does not")
                all_sources[date].url_hash = url
        except Exception as ex:
            print_err(f'WARN: {ex}, while parsing {name} from {self.url}')
    # get the last 2 sources that have dates in the name, as well as the "latest"
    latest = all_sources.pop('latest', None)
    result = [all_sources[k]
              for k in list(sorted(all_sources.keys(), reverse=True))[:2]]
    if latest:
        result.insert(0, latest)
    return result
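# `self.re_name` is set elsewhere. Judging by how its groups are used above, it
# is assumed to capture the date (or 'latest') in group 1 and an optional .md5
# suffix in group 2. A hypothetical example of such a pattern, matching names
# like "planet-210104.osm.pbf" and "planet-210104.osm.pbf.md5":
#
#     re_name = re.compile(r'^planet-(\d{6}|latest)\.osm\.pbf(\.md5)?$')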
def run(self):
    with sqlite3.connect(self.mbtiles) as conn:
        limit_to_keys = not self.outfile
        if self.outfile and not self.use_stdout:
            with self.outfile.open("w"):
                pass  # create or truncate file, but don't write anything to it yet
        keyed_tiles = 0
        nokey_tiles = 0
        cursor = conn.cursor()
        key_stats = self.keys
        for with_key, without_key in self.tile_batches(conn, limit_to_keys):
            without_key.sort()
            if with_key:
                with_key.sort()
                for val in with_key:
                    key_stats[val[3]] += 1
                cursor.executemany(
                    'INSERT OR IGNORE INTO map'
                    '(zoom_level, tile_column, tile_row, tile_id)'
                    ' VALUES(?,?,?,?)', with_key)
                keyed_tiles += cursor.rowcount
                conn.commit()
            if without_key:
                if self.use_stdout:
                    for v in without_key:
                        print(v, end='')
                else:
                    with self.outfile.open("a") as f:
                        f.writelines(without_key)
                nokey_tiles += len(without_key)
        if self.verbose:
            for k, c in key_stats.items():
                print_err(f"{k} - added {c:,}")
            print_err(f'Total imputed tiles: {keyed_tiles:,}')
            if nokey_tiles:
                print_err(f'Total tiles that still need to be generated: {nokey_tiles:,}')
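# `self.tile_batches` is defined elsewhere. Based on its use above, it is
# assumed to yield (with_key, without_key) pairs per batch, where with_key is a
# list of (zoom_level, tile_column, tile_row, tile_id) tuples ready for the
# INSERT statement, and without_key is a list of newline-terminated strings
# describing tiles that still have to be generated (which is why they are
# emitted with print(v, end='') and f.writelines() without adding separators).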
async def run_aria2c(aria2c_args, dry_run, md5, urls, args, area_id):
    params = ['aria2c']
    if md5:
        params.append(f'--checksum=md5={md5}')
    if len(urls) > 1 and not any(
            v for v in aria2c_args if v == '-s' or v.startswith('--split')):
        # user has not passed -s or --split, so use as many streams as urls
        params.append(f'--split={len(urls)}')
    if not any(v for v in aria2c_args if v.startswith('--http-accept-gzip')):
        # user has not passed --http-accept-gzip, so always specify we accept gzip
        params.append('--http-accept-gzip')
    if not any(v for v in aria2c_args if v == '-U' or v.startswith('--user-agent')):
        # user has not set a custom user agent, set one
        params.append(f'--user-agent={USER_AGENT}')
    if args.output:
        assert_conflict_args(
            "--output", aria2c_args, '-d', '--dir', '-o', '--out',
            '-i', '--input-file', '--auto-file-renaming',
            '-Z', '--force-sequential', '--allow-overwrite')
        out_path = Path(args.output).resolve()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        params.append(f'--dir={out_path.parent}')
        params.append(f'--out={out_path.name}')
        params.append('--auto-file-renaming=false')
        if args.force:
            params.append('--allow-overwrite=true')
    extra_env = None
    if args['--make-dc']:
        assert_conflict_args("--make-dc", aria2c_args, '--on-download-complete')
        area_id, min_zoom, max_zoom, dc_ver = normalize_make_dc(
            area_id, args['--minzoom'], args['--maxzoom'], args['--dc-ver'])
        extra_env = {
            "DOWNLOAD_OSM_DC_FILE": str(Path(args['--make-dc'])),
            "OSM_AREA_NAME": str(area_id),
            "MIN_ZOOM": str(min_zoom),
            "MAX_ZOOM": str(max_zoom),
            "MAKE_DC_VERSION": str(dc_ver),
        }
        params.append("--on-download-complete")
        params.append(__file__)
    params.extend(aria2c_args)
    params.extend(urls)
    print(f"\n {subprocess.list2cmdline(params)}")
    if args.verbose and extra_env:
        env_str = ', '.join(f'{k}={v}' for k, v in extra_env.items())
        print(f" Setting environment vars: {env_str}")
    capture_output = False
    for flag in ('--on-bt-download-complete', '--on-download-pause',
                 '--on-download-complete', '--on-download-start',
                 '--on-download-error', '--on-download-stop'):
        if any(v for v in params if v.startswith(flag)):
            capture_output = True
            break
    if args.verbose:
        if capture_output:
            print(" capturing stdout/stderr to wait for subprocess exit")
        else:
            print(" aria2c output will be printed directly to terminal")
    # Make sure to print/flush everything to STDOUT before running subprocess
    print("", flush=True)
    if not dry_run:
        # Use capture_output to ensure that callback finishes before run() returns.
        # This is only needed if any callbacks are used.
        if extra_env:
            env = os.environ.copy()
            env.update(extra_env)
        else:
            env = None
        res = subprocess.run(params, env=env, capture_output=capture_output)
        ret = res.returncode
        if capture_output:
            stdout = res.stdout.decode('utf-8')
            if stdout:
                print(stdout)
            stderr = res.stderr.decode('utf-8')
            if stderr:
                print_err(stderr)
            # Callbacks do not report errors, so detect it from the output
            if ret == 0 and stderr and "Traceback (most recent call last)" in stderr:
                ret = 1
        return ret
    else:
        print("Data is not downloaded because of the --dry-run parameter")
        if args['--make-dc']:
            print("docker-compose file generation was skipped")
        return 0
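# For illustration, a hypothetical command line produced by the code above for
# two mirror URLs and a known md5, with no extra user arguments (actual values
# depend on the arguments passed in):
#
#   aria2c --checksum=md5=<hash> --split=2 --http-accept-gzip \
#       --user-agent=<USER_AGENT> <url1> <url2>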
def print_message(msg: asyncpg.PostgresLogMessage):
    try:
        # noinspection PyUnresolvedReferences
        print_err(f' {msg.severity}: {msg.message} @ {msg.context}')
    except AttributeError:
        print_err(f' {msg}')
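# `print_err` is used throughout this section but not defined in it. A minimal
# sketch of the assumed behavior -- print to stderr and flush immediately so
# diagnostics are not interleaved with the data written to stdout:
import sys

def print_err(*args, **kwargs):
    print(*args, file=sys.stderr, flush=True, **kwargs)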