def _download(url, dirpath):
    chunk_size = 10 * 1024 * 1024
    downloaded = 0
    path = os.path.join(dirpath, os.path.basename(url))

    if pool:
        # check in pool
        poolpath = pool.get_by_basename(os.path.basename(url))
        if poolpath:
            # found in pool!
            shutil.copyfile(poolpath, path)
            print("Take from pool", poolpath)
            return path

    r = requests.get(url, stream=True)
    assert r.status_code == 200

    with open(path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
                downloaded += len(chunk)  # count actual bytes written, not chunk_size
                print("... downloaded {}".format(kmgt(downloaded)))

    print("Downloaded {} to {}".format(kmgt(os.stat(path).st_size), path))

    if pool:
        print("Put to pool")
        pool.append(path)

    return path
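
# _download() above only needs two things from the module-level pool object:
# get_by_basename() to look up a previously cached download, and append() to
# store a fresh one. The real pool comes from hashget; the class below is only
# a hedged, directory-backed sketch of that interface (an assumption, not the
# hashget implementation), useful when reading this section in isolation.
class _DirPoolSketch:
    def __init__(self, path):
        self.path = path  # directory holding cached files, keyed by basename

    def get_by_basename(self, basename):
        # return the cached file path if present, otherwise None
        candidate = os.path.join(self.path, basename)
        return candidate if os.path.isfile(candidate) else None

    def append(self, filepath):
        # copy a freshly downloaded file into the pool directory
        shutil.copyfile(filepath,
                        os.path.join(self.path, os.path.basename(filepath)))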
def _table():
    columns = ["File", "Unpacked", "Packed", "Ratio (%)"]
    fmt_data = "{:<30} {:<15} {:<15} {:<15.3f}"
    fmt_title = "{:<30} {:<15} {:<15} {:<15}"

    print()
    print(fmt_title.format(*columns))
    print(fmt_title.format(*(["---"] * len(columns))))

    for data in results:
        print(fmt_data.format(
            os.path.basename(data['url']),
            kmgt(data['unpacked']),
            kmgt(data['packed']),
            data['packed'] * 100.0 / data['unpacked']))
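
# _download() and _table() rely on kmgt() for human-readable sizes, and the
# tests below rely on du() for on-disk directory usage; both are imported from
# hashget utilities in the real module. The sketches below only illustrate the
# assumed behaviour (they are not the upstream implementations).
def _kmgt_sketch(sz, frac=1):
    # format a byte count as K/M/G/T with `frac` decimal places
    for limit, suffix in ((1024 ** 4, 'T'), (1024 ** 3, 'G'),
                          (1024 ** 2, 'M'), (1024, 'K')):
        if sz >= limit:
            return "{:.{prec}f}{}".format(sz / limit, suffix, prec=frac)
    return str(sz)


def _du_sketch(path):
    # total size of regular (non-symlink) files under path
    total = 0
    for dirpath, _subdirs, files in os.walk(path):
        for name in files:
            fp = os.path.join(dirpath, name)
            if not os.path.islink(fp):
                total += os.stat(fp).st_size
    return total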
def test_kernel(tmpdir):
    unpacked = os.path.join(tmpdir, 'unpacked')
    hgfile = os.path.join(tmpdir, 'test_compress.tar.gz')

    if hp_kernel:
        print("use hp_kernel")
        hashdb.submit_save(hp_kernel, project=project)

    os.mkdir(unpacked)
    dstfile = _download(urls['kernel'], tmpdir)

    print("unpack")
    cp = subprocess.run(['tar', '-xf', dstfile, '-C', unpacked])
    assert cp.returncode == 0

    dusz = du(unpacked)

    print("pack")
    hashget.operations.pack(
        hashdb=hashdb, root=unpacked, file=hgfile, zip=True,
        exclude=None, skip=None, anchors=None, heuristics=None,
        pool=pool, pull=False, project=project)

    hgsz = os.stat(hgfile).st_size
    ratio = hgsz * 100.0 / dusz
    print("Compressed {} to {} {:.2f}%".format(kmgt(dusz), kmgt(hgsz), ratio))
    assert ratio < 1

    results.append(dict(url=urls['kernel'], unpacked=dusz, packed=hgsz))
def test_wordpress(tmpdir):
    unpacked = os.path.join(tmpdir, 'unpacked')
    hgfile = os.path.join(tmpdir, 'test_compress.tar.gz')

    os.mkdir(unpacked)
    dstfile = _download(urls['wordpress'], tmpdir)

    cp = subprocess.run(['unzip', '-q', dstfile, '-d', unpacked])
    assert cp.returncode == 0

    dusz = du(unpacked)

    # write hint
    hint = dict(url=urls['wordpress'], project=project)
    hintfile = os.path.join(unpacked, '.hashget-hint.json')
    with open(hintfile, 'w') as outfile:
        json.dump(hint, outfile)

    hashget.operations.pack(
        hashdb=hashdb, root=unpacked, file=hgfile, zip=True,
        exclude=None, skip=None, anchors=None, heuristics=None,
        pool=pool, pull=False)

    hgsz = os.stat(hgfile).st_size
    ratio = hgsz * 100.0 / dusz
    assert ratio < 1
    print("Compressed {} to {} {:.2f}%".format(kmgt(dusz), kmgt(hgsz), ratio))

    results.append(dict(url=urls['wordpress'], unpacked=dusz, packed=hgsz))
def info(rfilepath, root=None, subcommand='info', pool=None):
    rfile = hashget.restorefile.RestoreFile(
        os.path.join(root, '.hashget-restore.json'))

    np_total = 0
    np_down = 0
    np_pool = 0
    pool_bytes = 0

    if subcommand == 'info':
        if rfile.expired():
            log.warning('WARNING: Restoring from expired ({}) archive.'.format(
                rfile.get_field('expires')))

        for pdata in rfile.packages_iter():
            np_total += 1
            poolfile = pool.get(pdata['hash'], name=pdata['url'], default=None)
            if poolfile:
                np_pool += 1
                pool_bytes += os.stat(poolfile).st_size
            else:
                np_down += 1

        print("Total: {} packages ({})\n"
              "In pool: {} packages ({})\n"
              "Download: {} packages\n".format(
                  np_total, kmgt(rfile.package_size),
                  np_pool, kmgt(pool_bytes), np_down))

    elif subcommand == 'list':
        for pdata in rfile.packages_iter():
            np_total += 1
            if not pool.get(pdata['hash'], name=pdata['url'], default=None):
                print(pdata['url'])
def index(hashdb, root, anchors=None, filesz=None, heuristics=None,
          pool=None, pull=False, project=None):

    if heuristics is None:
        heuristics = ['all']

    filesz = filesz or 10 * 1024
    SubmitRequest.filesz = filesz

    anchors = anchors or hashget.anchor.AnchorList()
    heur = HeuristicSet(hashdb=hashdb, heuristics=heuristics)

    c = Counters(['total', 'local', 'pulled', 'new', 'skipped'])

    started = time.time()

    for dir, subdirs, files in os.walk(root):
        for basename in files:
            filename = os.path.join(dir, basename)
            if os.path.islink(filename) or not os.path.isfile(filename):
                continue

            f = hashget.file.File(filename)
            anchors.check_append(f)

            srlist = heur.process(filename)
            for sr in srlist:
                c.inc('total')

                if sr.sig_present():
                    log.debug("local {}".format(sr.first_sig()[1]))
                    c.inc('local')
                    continue

                if sr.pull_sig():
                    log.info("pulled {}".format(sr.first_sig()[1]))
                    c.inc('pulled')
                    continue

                if sr.url:
                    log.info("submitting {}".format(sr.url))
                    sr.submit(pool=pool, project=project)
                    c.inc('new')
                else:
                    log.info("skipped {}".format(sr.first_sig()[1]))
                    c.inc('skipped')

    if pull:
        log.debug('Try pulling {} anchors'.format(len(anchors)))
        for a in anchors.anchorlist:
            pullanchor = hashdb.pull_anchor(a.get_hashspec())
            log.debug('pull anchor for {} {}: {}'.format(
                a.filename, kmgt(a.size), pullanchor))

    log.info(
        'Indexing done in {:.2f}s. {} local + {} pulled + {} new + {} skipped = {} total packages'
        .format(time.time() - started, c.local, c.pulled, c.new, c.skipped, c.total))
    print(c)

    return c
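
# index() above uses a small Counters utility (constructed from a list of
# counter names, incremented with inc(), read back as attributes and printed).
# The class below is only a hedged sketch of that interface, written from how
# index() uses it -- an assumption, not the upstream hashget class.
class _CountersSketch:
    def __init__(self, names):
        self._names = list(names)
        for name in self._names:
            setattr(self, name, 0)

    def inc(self, name, delta=1):
        setattr(self, name, getattr(self, name) + delta)

    def __str__(self):
        return ' '.join('{}={}'.format(n, getattr(self, n)) for n in self._names)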