Example #1
def _download(url, dirpath):
    chunk_size = 10*1024*1024
    downloaded = 0
    path = os.path.join(dirpath, os.path.basename(url))

    if pool:
        # check in pool
        poolpath = pool.get_by_basename(os.path.basename(url))
        if poolpath:
            # found in pool!
            shutil.copyfile(poolpath, path)
            print("Take from pool", poolpath)
            return path


    r = requests.get(url, stream=True)
    assert r.status_code == 200

    with open(path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
                downloaded += len(chunk)  # count actual bytes; the last chunk may be shorter
                print("... downloaded {}".format(kmgt(downloaded)))

    print("Downloaded {} to {}".format(kmgt(os.stat(path).st_size), path))

    if pool:
        print("Put to pool")
        pool.append(path)

    return path
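
These snippets rely on module-level names (kmgt, pool, urls, results, hashdb, project) defined elsewhere in the test module. The sketch below is illustrative only: the placeholder URL and the minimal kmgt() byte formatter are assumptions, not the project's own definitions, and it simply shows how _download() could be exercised without a file pool.

# Illustrative sketch; kmgt(), pool and urls here are assumptions.
import os
import shutil
import requests

pool = None  # no local file pool: _download() falls through to the HTTP branch
urls = {'kernel': 'https://example.org/linux.tar.xz'}  # placeholder URL

def kmgt(size):
    # hypothetical helper: format a byte count with a K/M/G/T suffix
    for suffix, factor in (('T', 1024 ** 4), ('G', 1024 ** 3), ('M', 1024 ** 2), ('K', 1024)):
        if size >= factor:
            return '{:.2f}{}'.format(size / factor, suffix)
    return str(size)

# path = _download(urls['kernel'], '/tmp')  # would stream the file to /tmp in 10 MiB chunks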
Example #2
def _table():

    columns = [ "File", "Unpacked", "Packed", "Ratio" ]
    fmt_data  = "{:<30} {:<15} {:<15} {:<15.3f}"
    fmt_title = "{:<30} {:<15} {:<15} {:<15}"
    print()
    print(fmt_title.format(*columns))
    print(fmt_title.format( *(["---"] * len(columns)) ))
    for data in results:
        print(fmt_data.format(
            os.path.basename(data['url']),
            kmgt(data['unpacked']),
            kmgt(data['packed']),
            data['packed'] * 100.0 / data['unpacked']
            ))
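
_table() only reads the module-level results list that the test functions below append to. A minimal usage sketch with made-up entries (the URLs and sizes are placeholders, and it reuses the kmgt() sketch above):

# Placeholder data; in the real module the test functions append these entries.
results = [
    {'url': 'https://example.org/linux.tar.xz', 'unpacked': 905969664, 'packed': 4718592},
    {'url': 'https://example.org/wordpress.zip', 'unpacked': 47185920, 'packed': 524288},
]
_table()  # prints one row per entry: file name, unpacked size, packed size, packed as a percentage of unpacked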
Example #3
def test_kernel(tmpdir):
    unpacked = os.path.join(tmpdir, 'unpacked')
    hgfile = os.path.join(tmpdir, 'test_compress.tar.gz')


    if hp_kernel:
        print("use hp_kernel")
        hashdb.submit_save(hp_kernel, project=project)

    os.mkdir(unpacked)

    dstfile = _download(urls['kernel'], tmpdir)

    print("unpack")
    cp = subprocess.run(['tar', '-xf', dstfile, '-C', unpacked])
    assert cp.returncode == 0
    dusz = du(unpacked)

    print("pack")
    hashget.operations.pack(
        hashdb=hashdb,
        root=unpacked,
        file=hgfile,
        zip=True,
        exclude=None,
        skip=None,
        anchors=None,
        heuristics=None,
        pool=pool,
        pull=False,
        project=project)

    hgsz = os.stat(hgfile).st_size
    ratio = hgsz * 100.0 / dusz
    print("Compressed {} to {} {:.2f}%".format(kmgt(dusz), kmgt(hgsz), ratio))
    assert ratio < 1


    results.append(dict(
        url=urls['kernel'],
        unpacked=dusz,
        packed=hgsz
    ))
Example #4
def test_wordpress(tmpdir):
    unpacked = os.path.join(tmpdir, 'unpacked')
    hgfile = os.path.join(tmpdir, 'test_compress.tar.gz')

    os.mkdir(unpacked)
    dstfile = _download(urls['wordpress'], tmpdir)
    cp = subprocess.run(['unzip', '-q', dstfile, '-d', unpacked])
    assert cp.returncode == 0
    dusz = du(unpacked)

    # write hint
    hint = dict(url=urls['wordpress'], project=project)
    hintfile = os.path.join(unpacked, '.hashget-hint.json')
    with open(hintfile, 'w') as outfile:
        json.dump(hint, outfile)

    hashget.operations.pack(
        hashdb=hashdb,
        root=unpacked,
        file=hgfile,
        zip=True,
        exclude=None,
        skip=None,
        anchors=None,
        heuristics=None,
        pool=pool,
        pull=False)

    hgsz = os.stat(hgfile).st_size
    ratio = hgsz * 100.0 / dusz
    assert ratio < 1
    print("Compressed {} to {} {:.2f}%".format(kmgt(dusz), kmgt(hgsz), ratio))


    results.append(dict(
        url=urls['wordpress'],
        unpacked=dusz,
        packed=hgsz
    ))
Example #5
def info(rfilepath, root=None, subcommand='info', pool=None):

    rfile = hashget.restorefile.RestoreFile(
        os.path.join(root, '.hashget-restore.json'))

    np_total = 0
    np_down = 0
    np_pool = 0

    pool_bytes = 0

    if subcommand == 'info':

        if rfile.expired():
            log.warning('WARNING: Restoring from expired ({}) archive.'.format(
                rfile.get_field('expires')))

        for pdata in rfile.packages_iter():
            np_total += 1
            poolfile = pool.get(pdata['hash'], name=pdata['url'], default=None)
            if poolfile:
                np_pool += 1
                pool_bytes += os.stat(poolfile).st_size
            else:
                np_down += 1

        print("Total: {} packages ({})\n"
              "In pool: {} packages ({})\n"
              "Download: {} packages\n".format(np_total,
                                               kmgt(rfile.package_size),
                                               np_pool, kmgt(pool_bytes),
                                               np_down))

    elif subcommand == 'list':
        for pdata in rfile.packages_iter():
            np_total += 1
            if not pool.get(pdata['hash'], name=pdata['url'], default=None):
                print(pdata['url'])
Example #6
def index(hashdb,
          root,
          anchors=None,
          filesz=None,
          heuristics=None,
          pool=None,
          pull=False,
          project=None):

    if heuristics is None:
        heuristics = ['all']

    filesz = filesz or 10 * 1024
    SubmitRequest.filesz = filesz

    anchors = anchors or hashget.anchor.AnchorList()

    heur = HeuristicSet(hashdb=hashdb, heuristics=heuristics)

    c = Counters(['total', 'local', 'pulled', 'new', 'skipped'])

    started = time.time()

    for dirpath, subdirs, files in os.walk(root):
        for basename in files:
            filename = os.path.join(dirpath, basename)

            if os.path.islink(filename) or not os.path.isfile(filename):
                continue

            f = hashget.file.File(filename)
            anchors.check_append(f)

            srlist = heur.process(filename)

            for sr in srlist:

                c.inc('total')

                if sr.sig_present():
                    log.debug("local {}".format(sr.first_sig()[1]))
                    c.inc('local')
                    continue

                if sr.pull_sig():
                    log.info("pulled {}".format(sr.first_sig()[1]))
                    c.inc('pulled')
                    continue

                if sr.url:
                    log.info("submitting {}".format(sr.url))
                    sr.submit(pool=pool, project=project)
                    c.inc('new')
                else:
                    log.info("skipped {}".format(sr.first_sig()[1]))
                    c.inc('skipped')

    if pull:
        log.debug('Try pulling {} anchors'.format(len(anchors)))
        for a in anchors.anchorlist:
            pullanchor = hashdb.pull_anchor(a.get_hashspec())
            log.debug('pull anchor for {} {}: {}'.format(
                a.filename, kmgt(a.size), pullanchor))

    log.info(
        'Indexing done in {:.2f}s. {} local + {} pulled + {} new + {} skipped = {} total packages'
        .format(time.time() - started, c.local, c.pulled, c.new, c.skipped,
                c.total))
    print(c)
    return c