コード例 #1
0
def run(warcs_dir, scratch_dir, db_dir, job):
    """Write a delta-encoded copy of every WARC file per patcher/strategy.

    Walks ``warcs_dir`` for files ending in ``.warc``. For each record the
    older versions are looked up with ``_find_older`` using a cursor on
    ``<db_dir>/index.db``. Then, for every combination in ``PATCHERS`` x
    ``STRATEGIES``, the record is added to a WARC located at
    ``<scratch_dir>/<patcher>@<strategy>/no_compression/<relative path>``:
    when older versions exist the strategy's output is stored and its
    record id and content length are remembered, otherwise the record is
    stored unchanged.

    After the walk, the remembered (record id, size) pairs are written to
    ``<db_dir>/<patcher>@<strategy>.db``. A size database is only created
    for combinations that produced at least one delta record.

    Args:
        warcs_dir: Root directory searched recursively for ``.warc`` files.
        scratch_dir: Root directory under which the output WARCs live.
        db_dir: Directory holding ``index.db``; also receives the size DBs.
        job: Passed through unchanged to ``_find_older``.

    Returns:
        The module-level ``NAMES`` object.
    """
    inserts = defaultdict(list)
    conn = sqlite3.connect(os.path.join(db_dir, 'index.db'))
    # try/finally so the index connection is released even when reading a
    # WARC or applying a strategy raises.
    try:
        cursor = conn.cursor()
        for root, _dirs, files in os.walk(warcs_dir):
            for f in files:
                if not f.endswith('.warc'):
                    continue
                abs_path = os.path.join(root, f)
                # The relative path depends only on the file, so compute it
                # once per file instead of once per record/patcher/strategy.
                rel_path = abs_path.replace(warcs_dir, '', 1).lstrip('/')
                for headers, content, _ in WARC(abs_path):
                    older = _find_older(headers, cursor, job)
                    for pname, patcher in PATCHERS.iteritems():
                        for sname, strategy in STRATEGIES.iteritems():
                            n = '%s@%s' % (pname, sname)
                            p = os.path.join(
                                scratch_dir, n, 'no_compression', rel_path)
                            w = WARC(p)
                            if len(older) > 0:
                                d_headers, d_content = strategy(
                                    cursor, headers, content,
                                    older, pname, patcher)
                                w.add_record(d_headers, d_content)
                                inserts[n].append((
                                    d_headers['WARC-Record-ID'],
                                    len(d_content)))
                            else:
                                # Nothing to diff against: keep the record
                                # exactly as it was read.
                                w.add_record(headers, content)
    finally:
        conn.close()
    for n, rows in inserts.iteritems():
        conn = sqlite3.connect(os.path.join(db_dir, '%s.db' % n))
        try:
            cursor = conn.cursor()
            cursor.executescript(SIZE_SCHEMA)
            cursor.executemany(INSERT_RECORD_SIZE, rows)
            conn.commit()
        finally:
            conn.close()
    return NAMES
コード例 #2
0
def test(original_path, diffed_path, index_path):
    """Check that a diffed WARC can be restored to match its original.

    The records of both files are walked in lockstep and must carry the same
    ``WARC-Record-ID``. For every diffed record whose ``WARC-Type`` is
    ``'revisit'`` the content is rebuilt with ``_test_unpatch`` and compared
    (ignoring leading/trailing whitespace) with the original content. One
    character is printed per revisit record: ``.`` on a match, ``w`` on a
    mismatch, followed by a final newline.

    Args:
        original_path: Path of the original WARC file.
        diffed_path: Path of the diffed WARC file.
        index_path: Passed through to ``_test_unpatch``.

    Raises:
        AssertionError: If paired records have different record ids.
    """
    source = WARC(original_path)
    delta = WARC(diffed_path)
    for original, diffed in izip(source.records(), delta.records()):
        original_headers = original[0]
        diffed_headers = diffed[0]
        assert (original_headers['WARC-Record-ID'] ==
                diffed_headers['WARC-Record-ID'])
        # Only revisit records need to be restored and compared.
        if diffed_headers['WARC-Type'] != 'revisit':
            continue
        restored = _test_unpatch(diffed_headers, diffed[1], index_path)
        matches = restored.strip() == original[1].strip()
        sys.stdout.write('.' if matches else 'w')
        # Flush per record so progress is visible while the run is going.
        sys.stdout.flush()
    sys.stdout.write('\n')
コード例 #3
0
ファイル: job.py プロジェクト: WilliamMayor/warc-compression
def copy_data(from_dir, to_dir):
    """Copy filtered WARC records from *from_dir* into size-limited files.

    Every ``.warc`` file found under ``from_dir`` is passed through
    ``filterer.localhost`` and ``filterer.duplicates``; the resulting
    (headers, content) pairs are added to numbered WARC files
    (``0.warc``, ``1.warc``, ...) under
    ``<to_dir>/warcs/no_delta/no_compression``.

    A new output file is started once the running total of the values
    returned by ``add_record`` exceeds 1024 ** 3 (1 GiB if ``add_record``
    returns the number of bytes written -- TODO confirm).

    Args:
        from_dir: Directory searched recursively for ``.warc`` files.
        to_dir: Root directory under which the output WARCs are created.

    Returns:
        The directory the output WARC files are written to.
    """
    warcs_dir = os.path.join(to_dir, 'warcs', 'no_delta', 'no_compression')
    count = 0
    w = WARC(os.path.join(warcs_dir, '%d.warc' % count))
    s = 0
    for root, _, files in os.walk(from_dir):
        paths = [os.path.join(root, f) for f in files if f.endswith('.warc')]
        filtered = filterer.duplicates(filterer.localhost(paths))
        for headers, content in filtered:
            s += w.add_record(headers, content)
            if s > 1024 * 1024 * 1024:
                count += 1
                w = WARC(os.path.join(warcs_dir, '%d.warc' % count))
                # Restart the tally for the new file; without this every
                # later record would exceed the limit and open its own file.
                s = 0
    return warcs_dir
コード例 #4
0
import pprint
import os

import archive

from WARC import WARC

# Output locations for the two WARC files built below.
by_date_path = os.path.join('test', 'by_date.warc')
by_digest_path = os.path.join('test', 'by_digest.warc')

# Build two archives from the same records, one constructed with
# order_by='WARC-Date' and one with order_by='WARC-Payload-Digest'.
# NOTE(review): presumably order_by controls the order in which records are
# written by save() -- confirm against the WARC class.
by_date = WARC(by_date_path, order_by='WARC-Date')
by_digest = WARC(by_digest_path, order_by='WARC-Payload-Digest')
for r in archive.filter_records('test'):
    by_date.add(r)
    by_digest.add(r)
by_date.save()
by_digest.save()

# Re-open the saved files and materialise their records as lists; from here
# on by_date / by_digest are lists of records, not WARC objects.
by_date = list(WARC(by_date_path))
by_digest = list(WARC(by_digest_path))

try:
    ids = [r.headers['WARC-Record-ID'] for r in by_date]
    assert len(ids) == 31
    expected_ids = set(['<urn:uuid:2d3ce775-3a91-4227-b256-7d1cdb174de1>', '<urn:uuid:c6ed326f-8037-45a5-b7f5-02df483a8fab>', '<urn:uuid:d9e8885a-8a36-4096-b218-6d542d6ac329>', '<urn:uuid:ea040b0c-6616-42a3-b493-4cbab6e9d274>', '<urn:uuid:b6ae5081-c16a-4840-b467-7106afdc0f93>', '<urn:uuid:c9756ab8-1eff-4ed1-a2a2-808a1b66fad9>', '<urn:uuid:1663a697-74dc-435c-8419-3fc8a9503e4b>', '<urn:uuid:447365d2-7436-452a-bade-4ab1fa842e85>', '<urn:uuid:54ddf8ed-0606-4918-99c2-c0a06e9575d4>', '<urn:uuid:bf8c9d05-696a-4dde-9e40-ef0ec84ba50f>', '<urn:uuid:0b015eff-585b-4a56-80ab-be769e72f009>', '<urn:uuid:46d97916-f9c6-43cc-a040-7cca366604a2>', '<urn:uuid:a783821e-1a6b-4b9b-910a-733733256b52>', '<urn:uuid:27491225-9b42-4751-b8be-52a6d505811c>', '<urn:uuid:f8886060-657c-4beb-a4aa-fbe974501e0a>', '<urn:uuid:b7fa14b3-c6d0-4139-bb7a-03dc069518b5>', '<urn:uuid:8454eabb-adbf-4a00-ba8e-79907d2e6189>', '<urn:uuid:e7946479-aa7a-42c7-8eca-ae2678d83e27>', '<urn:uuid:0b46cd06-dee4-4891-99e9-8523548ef325>', '<urn:uuid:7379008d-9495-407b-96e7-6c95a1be43f8>', '<urn:uuid:d015cdd7-39b1-4a48-a537-cd220b7ae14e>', '<urn:uuid:3f348f44-49e5-4398-b9eb-70cef9bdb284>', '<urn:uuid:5190ef46-8fa0-482a-ba5e-77364428051b>', '<urn:uuid:cefe8d16-3a06-4353-b21d-d546c887505a>', '<urn:uuid:1409bffd-dd90-434c-a45d-3513b10859e7>', '<urn:uuid:afb8898e-b07f-422b-962a-bd55f13a1c57>', '<urn:uuid:9b51cf15-263e-47cc-874c-257e474ad6c8>', '<urn:uuid:57c3c88b-56c1-4372-969d-f08ae8e57bdb>', '<urn:uuid:8f8b7366-a18a-452c-b428-43aa38567300>', '<urn:uuid:2ccbd7f1-458e-430e-b1c2-d88947ab5443>', '<urn:uuid:0c1f28e7-0b54-425f-a374-b10df4ade457>'])
    assert set(ids) == expected_ids

    assert by_date[0].headers['WARC-Type'].lower() == 'warcinfo'
    assert by_digest[0].headers['WARC-Type'].lower() == 'warcinfo'

    dates = [r.headers['WARC-Date'] for r in by_date]