def process_warcs(h, name, to_dir):
    """Sort a job's WARC records into one WARC file per calendar date.

    The job's source directory is the directory containing the path stored
    under ``['job']['primaryConfig']`` in ``h.get_job_info(name)``. Every
    record yielded by ``filter_records`` for that directory is added to
    ``<to_dir>/<date>.warc``, where ``<date>`` is the part of the record's
    ``WARC-Date`` header before the ``'T'`` separator.

    Args:
        h: Object exposing ``get_job_info(name)``.
        name: Job name; also used to build the logger name ``archive-<name>``.
        to_dir: Directory that receives the per-date ``.warc`` files.

    Returns:
        None. If the job info has no ``primaryConfig`` entry, the problem is
        logged and nothing is processed. A ``KeyError`` raised while handling
        records (e.g. a record without a ``WARC-Date`` header) is logged with
        its traceback and stops processing; it is not propagated.
    """
    logger = logging.getLogger('archive-%s' % name)
    logger.info('Processing WARCs into days')

    # Query the job info once so that the failure log below shows the exact
    # response that lacked the key, rather than the result of a second call.
    job_info = h.get_job_info(name)
    try:
        from_dir = os.path.dirname(job_info['job']['primaryConfig'])
    except KeyError:
        logger.error('Couldn\'t find primaryConfig')
        logger.error(str(job_info))
        return

    # KeyErrors from the record loop are handled separately so they are not
    # misreported as a missing primaryConfig. They are still swallowed (as
    # before) so existing callers keep seeing no exception.
    try:
        for r in filter_records(from_dir):
            # A WARC-Date without a 'T' separator still raises ValueError.
            date, _ = r.headers['WARC-Date'].split('T', 1)
            # NOTE(review): the target WARC is reopened and saved for every
            # record; presumably WARC(...) loads any existing file at that
            # path - confirm before batching records per date.
            w = WARC(os.path.join(to_dir, date + ".warc"), order_by='WARC-Date')
            w.add(r)
            w.save()
    except KeyError:
        logger.exception('KeyError while processing WARC records from %s',
                         from_dir)
# Script: distribute the records of every job directory under ``jobs_dir``
# into per-date WARC files under ``warcs_dir``.
import os

import archive
from github.WARC import WARC

jobs_dir = '/cs/research/fmedia/data5/wmayor/github/heritrix-3.1.1/jobs'
warcs_dir = '/cs/research/fmedia/data5/wmayor/github/warcs'

for entry in os.listdir(jobs_dir):
    job_path = os.path.join(jobs_dir, entry)
    # Only directories are treated as jobs; plain files are skipped.
    if not os.path.isdir(job_path):
        continue

    # Progress output: the job directory about to be processed.
    # (Single-argument parenthesised form prints the same text under the
    # Python 2 print statement.)
    print(job_path)

    for record in archive.filter_records(job_path):
        # The date is the part of WARC-Date before the 'T' separator; the
        # time-of-day part is not needed.
        day, _ = record.headers['WARC-Date'].split('T', 1)
        day_warc = WARC(os.path.join(warcs_dir, day + ".warc"),
                        order_by='WARC-Date')
        day_warc.add(record)
        day_warc.save()

print('Done!')