Example #1
    def __init__(self, jobs, jobname=None, warcs=None, viral=None, logs=None,
                 start_date=None, dummy_run=False, hash_cache_file=None,
                 client=None):
        """Sets up fields."""
        # Default the job name at call time; a strftime() call in the
        # signature would be evaluated only once, at import time:
        if jobname is None:
            jobname = datetime.now().strftime("%Y%m%d%H%M%S")
        if client is None:
            self.client = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                              user=cfg.get('hdfs', 'user'))
        else:
            self.client = client
        self.dummy = dummy_run
        if self.dummy:
            logger.info("This is a dummy-run - no real ARKs will be minted.")
        self.overwrite = False
        self.jobs = jobs
        self.jobname = jobname
        self.warcs = warcs
        self.viral = viral
        self.logs = logs
        self.hash_cache = {}
        self.parse_hash_cache(hash_cache_file)
        self.startdate = start_date
        self.BAGIT_CONTACT_NAME = "Andrew N. Jackson"
        self.BAGIT_CONTACT_EMAIL = "*****@*****.**"
        self.BAGIT_DESCRIPTION = "LD Crawl: "
        self.ARK_URL = "http://pii.ad.bl.uk/pii/vdc?arks="
        self.ARK_PREFIX = "ark:/81055/vdc_100022535899.0x"
        # And create:
        logger.info("Processing job files...")
        self.processJobs()
        logger.info("Generating METS...")
        self.createMets()
Example #2
def get_all_identifiers(sip):
    """Parses the SIP in HDFS and retrieves FILE/ARK tuples."""
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    status = client.status(tar, strict=False)
    if status:
        # Catch empty packages:
        if status['length'] == 0:
            logger.warning("Empty (zero byte) SIP package: %s" % tar)
            yield None
        else:
            with client.read(tar) as reader:
                t = reader.read()
                # Open the package; client.read() returns raw bytes, so wrap
                # them in io.BytesIO (StringIO only worked here on Python 2):
                tar = tarfile.open(mode="r:gz", fileobj=BytesIO(t))
                foundMets = False
                for i in tar.getmembers():
                    logger.debug("Examining %s" % i.name)
                    if i.name.endswith(".xml"):
                        foundMets = True
                        xml = tar.extractfile(i).read()
                        try:
                            tree = etree.fromstring(xml)
                            files = {}
                            n_files = 0
                            for mfile in tree.xpath("//mets:file", namespaces=NS):
                                #logger.debug("Found mets:file = %s " % etree.tostring(mfile))
                                admid = mfile.attrib["ADMID"]
                                logger.info("Found mets:file admid = %s " % admid)
                                path = mfile.xpath("mets:FLocat", namespaces=NS)[0].attrib["%shref" % XLINK]
                                files[admid] = { "path": path, "mimetype": mfile.attrib["MIMETYPE"], "size": mfile.attrib["SIZE"],
                                        "checksum_type": mfile.attrib["CHECKSUMTYPE"], "checksum": mfile.attrib["CHECKSUM"] }
                                n_files = n_files + 1
                            if len(files) != n_files:
                                logger.error("ERROR, more files than IDs")
                            n_amdsecs = 0
                            for amdsec in tree.xpath("//mets:amdSec", namespaces=NS):
                                #logger.debug("Found mets:amdSec = %s " % etree.tostring(amdsec))
                                admid = amdsec.attrib["ID"]
                                logger.info("Found mets:amdSec id = %s " % admid)
                                oiv = amdsec.xpath("mets:digiprovMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue", namespaces=NS)
                                if oiv and len(oiv) == 1:
                                    files[admid]['ark'] = oiv[0].text
                                    n_amdsecs = n_amdsecs + 1
                                    logger.debug("Yielding %s" % files[admid] )
                                    yield files[admid]
                                else:
                                    logger.info("Skipping amdSec ID=%s" % admid)
                            if n_files != n_amdsecs:
                                logger.error("ERROR finding all amdSec elements")
                        except IndexError as e:
                            logger.error("Problem parsing METS for SIP: %s" % sip)
                            logger.exception(e)
                if not foundMets:
                    logger.error("No METS XML file found!")
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
Example #3
def uri_of_doc(self, **kwargs):
    try:
        logger.info("Got doc to send to W3ACT for: %s" % kwargs)

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'),
                  cfg.get('act', 'password'))
        # And post this document up:
        send_document_to_w3act(kwargs, cfg.get('wayback', 'endpoint'), w)

    except BaseException as e:
        logger.exception(e)
        raise self.retry(countdown=10, exc=e)
Example #4
def calculateHash(path):
    logger.info("Starting to generate hash for %s" % path)
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                 user=cfg.get('hdfs', 'user'))
    sha = hashlib.sha512()
    with client.read(path) as reader:
        while True:
            # Read in 10 MiB chunks so large WARCs are not loaded whole:
            data = reader.read(10485760)
            if not data:
                break
            sha.update(data)
    logger.info("Finished generating hash for %s" % path)
    return sha.hexdigest()
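A quick usage sketch; the WARC path is hypothetical, and the 10 MiB read loop above means even multi-gigabyte files are hashed in constant memory:

# Hash one (hypothetical) WARC stored under the crawler's HDFS output root:
digest = calculateHash("/heritrix/output/warcs/frequent/example.warc.gz")
print(digest)  # 128 hex characters: the SHA-512 digest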
Example #5
def find_identifiers(output_file):
    with open(output_file, 'w') as f:
        client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
        for (path, dirs, files) in client.walk(SIP_ROOT):
            logger.info("Looking at path " + path)
            for file_name in files:
                logger.info("Looking at file " + file_name)
                if file_name.endswith('.tar.gz'):
                    # Strip the SIP_ROOT prefix and the '.tar.gz' suffix:
                    sip = "%s/%s" % (path, file_name)
                    sip = sip[len(SIP_ROOT) + 1:]
                    sip = sip[:-7]
                    logger.info("Scanning %s..." % sip)
                    for waid in get_all_identifiers(sip):
                        f.write("%s %s\n" % (sip, waid))
Example #6
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Restarts the job for a particular frequency.
    """
    try:
        # Default 'start' at call time; a datetime.utcnow() default argument
        # would be evaluated only once, at import time:
        if start is None:
            start = datetime.utcnow()
        logger.info("Stopping/starting %s at %s" % (frequency, start))

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'),
                  cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3', 'host'),
                                           cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'),
                        password=cfg.get('h3', 'password'))

        # Stop job if currently running:
        # Stop the running job, notify RabbitMQ and clean up the directory:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "STOPPED")

            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" % (frequency, launch_id))
            assemble_job_output.delay(frequency, launch_id)
        else:
            job = None

        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            # logger.info("Found %s Targets in export." % len(export))
            #    targets = [t for t in export if (t["startDate"] is None or t["startDate"] < start) and (t["endDateISO"] is None or t["crawlEndDateISO"] > start)]
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))
        else:
            if job:
                logger.info("Stopped job %s/%s without restarting..." % (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        raise self.retry(countdown=10, exc=e)
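Since this function takes self and calls self.retry(), it is evidently a bound Celery task. A hedged sketch of how it might be declared and queued (the decorator placement is an assumption; app comes from crawl.celery as in the module examples below):

# Assumed task declaration:
# @app.task(bind=True)
# def stop_start_job(self, frequency, start=None, restart=True): ...

stop_start_job.delay("daily")                  # stop and relaunch the daily crawl
stop_start_job.delay("weekly", restart=False)  # stop the weekly crawl only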
Example #7
    def __init__(self,
                 date,
                 warcs,
                 viral,
                 logs,
                 identifiers,
                 hash_cache=None,
                 client=None):
        if client is None:
            self.client = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                              user=cfg.get('hdfs', 'user'))
        else:
            self.client = client
        self.warcs = []
        self.viral = []
        self.date = date
        self.wq = Queue()
        self.vq = Queue()
        self.hash_cache = hash_cache

        for i in range(NUM_THREADS):
            worker = Thread(target=create_warcs,
                            args=(self.wq, self.warcs, self))
            worker.daemon = True  # daemon threads won't block interpreter exit
            worker.start()

        for warc in warcs:
            self.wq.put(warc)
        self.wq.join()

        for i in range(NUM_THREADS):
            worker = Thread(target=create_warcs,
                            args=(self.vq, self.viral, self))
            worker.daemon = True  # daemon threads won't block interpreter exit
            worker.start()

        for warc in viral:
            self.vq.put(warc)
        self.vq.join()

        self.logs = []
        for log in logs:
            self.logs.append(ZipContainer(path=log, parent=self))
        self.identifiers = identifiers
        self.createDomainMets()
        self.createCrawlerMets()
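The constructor above fans WARC paths out to NUM_THREADS daemon workers through a Queue. The create_warcs worker itself is not shown in these examples; a compatible sketch (hypothetical, inferred from the args=(queue, results, parent) it receives) would loop on the queue and call task_done() so that wq.join() and vq.join() can return:

def create_warcs(queue, results, parent):
    # Hypothetical worker matching Thread(target=create_warcs, args=(...)):
    while True:
        path = queue.get()  # blocks when idle; fine for daemon threads
        try:
            # WarcFile is an assumed wrapper class; hashing via
            # parent.hash_cache would happen in here.
            results.append(WarcFile(path=path, parent=parent))
        except Exception as e:
            logger.exception(e)
        finally:
            queue.task_done()  # lets Queue.join() unblock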
Example #8
def uri_to_index(self, **kwargs):
    try:
        logger.debug("Got URI to index: %s" % kwargs)
        send_uri_to_tinycdxserver(cfg.get('tinycdxserver', 'endpoint'), kwargs)

    except BaseException as e:
        logger.exception(e)
        raise self.retry(countdown=10, exc=e)
Example #9
    def __init__(self, job_id, launch_id):
        """Takes the checkpoint info and sets up data needed to build the SIP."""
        self.hdfs = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                        user=cfg.get('hdfs', 'user'))
        # Set up paths:
        self.WARC_ROOT = "%s/output/warcs" % HERITRIX_HDFS_ROOT
        self.VIRAL_ROOT = "%s/output/viral" % HERITRIX_HDFS_ROOT
        self.IMAGE_ROOT = "%s/output/images" % HERITRIX_HDFS_ROOT
        self.LOG_ROOT = "%s/output/logs" % HERITRIX_HDFS_ROOT
        self.LOCAL_LOG_ROOT = "%s/output/logs" % HERITRIX_ROOT
        self.LOCAL_JOBS_ROOT = "%s/jobs" % HERITRIX_ROOT

        #
        self.job_id = job_id
        self.launch_id = launch_id
        self.job_launch_id = "%s/%s" % (job_id, launch_id)
        self.verify_job_launch_id()
        self.crawl_log = self.get_crawl_log()
        self.start_date = CrawlJobOutput.file_start_date([self.crawl_log])
        # Find the WARCs referenced from the crawl log:
        self.parse_crawl_log()
        # TODO Get sha512 and ARK identifiers for WARCs now, and store in launch folder and thus the zip?
        # Bundle logs and configuration data into a zip and upload it to HDFS
        self.upload_logs_as_zip()
Example #10
    def __init__(self, job_id, launch_id, sip_tgz):
        self.hdfs = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                        user=cfg.get('hdfs', 'user'))
        # NB: launch_id is passed as submit_sip()'s job_id parameter.
        self.submit_sip(launch_id, sip_tgz)
Example #11
from __future__ import absolute_import

import os
import bagit
import tarfile
import hdfs
import shutil

# import the Celery app context
from crawl.celery import app
from crawl.celery import cfg

# Set up drop/watched folder configuration
DLS_DROP = cfg.get('dls', 'drop_folder')
DLS_WATCH = cfg.get('dls', 'watch_folder')

# import the Celery log getter and use it
from celery.utils.log import get_task_logger
logger = get_task_logger(__name__)


#
class SubmitSip():
    def __init__(self, job_id, launch_id, sip_tgz):
        self.hdfs = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                        user=cfg.get('hdfs', 'user'))
        # NB: launch_id is passed as submit_sip()'s job_id parameter.
        self.submit_sip(launch_id, sip_tgz)

    def submit_sip(self, job_id, sip_tgz):
        """
        Download, unpack, check and submit the specified SIP tar.gz file (from HDFS)
Example #12
def getLength(path):
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                 user=cfg.get('hdfs', 'user'))
    status = client.status(path)
    return status['length']