Example #1
def get_warc_identifiers(sip):
    """Parses the SIP in HDFS and retrieves WARC/ARK tuples."""
    w = webhdfs.API(prefix=WEBHDFS)
    identifiers = []
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    if w.exists(tar):
        logger.debug("Found %s" % tar)
        t = w.open(tar)
        sip_tar = tarfile.open(mode="r:gz", fileobj=StringIO(t))
        for i in sip_tar.getmembers():
            if i.name.endswith(".xml"):
                xml = sip_tar.extractfile(i).read()
                tree = etree.fromstring(xml)
                for warc in tree.xpath(
                        "//mets:file[@MIMETYPE='application/warc']",
                        namespaces=NS):
                    try:
                        admid = warc.attrib["ADMID"]
                        amdsec = tree.xpath("//mets:amdSec[@ID='%s']" % admid,
                                            namespaces=NS)[0]
                        oiv = amdsec.xpath(
                            "mets:digiprovMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                            namespaces=NS)[0]
                        path = re.findall(
                            r"^.+(/heritrix.+\.warc\.gz)\?.+$",
                            warc.xpath("mets:FLocat",
                                       namespaces=NS)[0].attrib["%shref" %
                                                                XLINK])[0]
                        identifiers.append((path, oiv.text))
                    except IndexError as e:
                        logger.error("Problem parsing METS for SIP %s: %s" %
                                     (sip, e))
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
    return identifiers
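The (WARC path, ARK) tuples returned here feed the indexing helpers in Examples #2 and #5. A minimal driver sketch, assuming all three functions live in one module with the usual constants (WEBHDFS, SIP_ROOT, NS, XLINK) and logger configured; index_sip is a hypothetical name, not part of the original code:

def index_sip(sip):
    """Hypothetical driver: parse the SIP, then build the CDX and Solr index."""
    warc_arks = get_warc_identifiers(sip)  # [(warc_path, ark), ...]
    if not warc_arks:
        logger.warning("No WARC/ARK pairs found for SIP: %s" % sip)
        return
    create_cdx(warc_arks)   # Example #5: writes a CDX for these WARCs to HDFS
    update_solr(warc_arks)  # Example #2: submits the WARC content to Solr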
Example #2
def update_solr(warc_arks):
    """Given a list of WARCs, submits the content to Solr."""
    warcs = [w for (w, a) in warc_arks]
    input = tempfile.NamedTemporaryFile(delete=False)
    for warc in warcs:
        input.write("%s\n" % warc)
    input.close()

    w = webhdfs.API(prefix=WEBHDFS, user="******")
    logger.info("Copying input file to HDFS: %s" % input.name)
    w.create(input.name, file=input.name)
    if not w.exists(input.name):
        logger.error("Problem copying input file: %s" % input.name)
        sys.exit(1)
    output = "%s/%s/" % (SOLR_OUTPUT, datetime.now().strftime("%Y%m%d%H%M%S"))
    command = "hadoop jar %s %s -Dmapred.compress.map.output=true -i %s -o %s -c %s -a -w -x" % (
        HADOOP_JAR, SOLR_CLASS, input.name, output, SOLR_CONF)
    logger.info("Running command: %s" % command)
    try:
        solr = subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)

    logger.info("Removing input files from HDFS.")
    w.delete(input.name)
    os.unlink(input.name)
    if w.exists(input.name) or os.path.exists(input.name):
        logger.warning("Problem deleting input file: %s" % input.name)
Example #3
def copy_to_hdfs(sip_dir):
    """Creates a tarball of a SIP and copies to HDFS."""
    gztar = shutil.make_archive(base_name=sip_dir,
                                format="gztar",
                                root_dir=os.path.dirname(sip_dir),
                                base_dir=os.path.basename(sip_dir))
    w = webhdfs.API(prefix=settings.WEBHDFS_PREFIX, user=settings.WEBHDFS_USER)
    r = w.create(gztar, file=gztar)
    if r.status_code != 201:
        raise Exception("Error copying to HDFS: %s" % gztar)
    return gztar
Example #4
def __init__(self,
             jobs,
             jobname,
             dummy=False,
             warcs=None,
             viral=None,
             logs=None):
    """Sets up APIs."""
    self.webhdfs = webhdfs.API(prefix=WEBHDFS_PREFIX, user=WEBHDFS_USER)
    self.dummy = dummy
    self.jobname = jobname
    self.jobs = jobs
    self.warcs = warcs
    self.viral = viral
    self.logs = logs
    self.startdate = None
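Example #7 shows this constructor being driven end to end; a minimal instantiation sketch, assuming the class is exposed as sip.SipCreator and using a made-up job identifier:

# Mirrors the call in Example #7; warcs/viral/logs keep their None defaults
# unless the file lists are already known up front.
s = sip.SipCreator(jobs=["weekly-20140101"], jobname="weekly-20140101",
                   dummy=True)
if s.verifySetup():
    s.processJobs()
    s.createMets()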
Example #5
def create_cdx(warc_arks):
    """Given a list of (WARC, ARK) tuples, creates a CDX in HDFS."""
    input = tempfile.NamedTemporaryFile(delete=False)
    warc_ark_lookup = tempfile.NamedTemporaryFile(delete=False)
    for warc, ark in warc_arks:
        input.write("%s\n" % warc)
        warc_ark_lookup.write("%s\t%s\n" % (os.path.basename(warc), ark))
    input.close()
    warc_ark_lookup.close()

    w = webhdfs.API(prefix=WEBHDFS, user="******")
    logger.info("Copying input file to HDFS: %s" % input.name)
    w.create(input.name, file=input.name)
    if not w.exists(input.name):
        logger.error("Problem copying input file: %s" % input.name)
        sys.exit(1)
    logger.info("Copying lookup file to HDFS: %s" % warc_ark_lookup.name)
    w.create(warc_ark_lookup.name, file=warc_ark_lookup.name)
    if not w.exists(warc_ark_lookup.name):
        logger.error("Problem copying lookup file: %s" % warc_ark_lookup.name)
        sys.exit(1)

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    output = "%s/index-queue-%s/%s/" % (CDX_OUTPUT, timestamp, timestamp)
    command = "hadoop jar %s %s -i %s -o %s -s /tmp/split.txt -r 260 -h -a %s -w" % (
        HADOOP_JAR, CDX_CLASS, input.name, output, warc_ark_lookup.name)
    logger.info("Running command: %s" % command)
    try:
        hadoop = subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)

    hdfssize = 0
    for part in w.list(output)["FileStatuses"]["FileStatus"]:
        hdfssize += part["length"]
    if hdfssize == 0:
        logger.warning("Problem creating CDX!")
        sys.exit(1)

    logger.info("Removing input files from HDFS.")
    w.delete(input.name)
    w.delete(warc_ark_lookup.name)
    if w.exists(input.name) or w.exists(warc_ark_lookup.name):
        logger.warning("Problem deleting input files: %s" %
                       [input.name, warc_ark_lookup.name])
Example #6
def sort_cdx(delete_input=False):
    """Sorts a single directory in HDFS."""
    logger.info("Sorting CDX files...")
    w = webhdfs.API(prefix=WEBHDFS, user="******")
    if w.exists(CDX_SORTED):
        logger.info("Removing old, sorted CDX.")
        w.delete(CDX_SORTED, recursive=True)
    command = "hadoop jar %s %s -Dmapred.compress.map.output=true --total-order 0.1 100000 100 %s %s" % (
        HADOOP_SORT, HADOOP_SORT_CLASS, "%s/*" % CDX_COMBINED, CDX_SORTED)
    try:
        subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)
    if delete_input:
        logger.info("Deleting sort-input.")
        w.delete(CDX_COMBINED, recursive=True)
Example #7
def create_sip(job):
    """Creates a SIP and returns the path to the folder containing the METS."""
    sip_dir = "%s/%s" % (settings.SIP_ROOT, job)
    w = webhdfs.API(prefix=settings.WEBHDFS_PREFIX, user=settings.WEBHDFS_USER)
    if os.path.exists(sip_dir):
        raise Exception("Directory already exists: %s." % sip_dir)
    if w.exists("%s.tar.gz" % sip_dir):
        raise Exception("SIP already exists in HDFS: %s.tar.gz" % sip_dir)

    s = sip.SipCreator(jobs=[job], jobname=job, dummy=settings.DUMMY)
    if s.verifySetup():
        s.processJobs()
        s.createMets()
        filename = os.path.basename(job)
        os.makedirs(sip_dir)
        with open("%s/%s.xml" % (sip_dir, filename), "wb") as o:
            s.writeMets(o)
        s.bagit(sip_dir)
    else:
        raise Exception("Could not verify SIP for %s" % job)
    return sip_dir
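Read together with Examples #3 and #8, this suggests the overall SIP lifecycle. A hedged sketch of the chain, where package_and_verify is a hypothetical name and the SIP_ROOT/WEBHDFS settings are assumed to agree across modules:

def package_and_verify(job):
    """Hypothetical chain: build the SIP, push it to HDFS, read back its ARKs."""
    sip_dir = create_sip(job)      # Example #7: METS + bagit on local disk
    copy_to_hdfs(sip_dir)          # Example #3: gzipped tarball copied to HDFS
    arks = get_identifiers(job)    # Example #8: ARKs parsed from the stored SIP
    if not arks:
        raise Exception("No ARKs found for SIP: %s" % job)
    return arks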
Example #8
def get_identifiers(sip):
    """Parses the SIP in HDFS and retrieves ARKs."""
    w = webhdfs.API(prefix=WEBHDFS)
    arks = []
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    if w.exists(tar):
        logger.debug("Found %s" % tar)
        t = w.open(tar)
        sip_tar = tarfile.open(mode="r:gz", fileobj=StringIO(t))
        for i in sip_tar.getmembers():
            if i.name.endswith(".xml"):
                xml = sip_tar.extractfile(i).read()
                tree = etree.fromstring(xml)
                for warc in tree.xpath(
                        "//premis:object[premis:objectCharacteristics/premis:format/premis:formatDesignation/premis:formatName='application/warc']",
                        namespaces=NS):
                    for identifier in warc.xpath(
                            "premis:objectIdentifier/premis:objectIdentifierValue",
                            namespaces=NS):
                        arks.append(identifier.text.replace("ark:/81055/", ""))
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
    return arks
Example #9
def combine_cdx():
    """Merges multiple CDX outputs to a single directory."""
    logger.info("Merging CDX files...")
    w = webhdfs.API(prefix=WEBHDFS, user="******")
    if w.exists(CDX_COMBINED):
        logger.info("Removing old, merged CDX.")
        w.delete(CDX_COMBINED, recursive=True)
    if w.exists(CDX_SORTED):
        logger.info("Removing old, sorted CDX.")
        w.delete(CDX_SORTED, recursive=True)
    command = ("hadoop jar %s "
               "-Dmapred.reduce.tasks=80 "
               "-Dmapred.textoutputformat.separator=# "
               "-Dmapred.job.name=combine "
               "-Dmapred.compress.map.output=true "
               "-mapper cat "
               "-reducer cat "
               "-input %s "
               "-output %s") % (HADOOP_STREAM,
                                "/wayback/cdx-index/*/*/part-*", CDX_COMBINED)
    try:
        subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)
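Examples #5, #9 and #6 appear to form a pipeline; a hedged ordering sketch, assuming CDX_OUTPUT points into the /wayback/cdx-index tree that combine_cdx globs over and that warc_arks is the tuple list from Example #1:

# Hypothetical end-to-end CDX refresh for one batch of (WARC, ARK) tuples.
create_cdx(warc_arks)        # Example #5: per-batch CDX written under CDX_OUTPUT
combine_cdx()                # Example #9: merge all batches into CDX_COMBINED
sort_cdx(delete_input=True)  # Example #6: total-order sort into CDX_SORTED,
                             # then drop the merged input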
Example #10
import re
import sys
import act
import json
import logging
import webhdfs
import subprocess
from time import sleep
from urlparse import urlparse
from threading import Thread

logger = logging.getLogger( "ld-to-ukwa-opera" )
s = logging.StreamHandler( sys.stdout )
s.setLevel( logging.DEBUG )
s.setFormatter( logging.Formatter( "[%(asctime)s] %(levelname)s: %(message)s" ) )
logger.setLevel( logging.DEBUG )
logger.addHandler( s )

w = webhdfs.API( prefix="http://dls.httpfs.wa.bl.uk:14000/webhdfs/v1" )
a = act.ACT()

def threaded( node, logger ):
	body = node[ "body" ][ "value" ]
	id = node[ "field_target" ][ "id" ]
	data = a.request_node( str( id ) )
	wct_id = str( data[ "field_wct_id" ] )
	timestamp = node[ "field_timestamp" ]
	logger.info( "Migrating %s" % timestamp )
	domains = []
	for url in data[ "field_url" ]:
		domains.append( urlparse( url[ "url" ] ).netloc )
	jobname = re.findall( "Job ID: ([^<]+)", body )[ 0 ]
	logger.debug( "{\n\t\"id\": %s,\n\t\"wct_id\": %s,\n\t\"timestamp\": %s\n\t\"domains\": %s\n\t\"jobname\": %s\n}" % ( id, wct_id, timestamp, domains, jobname ) )
	cdx = "/dev/shm/%s-%s.cdx" % ( wct_id, timestamp )
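The snippet imports Thread but the driver loop is not shown; a minimal sketch of how threaded might be launched, assuming nodes is an iterable of ACT node dicts obtained elsewhere:

threads = []
for node in nodes:
	t = Thread( target=threaded, args=( node, logger ) )
	t.start()
	threads.append( t )
for t in threads:
	t.join()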
Example #11
"""
Checks whether the CDX in HDFS is larger than the version on local
disk. If so, replaces the local version.
"""

import os
import sys
import logging
import webhdfs
from hdfssync import settings

LOGGING_FORMAT="[%(asctime)s] %(levelname)s: %(message)s"
logging.basicConfig( format=LOGGING_FORMAT, level=logging.INFO )
logger = logging.getLogger( "hdfscdxsync" )
logging.root.setLevel( logging.INFO )

if __name__ == "__main__":
	w = webhdfs.API( prefix="http://%s:14000/webhdfs/v1" % settings.hdfshost, user=settings.hdfscdxuser )
	if not w.exists( settings.hdfscdxroot ):
		logger.error( "No HDFS CDX found: %s" % settings.hdfscdxroot )
		sys.exit( 1 )
	if not os.path.exists( settings.localcdx ):
		logger.error( "No local CDX found: %s" % settings.localcdx )
		sys.exit( 1 )

	hdfssize = 0
	for part in w.list( settings.hdfscdxroot )[ "FileStatuses" ][ "FileStatus" ]:
		hdfssize += part[ "length" ]

	localcdxsize = os.stat( settings.localcdx ).st_size
	if hdfssize > localcdxsize:
		logger.info( "Replacing local CDX (%s) with HDFS CDX (%s)." % ( settings.localcdx, settings.hdfscdxroot ) )
		with open( settings.localcdx, "wb" ) as o: