def get_warc_identifiers(sip):
    """Parses the SIP in HDFS and retrieves WARC/ARK tuples."""
    w = webhdfs.API(prefix=WEBHDFS)
    identifiers = []
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    if w.exists(tar):
        logger.debug("Found %s" % tar)
        t = w.open(tar)
        tar = tarfile.open(mode="r:gz", fileobj=StringIO(t))
        for i in tar.getmembers():
            if i.name.endswith(".xml"):
                xml = tar.extractfile(i).read()
                tree = etree.fromstring(xml)
                for warc in tree.xpath("//mets:file[@MIMETYPE='application/warc']",
                                       namespaces=NS):
                    try:
                        admid = warc.attrib["ADMID"]
                        amdsec = tree.xpath("//mets:amdSec[@ID='%s']" % admid,
                                            namespaces=NS)[0]
                        oiv = amdsec.xpath(
                            "mets:digiprovMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                            namespaces=NS)[0]
                        path = re.findall(
                            r"^.+(/heritrix.+\.warc\.gz)\?.+$",
                            warc.xpath("mets:FLocat", namespaces=NS)[0].attrib["%shref" % XLINK])[0]
                        identifiers.append((path, oiv.text))
                    except IndexError as e:
                        logger.error("Problem parsing METS for SIP: %s" % sip)
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
    return identifiers
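# Usage sketch (an assumption, not part of the original module): given the
# SIP_ROOT/WEBHDFS settings above, the (path, ARK) tuples returned here feed
# the CDX and Solr steps further down. The SIP name below is illustrative only.
def _example_list_warc_identifiers(sip="dummy-job/20140101000000"):
    """Hypothetical helper: logs the WARC path/ARK pairs found in a SIP."""
    for path, ark in get_warc_identifiers(sip):
        logger.info("WARC %s has ARK %s" % (path, ark))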
def update_solr(warc_arks):
    """Given a list of WARCs, submits the content to Solr."""
    warcs = [w for (w, a) in warc_arks]
    input = tempfile.NamedTemporaryFile(delete=False)
    for warc in warcs:
        input.write("%s\n" % warc)
    input.close()
    w = webhdfs.API(prefix=WEBHDFS, user="******")
    logger.info("Copying input file to HDFS: %s" % input.name)
    w.create(input.name, file=input.name)
    if not w.exists(input.name):
        logger.error("Problem copying input file: %s" % input.name)
        sys.exit(1)
    output = "%s/%s/" % (SOLR_OUTPUT, datetime.now().strftime("%Y%m%d%H%M%S"))
    command = "hadoop jar %s %s -Dmapred.compress.map.output=true -i %s -o %s -c %s -a -w -x" % (
        HADOOP_JAR, SOLR_CLASS, input.name, output, SOLR_CONF)
    logger.info("Running command: %s" % command)
    try:
        solr = subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)
    logger.info("Removing input files from HDFS.")
    w.delete(input.name)
    os.unlink(input.name)
    if w.exists(input.name) or os.path.exists(input.name):
        logger.warning("Problem deleting input file: %s" % input.name)
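# Pipeline sketch (assumed wiring, not taken from the original code): the
# tuples returned by get_warc_identifiers() are exactly the input that
# update_solr() expects, so re-indexing one SIP might look like this.
def _example_reindex_sip(sip="dummy-job/20140101000000"):
    """Hypothetical helper: submits all WARCs of one SIP to Solr."""
    warc_arks = get_warc_identifiers(sip)
    if not warc_arks:
        logger.warning("No WARCs found for %s; nothing to index." % sip)
        return
    update_solr(warc_arks)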
def copy_to_hdfs(sip_dir):
    """Creates a tarball of a SIP and copies it to HDFS."""
    gztar = shutil.make_archive(base_name=sip_dir, format="gztar",
                                root_dir=os.path.dirname(sip_dir),
                                base_dir=os.path.basename(sip_dir))
    w = webhdfs.API(prefix=settings.WEBHDFS_PREFIX, user=settings.WEBHDFS_USER)
    r = w.create(gztar, file=gztar)
    if not r.status_code == 201:
        raise Exception("Error copying to HDFS: %s" % gztar)
    return gztar
def __init__(self, jobs, jobname, dummy=False, warcs=None, viral=None, logs=None):
    """Sets up APIs."""
    self.webhdfs = webhdfs.API(prefix=WEBHDFS_PREFIX, user=WEBHDFS_USER)
    self.dummy = dummy
    self.jobname = jobname
    self.jobs = jobs
    self.warcs = warcs
    self.viral = viral
    self.logs = logs
    self.startdate = None
def create_cdx(warc_arks):
    """Given a list of (WARC, ARK) tuples, creates a CDX in HDFS."""
    input = tempfile.NamedTemporaryFile(delete=False)
    warc_ark_lookup = tempfile.NamedTemporaryFile(delete=False)
    for warc, ark in warc_arks:
        input.write("%s\n" % warc)
        warc_ark_lookup.write("%s\t%s\n" % (os.path.basename(warc), ark))
    input.close()
    warc_ark_lookup.close()
    w = webhdfs.API(prefix=WEBHDFS, user="******")
    logger.info("Copying input file to HDFS: %s" % input.name)
    w.create(input.name, file=input.name)
    if not w.exists(input.name):
        logger.error("Problem copying input file: %s" % input.name)
        sys.exit(1)
    logger.info("Copying lookup file to HDFS: %s" % warc_ark_lookup.name)
    w.create(warc_ark_lookup.name, warc_ark_lookup.name)
    if not w.exists(warc_ark_lookup.name):
        logger.error("Problem copying lookup file: %s" % warc_ark_lookup.name)
        sys.exit(1)
    output = "%s/index-queue-%s/%s/" % (CDX_OUTPUT,
        datetime.now().strftime("%Y%m%d%H%M%S"),
        datetime.now().strftime("%Y%m%d%H%M%S"))
    command = "hadoop jar %s %s -i %s -o %s -s /tmp/split.txt -r 260 -h -a %s -w" % (
        HADOOP_JAR, CDX_CLASS, input.name, output, warc_ark_lookup.name)
    logger.info("Running command: %s" % command)
    try:
        hadoop = subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)
    hdfssize = 0
    for part in w.list(output)["FileStatuses"]["FileStatus"]:
        hdfssize += part["length"]
    if hdfssize == 0:
        logger.warning("Problem creating CDX!")
        sys.exit(1)
    logger.info("Removing input files from HDFS.")
    w.delete(input.name)
    w.delete(warc_ark_lookup.name)
    if w.exists(input.name) or w.exists(warc_ark_lookup.name):
        logger.warning("Problem deleting input files: %s" % [input.name, warc_ark_lookup.name])
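# Input-shape sketch (an assumption): create_cdx() only needs an iterable of
# (WARC path, ARK) pairs; the path and ARK below are made up for illustration.
def _example_cdx_from_tuples():
    """Hypothetical helper: builds a CDX from hand-rolled (WARC, ARK) pairs."""
    warc_arks = [
        ("/heritrix/output/example/dummy.warc.gz", "ark:/81055/dummy00000001"),
    ]
    create_cdx(warc_arks)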
def sort_cdx(delete_input=False):
    """Sorts a single directory in HDFS."""
    logger.info("Sorting CDX files...")
    w = webhdfs.API(prefix=WEBHDFS, user="******")
    if w.exists(CDX_SORTED):
        logger.info("Removing old, sorted CDX.")
        w.delete(CDX_SORTED, recursive=True)
    command = "hadoop jar %s %s -Dmapred.compress.map.output=true --total-order 0.1 100000 100 %s %s" % (
        HADOOP_SORT, HADOOP_SORT_CLASS, "%s/*" % CDX_COMBINED, CDX_SORTED)
    try:
        solr = subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)
    if delete_input:
        logger.info("Deleting sort-input.")
        w.delete(CDX_COMBINED, recursive=True)
def create_sip(job):
    """Creates a SIP and returns the path to the folder containing the METS."""
    sip_dir = "%s/%s" % (settings.SIP_ROOT, job)
    w = webhdfs.API(prefix=settings.WEBHDFS_PREFIX, user=settings.WEBHDFS_USER)
    if os.path.exists(sip_dir):
        raise Exception("Directory already exists: %s." % sip_dir)
    if w.exists("%s.tar.gz" % sip_dir):
        raise Exception("SIP already exists in HDFS: %s.tar.gz" % sip_dir)
    s = sip.SipCreator(jobs=[job], jobname=job, dummy=settings.DUMMY)
    if s.verifySetup():
        s.processJobs()
        s.createMets()
        filename = os.path.basename(job)
        os.makedirs(sip_dir)
        with open("%s/%s.xml" % (sip_dir, filename), "wb") as o:
            s.writeMets(o)
        s.bagit(sip_dir)
    else:
        raise Exception("Could not verify SIP for %s" % job)
    return sip_dir
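# End-to-end sketch (assumed flow, inferred from the functions above rather
# than taken from the original code): build the SIP locally with create_sip(),
# then push the gzipped tarball into HDFS with copy_to_hdfs(). The job name is
# illustrative only.
def _example_package_job(job="dummy-job/20140101000000"):
    """Hypothetical helper: creates a SIP for a job and copies it to HDFS."""
    sip_dir = create_sip(job)
    return copy_to_hdfs(sip_dir)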
def get_identifiers(sip):
    """Parses the SIP in HDFS and retrieves ARKs."""
    w = webhdfs.API(prefix=WEBHDFS)
    arks = []
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    if w.exists(tar):
        logger.debug("Found %s" % tar)
        t = w.open(tar)
        tar = tarfile.open(mode="r:gz", fileobj=StringIO(t))
        for i in tar.getmembers():
            if i.name.endswith(".xml"):
                xml = tar.extractfile(i).read()
                tree = etree.fromstring(xml)
                for warc in tree.xpath(
                        "//premis:object[premis:objectCharacteristics/premis:format/premis:formatDesignation/premis:formatName='application/warc']",
                        namespaces=NS):
                    for id in warc.xpath(
                            "premis:objectIdentifier/premis:objectIdentifierValue",
                            namespaces=NS):
                        arks.append(id.text.replace("ark:/81055/", ""))
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
    return arks
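# Usage sketch (an assumption): get_identifiers() strips the "ark:/81055/"
# prefix, so the return value is a list of bare ARK names for the WARCs
# recorded in the SIP's METS/PREMIS metadata.
def _example_log_arks(sip="dummy-job/20140101000000"):
    """Hypothetical helper: logs every ARK recorded in a SIP."""
    for ark in get_identifiers(sip):
        logger.info("Found ARK: %s" % ark)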
def combine_cdx():
    """Merges multiple CDX outputs to a single directory."""
    logger.info("Merging CDX files...")
    w = webhdfs.API(prefix=WEBHDFS, user="******")
    if w.exists(CDX_COMBINED):
        logger.info("Removing old, merged CDX.")
        w.delete(CDX_COMBINED, recursive=True)
    if w.exists(CDX_SORTED):
        logger.info("Removing old, sorted CDX.")
        w.delete(CDX_SORTED, recursive=True)
    command = "hadoop jar %s \
        -Dmapred.reduce.tasks=80 \
        -Dmapred.textoutputformat.separator=# \
        -Dmapred.job.name=combine \
        -Dmapred.compress.map.output=true \
        -mapper cat \
        -reducer cat \
        -input %s \
        -output %s" % (HADOOP_STREAM, "/wayback/cdx-index/*/*/part-*", CDX_COMBINED)
    try:
        solr = subprocess.check_output(command.split())
    except subprocess.CalledProcessError as s:
        logger.error("CalledProcessError: %s" % str(s))
        sys.exit(1)
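# Ordering sketch (inferred from the two functions, not stated in the code):
# combine_cdx() writes the merged CDX to CDX_COMBINED, which sort_cdx() then
# reads, so the merge must run before the sort; deleting the merge output
# afterwards is optional.
def _example_rebuild_cdx_index():
    """Hypothetical helper: merges and then sorts the CDX parts in HDFS."""
    combine_cdx()
    sort_cdx(delete_input=True)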
import re
import act
import sys
import json
import logging
import webhdfs
import subprocess
from time import sleep
from urlparse import urlparse
from threading import Thread

logger = logging.getLogger( "ld-to-ukwa-opera" )
s = logging.StreamHandler( sys.stdout )
s.setLevel( logging.DEBUG )
s.setFormatter( logging.Formatter( "[%(asctime)s] %(levelname)s: %(message)s" ) )
logger.addHandler( s )

w = webhdfs.API( prefix="http://dls.httpfs.wa.bl.uk:14000/webhdfs/v1" )
a = act.ACT()

def threaded( node, logger ):
    body = node[ "body" ][ "value" ]
    id = node[ "field_target" ][ "id" ]
    data = a.request_node( str( id ) )
    wct_id = str( data[ "field_wct_id" ] )
    timestamp = node[ "field_timestamp" ]
    logger.info( "Migrating %s" % timestamp )
    domains = []
    for url in data[ "field_url" ]:
        domains.append( urlparse( url[ "url" ] ).netloc )
    jobname = re.findall( "Job ID: ([^<]+)", body )[ 0 ]
    logger.debug( "{\n\t\"id\": %s,\n\t\"wct_id\": %s,\n\t\"timestamp\": %s\n\t\"domains\": %s\n\t\"jobname\": %s\n}" % ( id, wct_id, timestamp, domains, jobname ) )
    cdx = "/dev/shm/%s-%s.cdx" % ( wct_id, timestamp )
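# Threading sketch (an assumption based on the Thread import above; the code
# that actually launches threaded() is not shown here). 'nodes' is assumed to
# come from an ACT query; the structure below is illustrative only.
def _example_run_threads( nodes ):
    """Hypothetical helper: migrates each ACT node on its own thread."""
    threads = []
    for node in nodes:
        t = Thread( target=threaded, args=( node, logger ) )
        t.start()
        threads.append( t )
    for t in threads:
        t.join()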
"""Checks whether the CDX in HDFS is larger than the copy on local
disk. If so, replaces the local version. """

import os
import sys
import logging
import webhdfs
from hdfssync import settings

LOGGING_FORMAT="[%(asctime)s] %(levelname)s: %(message)s"
logging.basicConfig( format=LOGGING_FORMAT, level=logging.INFO )
logger = logging.getLogger( "hdfscdxsync" )
logging.root.setLevel( logging.INFO )

if __name__ == "__main__":
    w = webhdfs.API( prefix="http://%s:14000/webhdfs/v1" % settings.hdfshost, user=settings.hdfscdxuser )
    if not w.exists( settings.hdfscdxroot ):
        logger.error( "No HDFS CDX found: %s" % settings.hdfscdxroot )
        sys.exit( 1 )
    if not os.path.exists( settings.localcdx ):
        logger.error( "No local CDX found: %s" % settings.localcdx )
        sys.exit( 1 )
    hdfssize = 0
    for part in w.list( settings.hdfscdxroot )[ "FileStatuses" ][ "FileStatus" ]:
        hdfssize += part[ "length" ]
    localcdxsize = os.stat( settings.localcdx ).st_size
    if hdfssize > localcdxsize:
        logger.info( "Replacing local CDX (%s) with HDFS CDX (%s)." % ( settings.localcdx, settings.hdfscdxroot ) )
        with open( settings.localcdx, "wb" ) as o: