class SipCreator: def __init__( self, jobs, jobname=datetime.now().strftime( "%Y%m%d%H%M%S" ), warcs=None, viral=None, logs=None, start_date=None, dummy_run=False, hash_cache_file=None, client=None ): """Sets up fields.""" if client is None: self.client = hdfs.InsecureClient(cfg.get('hdfs','url'), user=cfg.get('hdfs','user')) else: self.client = client self.dummy = dummy_run if self.dummy: logger.info("This is a dummy-run - no real ARKs will be minted.") self.overwrite = False self.jobs = jobs self.jobname = jobname self.warcs = warcs self.viral = viral self.logs = logs self.hash_cache = {} self.parse_hash_cache(hash_cache_file) self.startdate = start_date self.BAGIT_CONTACT_NAME="Andrew N. Jackson" self.BAGIT_CONTACT_EMAIL="*****@*****.**" self.BAGIT_DESCRIPTION="LD Crawl: " self.ARK_URL="http://pii.ad.bl.uk/pii/vdc?arks=" self.ARK_PREFIX="ark:/81055/vdc_100022535899.0x" # And create: logger.info("Processing job files...") self.processJobs() logger.info("Generating METS...") self.createMets() def parse_hash_cache(self, hash_cache_file=None): if hash_cache_file is None: self.hash_cache = None return # Parse and populate the files generated by the HdfsFileHasher # /hdfs/path/to/directory<tab>SHA-512 Length /hdfs/full/path/including/filename.warc.gz self.viral = [] self.warcs = [] self.logs = [] with open(hash_cache_file,'r') as f: for line in f: (dir, details) = line.split('\t') (hash, length, path) = details.split(' ') path = path.strip() length = long(length) logger.debug("GOT %s %s %s" % (path, hash, length)) self.hash_cache[path] = "%s %i" % ( hash, length ) if "/output/viral/" in path: self.viral.append(path) elif "/output/warcs/" in path: self.warcs.append(path) else: self.logs.append(path) def processJobs( self ): """All WARCs and logs associated with jobs, optionally loaded from files.""" if not isinstance(self.warcs,list): with open(self.warcs, "r") as i: self.warcs = [w.strip() for w in i] if not isinstance(self.viral,list): with open(self.viral, "r") as i: self.viral = [v.strip() for v in i] if not isinstance(self.logs,list): with open(self.logs, "r") as i: self.logs = [l.strip() for l in i] if self.startdate is None and self.dummy: self.startdate = datetime.now().isoformat() if( self.dummy ): logger.info( "Getting dummy ARK identifiers..." ) self.getDummyIdentifiers( len( self.warcs ) + len( self.viral ) + len( self.logs ) ) else: logger.info( "Getting ARK identifiers..." ) self.getIdentifiers( len( self.warcs ) + len( self.viral ) + len( self.logs ) ) def createMets( self ): """Creates the Mets object.""" logger.info( "Building METS..." ) self.mets = Mets( self.startdate, self.warcs, self.viral, self.logs, self.identifiers, hash_cache=self.hash_cache, client=self.client ) def writeMets( self, output=sys.stdout ): """Writes the METS XML to a file handle.""" output.write( self.mets.getXml() ) def bagit( self, directory, metadata=None ): """Creates a Bagit, if needs be with default metadata.""" if metadata is None: metadata = { "Contact-Name": self.BAGIT_CONTACT_NAME, "Contact-Email": self.BAGIT_CONTACT_EMAIL, "Timestamp": datetime.now().strftime( "%Y-%m-%dT%H:%M:%SZ" ), "Description": self.BAGIT_DESCRIPTION + ";".join( self.jobs ) } bagit.make_bag( directory, metadata ) def getDummyIdentifiers( self, num ): """Provides a series of 'dummy' identifiers for testing purposes.""" self.identifiers = [] for i in range( num ): self.identifiers.append( "%s%s" % ( self.ARK_PREFIX, "{0:06d}".format( i ) ) ) def getIdentifiers( self, num ): """Retrieves 'num' ARK identifiers from a webservice; alternatively calls an alternate method to provide 'dummy' identifiers.""" self.identifiers = [] if( self.dummy ): return self.getDummyIdentifiers( num ) try: url = "%s%s" % ( self.ARK_URL, str( num ) ) logger.debug( "Requesting %s ARKS: %s" % ( num, url ) ) response = requests.post( url ) data = response.content except Exception as e: logger.error( "Could not obtain ARKs: %s" % str( e ) ) raise Exception( "Could not obtain ARKs: %s" % str( e ) ) xml = parseString( data ) for ark in xml.getElementsByTagName( "ark" ): self.identifiers.append( ark.firstChild.wholeText ) if( len( self.identifiers ) != num ): raise Exception( "Problem parsing ARKs; %s, %s, %s" % ( self.jobs, self.identifiers, data ) ) def verifySetup(self): return True def create_sip(self, sip_dir): """Creates a SIP and returns the path to the folder containing the METS.""" if self.verifySetup(): if not os.path.exists(sip_dir): os.makedirs(sip_dir) else: raise Exception("SIP directory already exists: %s" % sip_dir) with open("%s/%s.xml" % (sip_dir, self.jobname.replace('/','_')), "wb") as o: self.writeMets(o) self.bagit(sip_dir) else: raise Exception("Could not verify SIP for %s" % sip_dir) return sip_dir def copy_sip_to_hdfs(self, sip_dir, hdfs_sip_path): """ Creates a tarball of a SIP and copies to HDFS. """ # Check if it's already there: hdfs_sip_tgz = "%s.tar.gz" % hdfs_sip_path status = self.client.status(hdfs_sip_tgz, strict=False) if status and not self.overwrite: raise Exception("SIP already exists in HDFS: %s" % hdfs_sip_tgz) # Build the TGZ gztar = shutil.make_archive(base_name=sip_dir, format="gztar", root_dir=os.path.dirname(sip_dir), base_dir=os.path.basename(sip_dir)) if self.dummy: logger.warning("Not copying to HDFS as this is a dummy-run! Leaving %s in place." % gztar) else: logger.info("Copying %s to HDFS..." % gztar) with open(gztar,'r') as f: self.client.write(data=f, hdfs_path=hdfs_sip_tgz, overwrite=False) os.remove(gztar) logger.info("Done.") return hdfs_sip_tgz
def createMets( self ): """Creates the Mets object.""" logger.info( "Building METS..." ) self.mets = Mets( self.startdate, self.warcs, self.viral, self.logs, self.identifiers, hash_cache=self.hash_cache, client=self.client )