def officeConverter(item):
    """Create a PDF preservation copy of an office document.

    A headless LibreOffice (``soffice``) run renders the file into the
    scratch directory ``/tmp/officeConv``; the resulting PDF is copied next
    to the source as ``<path>.presform.pdf`` and the scratch directory is
    removed afterwards.

    Returns whatever ``get_data()`` of the conversion command yields, or
    ``(None, None)`` when the preservation copy already exists.
    """
    source_path = item.get_file_path()
    presform_path = source_path + '.presform.pdf'

    # Guard clause: an existing preservation copy is never overwritten.
    if exists(presform_path):
        logger.info("Office (PDF) preservation format for file exists. " +
                    "Not Clobbering.")
        return (None, None)

    scratch_dir = '/tmp/officeConv'
    # Only the extension-less stem is needed to locate the converted file.
    stem = splitext(source_path)[0]

    make_scratch = BashCommand(['mkdir', '-p', scratch_dir])
    make_scratch.set_timeout(timeout)
    make_scratch.run_command()

    convert = BashCommand([
        '/Applications/LibreOffice.app/Contents/MacOS/soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', scratch_dir,
        source_path,
    ])
    convert.set_timeout(timeout)
    convert.run_command()

    # The converted file is expected at <scratch>/<base name of stem>.pdf.
    converted_path = scratch_dir + '/' + basename(stem) + '.pdf'
    # The copy and the clean-up run without a timeout, as in the original.
    BashCommand(['cp', converted_path, presform_path]).run_command()
    BashCommand(['rm', '-r', scratch_dir]).run_command()

    outcome = convert.get_data()
    logger.debug(outcome)
    return convert.get_data()
def imageConverter(item):
    """Create a TIFF preservation copy of an image with ffmpeg.

    The output is written next to the source as ``<path>.presform.tif``.

    Returns the ``get_data()`` result of the ffmpeg command, or
    ``(None, None)`` when the preservation copy already exists.
    """
    source_path = item.get_file_path()
    presform_path = source_path + '.presform.tif'

    if exists(presform_path):
        # NOTE(review): "preservaiton" is a typo in the emitted log message;
        # it is kept verbatim here so the log output does not change.
        logger.info("Image (tif) preservaiton format for file exists. " +
                    "Not Clobbering.")
        return (None, None)

    # ffmpeg's -n flag makes it refuse to overwrite an existing output file.
    convert = BashCommand(['ffmpeg', '-n', '-i', source_path, presform_path])
    convert.set_timeout(timeout)
    convert.run_command()

    logger.debug(convert.get_data())
    return convert.get_data()
def audioConverter(item):
    """Create a WAV preservation copy of an audio file with ffmpeg.

    The output is written next to the source as ``<path>.presform.wav``.

    Returns the ``get_data()`` result of the ffmpeg command, or
    ``(None, None)`` when the preservation copy already exists.
    """
    source_path = item.get_file_path()
    presform_path = source_path + '.presform.wav'

    if exists(presform_path):
        logger.info("Audio (wav) preservation format for file exists. " +
                    "Not Clobbering.")
        return (None, None)

    # ffmpeg's -n flag makes it refuse to overwrite an existing output file.
    convert = BashCommand(['ffmpeg', '-n', '-i', source_path, presform_path])
    convert.set_timeout(timeout)
    convert.run_command()

    logger.debug(convert.get_data())
    return convert.get_data()
def htmlConverter(item):
    """Render an HTML file to an intermediary PDF with wkhtmltopdf.

    The PDF is written to ``<path>.intermediary.pdf`` and a new ``Item`` for
    it is appended to ``itemStack`` (presumably so a later pass converts it
    further -- confirm against the code that drains the stack).

    Returns the ``get_data()`` result of the wkhtmltopdf command, or
    ``(None, None)`` when ``<path>.presform.pdf`` already exists.
    """
    source_path = item.get_file_path()

    # Nothing to do once a PDF preservation copy is already in place.
    if exists(source_path + '.presform.pdf'):
        return (None, None)

    intermediary_path = source_path + '.intermediary.pdf'

    render = BashCommand(['wkhtmltopdf', source_path, intermediary_path])
    render.set_timeout(timeout)
    render.run_command()

    itemStack.append(Item(intermediary_path, root))
    return render.get_data()
def videoConverter(item):
    """Create an AVI preservation copy of a video file with ffmpeg.

    The output is written next to the source as ``<path>.presform.avi``
    using the rawvideo / pcm_u24le codecs and the uyvy422 pixel format
    passed on the command line below.

    Returns the ``get_data()`` result of the ffmpeg command, or
    ``(None, None)`` when the preservation copy already exists.
    """
    source_path = item.get_file_path()
    presform_path = source_path + ".presform.avi"

    if exists(presform_path):
        logger.info("Video (avi) preservation format for file exists. " +
                    "Not Clobbering.")
        return (None, None)

    # ffmpeg's -n flag makes it refuse to overwrite an existing output file.
    convert = BashCommand([
        'ffmpeg', '-n',
        '-i', source_path,
        '-vcodec', 'rawvideo',
        '-acodec', 'pcm_u24le',
        '-pix_fmt', 'uyvy422',
        '-vtag', '2vuy',
        presform_path,
    ])
    convert.set_timeout(timeout)
    convert.run_command()

    logger.debug(convert.get_data())
    return convert.get_data()
def zipConverter(item):
    """Extract an archive with 7z and queue its contents for processing.

    The archive is unpacked into ``<path>.presform.extracted``. When that
    directory exists afterwards, every item a ``Batch`` finds inside it is
    appended to ``itemStack``.

    Returns the ``get_data()`` result of the 7z command, or ``(None, None)``
    when the extraction directory already exists.
    """
    archive_path = item.get_file_path()
    extract_dir = archive_path + '.presform.extracted'

    if exists(extract_dir):
        logger.info("Already extracted.")
        return (None, None)

    # 7z expects the output directory glued directly onto the -o switch.
    unzip = BashCommand(['7z', 'x', '-o' + extract_dir, archive_path])
    unzip.set_timeout(timeout)
    unzip.run_command()

    # Only walk the directory if the extraction actually produced it.
    if exists(extract_dir):
        extracted = Batch(root, extract_dir)
        for extracted_item in extracted.find_items(from_directory=True):
            itemStack.append(extracted_item)

    return unzip.get_data()
def gifConverter(item):
    """Split a GIF into numbered TIFF frames with ffmpeg.

    A directory ``<path>.presform`` is created and ffmpeg writes the frames
    into it as ``output%04d.presform.tif``.

    Returns the ``get_data()`` result of the ffmpeg command, or
    ``(None, None)`` when the ``<path>.presform`` directory already exists.
    """
    source_path = item.get_file_path()
    frames_dir = source_path + ".presform"

    if exists(frames_dir):
        logger.info("Image (tif) preservation format for file exists. " +
                    "Not Clobbering.")
        return (None, None)

    # The directory creation runs without a timeout, as in the original.
    BashCommand(['mkdir', frames_dir]).run_command()

    frame_pattern = frames_dir + '/output%04d.presform.tif'
    convert = BashCommand(['ffmpeg', '-n', '-i', source_path, frame_pattern])
    convert.set_timeout(timeout)
    convert.run_command()

    logger.debug(convert.get_data())
    return convert.get_data()
def main():
    """Generate technical metadata for every file of a batch.

    Parses the command line, configures the module-level ``logger`` and then
    walks the items a ``Batch`` finds under ``item`` (relative to ``root``).
    For each item without technical metadata a FITS report
    (``<path>.fits.xml``) is attempted; when the FITS command times out, a
    STIF fallback (``<path>.stif.txt``) is written from the output of
    ``stat`` and ``file`` plus the item's md5 and sha256 hashes.

    Returns:
        0 when the whole batch was processed, 131 when the run was
        interrupted with Ctrl-C.

    Raises:
        AssertionError: when an expected output file is missing, a STIF
            helper command reports failure, or the item still has no
            technical metadata afterwards.
        Exception: whatever exception object the FITS command hands back as
            the second element of its result (other than a timeout).
    """
    global logger

    # start of parser boilerplate
    parser = ArgumentParser(
        description="This module is meant to take a batch of files "
                    "(probably an accession in place) and generate the "
                    "technical metadata for it.",
        epilog="Copyright University of Chicago; " +
               "written by " + __author__ +
               " " + __email__)
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative
    # statements will be logged.
    # '-verbose' (single dash) is kept for backward compatibility; the
    # conventional '--verbose' spelling is accepted as well.
    parser.add_argument(
        '-b', '-verbose', '--verbose',
        help="set verbosity for logging to stdout",
        action='store_const', dest='log_level',
        const=INFO, default='INFO'
    )
    # -d is debugging so anything you want to use a debugger gets logged if
    # you use this level
    parser.add_argument(
        '-d', '--debugging', help="set debugging logging",
        action='store_const', dest='log_level',
        const=DEBUG, default='INFO'
    )
    # optionally save the log to a file.
    parser.add_argument(
        '-l', '--log_loc', help="save logging to a file",
        dest="log_loc",
    )
    parser.add_argument(
        '-t', '--timeout',
        help="set a timeout in seconds for any single bash command",
        dest='timeout', default=3600, type=int
    )
    parser.add_argument("item", help="Enter a noid for an accession or a " +
                        "directory path that you need to validate against" +
                        " a type of controlled collection"
                        )
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store"
                        )
    args = parser.parse_args()

    log_format = Formatter(
        "[%(levelname)s] %(asctime)s " +
        "= %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S"
    )
    logger = getLogger(
        "lib.uchicago.repository.logger"
    )
    # The logger itself passes everything; the handlers do the filtering.
    logger.setLevel(DEBUG)
    ch = StreamHandler()
    ch.setFormatter(log_format)
    ch.setLevel(args.log_level)
    logger.addHandler(ch)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)

    try:
        fitscommand = "fits"
        b = Batch(abspath(args.root), abspath(args.item))
        for item in b.find_items(from_directory=True):
            # Skip the metadata files this program itself produces.
            file_name = item.find_file_name()
            if ".fits.xml" in file_name or ".stif.txt" in file_name:
                continue

            path = item.get_file_path()
            item.find_technical_metadata()
            if item.has_technical_md:
                logger.info(path + " already has technical metadata. Continuing.")
                continue

            logger.info("Attempting technical metadata generation for: " + path)
            fitsArgs = [fitscommand, '-i', path, '-o', path + '.fits.xml']
            fitsCommand = BashCommand(fitsArgs)
            fitsCommand.set_timeout(args.timeout)
            try:
                logger.info("Attempting FITS generation for: " + path)
                result = fitsCommand.run_command()
                # The command hands failures back as an exception object in
                # the second slot; re-raise it so a timeout reaches the
                # handler below.
                if isinstance(result[1], Exception):
                    raise result[1]
                assert exists(path + '.fits.xml')
                logger.info("FITS generated for: " + path)
            except TimeoutExpired:
                logger.warning("FITS generation timed out")
                logger.info("Attempting STIF generation")
                statCommand = BashCommand(['stat', path])
                statCommand.set_timeout(args.timeout)
                # NOTE(review): `file -i` prints the MIME type with GNU file;
                # BSD/macOS file spells that flag `-I` -- confirm the target
                # platform.
                mimeCommand = BashCommand(['file', '-i', path])
                mimeCommand.set_timeout(args.timeout)
                fileCommand = BashCommand(['file', path])
                fileCommand.set_timeout(args.timeout)

                # Run the commands outside of `assert`: assert statements are
                # stripped under `python -O`, which would silently skip the
                # commands themselves. Each must succeed before the next runs.
                for command in (statCommand, mimeCommand, fileCommand):
                    succeeded = command.run_command()[0]
                    if not succeeded:
                        raise AssertionError(
                            "STIF helper command failed for: " + path)

                # Compute each hash exactly once, before the file is opened,
                # so a hashing failure does not leave an empty STIF behind.
                md5hash = item.find_md5_hash()
                shahash = item.find_sha256_hash()
                stif_text = (
                    statCommand.get_data()[1].stdout.decode(encoding='UTF-8') +
                    mimeCommand.get_data()[1].stdout.decode(encoding='UTF-8') +
                    fileCommand.get_data()[1].stdout.decode(encoding='UTF-8') +
                    "md5: " + md5hash + '\n' +
                    "sha256: " + shahash
                )
                with open(path + '.stif.txt', 'w') as f:
                    f.write(stif_text)
                assert exists(path + '.stif.txt')
                logger.info("STIF generated for: " + path)

            # Whichever route was taken, the item must now report metadata.
            item.find_technical_metadata()
            assert item.has_technical_md
            logger.info("Technical metadata generation complete for: " + path)
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131