def generateFileEntries(root, item):
    from os.path import join
    from hashlib import sha256
    from uchicagoldr.batch import Batch

    fileInfoDict = {}
    b = Batch(root, item)
    totalDigitalSize = 0
    # "entry" rather than "item" so the loop does not shadow the parameter
    for entry in b.find_items(from_directory=True):
        itemDict = {}
        entry.set_accession(entry.find_file_accession())
        # The uid is the sha256 of "<accession>/<canonical filepath>"
        uid = sha256(join(
            entry.get_accession(),
            entry.find_canonical_filepath()
        ).encode('utf-8')).hexdigest()
        itemDict['fileSize'] = entry.find_file_size()
        itemDict['fileMime'] = entry.find_file_mime_type()
        itemDict['fileHash'] = entry.find_sha256_hash()
        totalDigitalSize += itemDict['fileSize']
        # Files already converted to a preservation format are flagged stable
        if ".presform" in entry.find_file_name():
            presStable = "True"
        else:
            presStable = "False"
        itemDict['fileStable'] = presStable
        fileInfoDict[uid] = itemDict
    # NOTE: totalDigitalSize is accumulated but not currently returned
    return fileInfoDict
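# A minimal usage sketch for generateFileEntries; the paths here are
# hypothetical, and the call relies only on the Batch/Item API used above.
def _demo_generateFileEntries():
    entries = generateFileEntries("/repo/root", "/repo/root/accession-dir")
    for uid, info in entries.items():
        print(uid, info['fileMime'], info['fileSize'], info['fileStable'])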
def zipConverter(item):
    # Relies on module-level globals set up in main(): root, itemStack, and
    # logger, plus exists() from os.path, BashCommand, and Batch.
    if not exists(item.get_file_path() + '.presform.extracted'):
        # 7z x -o<dir> <archive>: extract the archive with full paths into <dir>
        unzipCommandArgs = ['7z', 'x',
                            '-o' + item.get_file_path() + '.presform.extracted',
                            item.get_file_path()]
        unzipCommand = BashCommand(unzipCommandArgs)
        unzipCommand.run_command()
        print(unzipCommand.read_data())
        # Queue every extracted file for further conversion passes
        b = Batch(root, item.get_file_path() + '.presform.extracted')
        for extracted in b.find_items(from_directory=True):
            itemStack.append(extracted)
        return unzipCommand.get_data()
    else:
        logger.info("Already extracted.")
def createMetaAccession(self):
    metaAccessionDict = {}
    for accession in self.accessions:
        for item in accession.get_items():
            # Clobber entries from previous accessions, keep the newest
            metaAccessionDict[item.get_canonical_filepath()] = \
                item.get_root_path()
    metaAccessionList = []
    for canonicalFilepath in metaAccessionDict:
        metaAccessionList.append(
            metaAccessionDict[canonicalFilepath] + "/" + canonicalFilepath)
    metaAccession = Batch()
    for reconstructedFilePath in metaAccessionList:
        metaAccession.add_item(Item(reconstructedFilePath))
    return metaAccession
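# A self-contained sketch (plain dicts, not the real Item/Batch API) of the
# clobbering behavior above: when two accessions hold the same canonical
# filepath, the later accession's root wins, so the meta-accession reflects
# the newest copy of each file.
def _clobber_demo():
    merged = {}
    for root_path, files in [("/acc_old", ["a.txt", "b.txt"]),
                             ("/acc_new", ["b.txt"])]:
        for canonical in files:
            merged[canonical] = root_path
    assert merged == {"a.txt": "/acc_old", "b.txt": "/acc_new"}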
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="Produce TFIDF numbers for terms in "
                            "the text preservation formats in a batch",
                            epilog="Copyright University of Chicago; " +
                            "written by " + __author__ + " " + __email__)
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO, default=INFO)
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        dest="log_loc")
    parser.add_argument("item",
                        help="Enter a noid for an accession or a directory "
                        "path that you need to validate against a type of "
                        "controlled collection")
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        b = Batch(args.root, args.item)
        textDocs = TextBatch(args.item, args.root)
        for item in b.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                textDoc = TextItem(item.get_file_path(), item.get_root_path())
                textDocs.add_item(textDoc)
        if textDocs.validate_items():
            logger.info("Getting document term indices")
            term_map = {}
            for item in textDocs.get_items():
                item.set_raw_string(item.find_raw_string())
                indexOut = item.find_index(purge_raw=True, scrub_text=True,
                                           term_map=term_map)
                item.set_index(indexOut[0])
                term_map.update(indexOut[1])
            textDocs.set_term_map(term_map)
            logger.info("Getting IDFs")
            textDocs.set_doc_counts(textDocs.find_doc_counts())
            textDocs.set_idfs(textDocs.find_idfs())
            logger.info("Computing TFIDFs")
            textDocs.set_tf_idfs(textDocs.find_tf_idfs())
            textDocs.rev_term_map()
            # Sort each document's terms by descending TFIDF and print the
            # top few as a comma-separated summary
            for key in textDocs.get_tf_idfs():
                print(key)
                tfidfs = []
                for entry in textDocs.get_tf_idfs()[key]:
                    tfidfs.append((entry, textDocs.get_tf_idfs()[key][entry]))
                tfidfs = sorted(tfidfs, key=lambda x: x[1], reverse=True)
                printFirstX = 9
                firstX = tfidfs[0:printFirstX]
                justTerms = []
                for entry in firstX:
                    justTerms.append(textDocs.get_term_map()[entry[0]])
                print(",".join(justTerms) + "\n")
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # Master log instantiation begins #
    global masterLog
    masterLog = MasterLogger()
    # Master log instantiation ends #

    # Application specific log instantiation begins #
    global logger
    logger = masterLog.getChild(__name__)
    f = UserAndIPFilter()
    termHandler = DefaultTermHandler()
    logger.addHandler(termHandler)
    logger.addFilter(f)
    logger.info("BEGINS")
    # Application specific log instantiation ends #

    # Parser instantiation begins #
    parser = ArgumentParser(description="[A brief description of the utility]",
                            epilog="Copyright University of Chicago; " +
                            "written by " + __author__ + " " + __email__)
    parser.add_argument(
        "-v",
        help="See the version of this program",
        action="version",
        version=__version__
    )
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument(
        '-b', '--verbosity',
        help="set logging verbosity (DEBUG,INFO,WARN,ERROR,CRITICAL)",
        nargs='?',
        const='INFO'
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument(
        '-d', '--debugging',
        help="set debugging logging",
        action='store_true'
    )
    # optionally save the log to a file.
    # Set a location or use the default constant
    parser.add_argument(
        '-l', '--log_loc',
        help="save logging to a file",
        dest="log_loc",
    )
    parser.add_argument(
        "item",
        help="Enter a noid for an accession or a directory path that you "
             "need to validate against a type of controlled collection"
    )
    parser.add_argument(
        "root",
        help="Enter the root of the directory path",
        action="store"
    )
    try:
        args = parser.parse_args()
    except SystemExit:
        logger.critical("ENDS: Command line argument parsing failed.")
        exit(1)

    # Begin argument post processing, if required #
    if args.verbosity and args.verbosity not in ['DEBUG', 'INFO', 'WARN',
                                                 'ERROR', 'CRITICAL']:
        logger.critical("You did not pass a valid argument to the verbosity "
                        "flag! Valid arguments include: "
                        "'DEBUG','INFO','WARN','ERROR', and 'CRITICAL'")
        return 1
    if args.log_loc:
        if not exists(split(args.log_loc)[0]):
            logger.critical("The specified log location does not exist!")
            return 1
    # End argument post processing #

    # Begin user specified log instantiation, if required #
    if args.log_loc:
        fileHandler = DefaultFileHandler(args.log_loc)
        logger.addHandler(fileHandler)

    if args.verbosity:
        logger.removeHandler(termHandler)
        termHandler = DefaultTermHandlerAtLevel(args.verbosity)
        logger.addHandler(termHandler)
        if args.log_loc:
            logger.removeHandler(fileHandler)
            fileHandler = DefaultFileHandlerAtLevel(args.log_loc,
                                                    args.verbosity)
            logger.addHandler(fileHandler)

    if args.debugging:
        logger.removeHandler(termHandler)
        termHandler = DebugTermHandler()
        logger.addHandler(termHandler)
        if args.log_loc:
            logger.removeHandler(fileHandler)
            fileHandler = DebugFileHandler(args.log_loc)
            logger.addHandler(fileHandler)
    # End user specified log instantiation #

    try:
        # Begin module code #
        b = Batch(args.root, args.item)
        for item in b.find_items(from_directory=True):
            print(item.get_file_path())
        # End module code #
        logger.info("ENDS: COMPLETE")
        return 0
    except KeyboardInterrupt:
        logger.error("ENDS: Program aborted manually")
        return 131
    except Exception as e:
        logger.critical("ENDS: Exception (" + str(e) + ")")
        return 1
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="Produce TFIDF numbers for terms in "
                            "the text preservation formats in a batch",
                            epilog="Copyright University of Chicago; " +
                            "written by " + __author__ + " " + __email__)
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO, default=INFO)
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        dest="log_loc")
    parser.add_argument("restritem",
                        help="Enter a noid for an accession or a directory "
                        "path that you need to validate against a type of "
                        "controlled collection")
    parser.add_argument("restrroot",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("item",
                        help="Enter a noid for an accession or a directory "
                        "path that you need to validate against a type of "
                        "controlled collection")
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        args.restritem = abspath(args.restritem)
        args.restrroot = abspath(args.restrroot)
        args.item = abspath(args.item)
        args.root = abspath(args.root)

        b = Batch(args.restrroot, args.restritem)
        restrDocs = TextBatch(args.restritem, args.restrroot)
        for item in b.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                textDoc = TextItem(item.get_file_path(), item.get_root_path())
                restrDocs.add_item(textDoc)
        # term_map is shared between the restricted set and the main batch
        term_map = {}
        if restrDocs.validate_items():
            logger.info("Generating language model from provided document set.")
            logger.info("Getting document term indices")
            for item in restrDocs.get_items():
                item.set_raw_string(item.find_raw_string())
                indexOut = item.find_index(purge_raw=True, scrub_text=False,
                                           stem=False, term_map=term_map)
                item.set_index(indexOut[0])
                term_map.update(indexOut[1])
            restrDocs.set_term_map(term_map)
            logger.info("Generating corpus term index")
            restrDocs.set_term_index(restrDocs.find_term_index())
            logger.info("Getting iIDFs")
            restrDocs.set_doc_counts(restrDocs.find_doc_counts())
            restrDocs.set_iIdfs(restrDocs.find_iIdfs())
            logger.info("Computing Language Model")
            restrDocs.set_language_model(restrDocs.find_language_model())
            logger.info("Computing LM VSM")
            restrDocs.set_vector_space_model(
                restrDocs.find_vector_space_model())

        c = Batch(args.root, args.item)
        Docs = TextBatch(args.item, args.root)
        for item in c.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                textDoc = TextItem(item.get_file_path(), item.get_root_path())
                Docs.add_item(textDoc)
        if Docs.validate_items():
            logger.info("Generating TFIDF models for each document in the batch.")
            logger.info("Getting document term indices")
            tote = len(Docs.get_items())
            i = 0
            for item in Docs.get_items():
                i += 1
                # Progress readout on a single rewritten terminal line
                print("\r" + str(i) + "/" + str(tote) + " - " +
                      item.get_file_path(), end="")
                item.set_raw_string(item.find_raw_string())
                indexOut = item.find_index(purge_raw=True, scrub_text=False,
                                           stem=False, term_map=term_map,
                                           only_mapped=True)
                item.set_index(indexOut[0])
            print()
            logger.info("Getting IDFs")
            Docs.set_doc_counts(Docs.find_doc_counts())
            Docs.set_idfs(Docs.find_idfs())
            logger.info("Computing TFIDFs")
            Docs.set_tf_idfs(Docs.find_tf_idfs())
            logger.info("Generating document vector space models.")
            Docs.set_document_vector_space_models(
                Docs.find_document_vector_space_models())
            logger.info("Computing similarity metrics.")
            rels = []
            for document in Docs.get_document_vector_space_models():
                rels.append((document, restrDocs.find_similarity(
                    Docs.get_document_vector_space_models()[document])))
            logger.info("Sorting similarity metrics for output")
            rels = sorted(rels, key=itemgetter(1))
            for entry in rels:
                print(entry[0] + ": " + str(entry[1]))
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="[A brief description of the utility]",
                            epilog="Copyright University of Chicago; " +
                            "written by " + __author__ + " " + __email__)
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO, default=INFO)
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        dest="log_loc")
    parser.add_argument("item",
                        help="Enter a noid for an accession or a directory "
                        "path that you need to validate against a type of "
                        "controlled collection")
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    # BEGIN MAIN HERE - EXAMPLE BELOW
    try:
        b = Batch(args.root, args.item)
        for item in b.find_items(from_directory=True):
            print(item.filepath)
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
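# The template above returns shell-style exit codes (0 on success, 131 on
# manual abort) rather than calling exit() itself, so a script built from it
# would typically be wired up as below (hypothetical entry point, not part of
# the original source):
if __name__ == "__main__":
    from sys import exit as sys_exit
    sys_exit(main())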
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="This module is meant to take a "
                            "batch of files (probably an accession in place) "
                            "and generate the technical metadata for it.",
                            epilog="Copyright University of Chicago; " +
                            "written by " + __author__ + " " + __email__)
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument('-b', '--verbose',
                        help="set verbosity for logging to stdout",
                        action='store_const', dest='log_level',
                        const=INFO, default=INFO)
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        dest="log_loc")
    parser.add_argument('-t', '--timeout',
                        help="set a timeout in seconds for any single bash command",
                        dest='timeout', default=3600, type=int)
    parser.add_argument("item",
                        help="Enter a noid for an accession or a directory "
                        "path that you need to validate against a type of "
                        "controlled collection")
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    logger.setLevel(DEBUG)
    ch = StreamHandler()
    ch.setFormatter(log_format)
    ch.setLevel(args.log_level)
    logger.addHandler(ch)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)

    try:
        fitscommand = "fits"
        b = Batch(abspath(args.root), abspath(args.item))
        for item in b.find_items(from_directory=True):
            # Skip metadata files generated by previous runs
            if ".fits.xml" in item.find_file_name() or \
                    ".stif.txt" in item.find_file_name():
                continue
            item.find_technical_metadata()
            if item.has_technical_md:
                logger.info(item.get_file_path() +
                            " already has technical metadata. Continuing.")
                continue
            else:
                logger.info("Attempting technical metadata generation for: " +
                            item.get_file_path())
                fitsArgs = [fitscommand, '-i', item.get_file_path(),
                            '-o', item.get_file_path() + '.fits.xml']
                fitsCommand = BashCommand(fitsArgs)
                fitsCommand.set_timeout(args.timeout)
                try:
                    logger.info("Attempting FITS generation for: " +
                                item.get_file_path())
                    result = fitsCommand.run_command()
                    if isinstance(result[1], Exception):
                        raise result[1]
                    assert exists(item.get_file_path() + '.fits.xml')
                    logger.info("FITS generated for: " + item.get_file_path())
                except TimeoutExpired:
                    # Fall back to a minimal stat/mime/file/hash (STIF) record
                    logger.warning("FITS generation timed out")
                    logger.info("Attempting STIF generation")
                    statArgs = ['stat', item.get_file_path()]
                    statCommand = BashCommand(statArgs)
                    statCommand.set_timeout(args.timeout)
                    mimeArgs = ['file', '-i', item.get_file_path()]
                    mimeCommand = BashCommand(mimeArgs)
                    mimeCommand.set_timeout(args.timeout)
                    fileArgs = ['file', item.get_file_path()]
                    fileCommand = BashCommand(fileArgs)
                    fileCommand.set_timeout(args.timeout)
                    assert statCommand.run_command()[0]
                    assert mimeCommand.run_command()[0]
                    assert fileCommand.run_command()[0]
                    md5hash = item.find_md5_hash()
                    shahash = item.find_sha256_hash()
                    with open(item.get_file_path() + '.stif.txt', 'w') as f:
                        f.write(
                            statCommand.get_data()[1].stdout.decode(encoding='UTF-8') +
                            mimeCommand.get_data()[1].stdout.decode(encoding='UTF-8') +
                            fileCommand.get_data()[1].stdout.decode(encoding='UTF-8') +
                            "md5: " + md5hash + '\n' +
                            "sha256: " + shahash
                        )
                    assert exists(item.get_file_path() + '.stif.txt')
                    logger.info("STIF generated for: " + item.get_file_path())
                item.find_technical_metadata()
                assert item.has_technical_md
                logger.info("Technical metadata generation complete for: " +
                            item.get_file_path())
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="Program for the conversion of "
                            "uncontrolled accessions into preservation "
                            "stable file formats for ingest into the "
                            "University of Chicago Library Digital Repository",
                            epilog="Copyright University of Chicago; " +
                            "written by Brian Balsamo " +
                            "<*****@*****.**>")
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO, default=INFO)
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./current.log')
    parser.add_argument("item",
                        help="Enter a noid for an accession or a directory "
                        "path that you need to validate against a type of "
                        "controlled collection")
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Beginning")

    # Module-level state shared with the converter functions (e.g. zipConverter)
    global itemStack
    itemStack = []
    global root
    root = args.root

    try:
        if isdir(args.item):
            b = Batch(root, args.item)
            for item in b.find_items(from_directory=True):
                itemStack.append(item)
        if isfile(args.item):
            itemStack.append(Item(args.item, root))
        # itemStack may grow while parsing (e.g. archives queue their contents)
        for item in itemStack:
            logger.info("Parsing " + item.get_file_path())
            parse(item)
            logger.info("Parsing complete on " + item.get_file_path())
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(
        description="A command line utility for staging physical media",
        epilog="Copyright University of Chicago; " +
        "written by " + __author__ + " " + __email__,
    )
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument(
        "-b", "--verbose",
        help="set verbose logging",
        action="store_const",
        dest="log_level",
        const=INFO,
        default=INFO,
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument(
        "-d", "--debugging",
        help="set debugging logging",
        action="store_const",
        dest="log_level",
        const=DEBUG,
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument("-l", "--log_loc", help="save logging to a file",
                        dest="log_loc")
    parser.add_argument(
        "--log_verb",
        help="Set a separate verbosity for the log written to disk, if desired",
        dest="log_verb",
        default=None,
    )
    parser.add_argument(
        "item",
        help="Enter a noid for an accession or a directory path that you "
        "need to validate against a type of controlled collection",
    )
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("dest_root", help="Enter the destination root path",
                        action="store")
    parser.add_argument(
        "containing_folder",
        help="The name of the containing folder on disk (prefix+number)",
        action="store",
    )
    parser.add_argument(
        "--rehash",
        help="Disregard any existing previously generated hashes, "
        "recreate them on this run",
        action="store_true",
    )
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    ch.setLevel(args.log_level)
    logger.setLevel("DEBUG")
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        if args.item[-1] == "/" and args.item != args.root:
            logger.warning("It looks like you may have set the root incorrectly.")
            wrongRootGoAnyways = input("Are you sure you want to continue? (y/n)\n")
            if wrongRootGoAnyways != "y":
                exit(1)

        # The staging area is expected to contain exactly one EAD folder
        # holding exactly one accession-number folder
        shouldBeEAD = getImmediateSubDirs(args.dest_root)
        assert len(shouldBeEAD) == 1
        shouldBeAccNo = getImmediateSubDirs(join(args.dest_root, shouldBeEAD[0]))
        assert len(shouldBeAccNo) == 1
        stageRoot = join(join(args.dest_root, shouldBeEAD[0]), shouldBeAccNo[0])
        destinationAdminRoot = join(stageRoot, "admin/")
        destinationDataRoot = join(stageRoot, "data/")
        containing_folder = args.containing_folder
        destinationAdminFolder = join(destinationAdminRoot, containing_folder)
        destinationDataFolder = join(destinationDataRoot, containing_folder)

        stagingDebugLog = FileHandler(join(destinationAdminFolder, "log.txt"))
        stagingDebugLog.setFormatter(log_format)
        stagingDebugLog.setLevel("DEBUG")
        logger.addHandler(stagingDebugLog)

        logger.debug("Creating batch from original files.")
        originalFiles = Batch(args.root, directory=args.item)
        existingOriginalFileHashes = {}
        originalFileHashes = {}
        logger.info("Hashing original files")
        # Reuse hashes from a previous run unless --rehash was given
        if exists(join(destinationAdminFolder, "fixityFromOrigin.txt")):
            with open(join(destinationAdminFolder, "fixityFromOrigin.txt"), "r") as f:
                if not args.rehash:
                    for line in f.readlines():
                        splitLine = line.split("\t")
                        if splitLine[1] != "ERROR":
                            existingOriginalFileHashes[splitLine[0]] = \
                                [splitLine[1], splitLine[2].rstrip("\n")]
        with open(join(destinationAdminFolder, "fixityFromOrigin.txt"), "a") as f:
            for item in originalFiles.find_items(from_directory=True):
                if item.test_readability():
                    item.set_root_path(args.root)
                    if relpath(item.get_file_path(),
                               start=item.get_root_path()) not in \
                            existingOriginalFileHashes:
                        item.set_sha256(item.find_sha256_hash())
                        item.set_md5(item.find_md5_hash())
                        originalFileHashes[
                            relpath(item.get_file_path(),
                                    start=item.get_root_path())] = \
                            [item.get_sha256(), item.get_md5()]
                else:
                    logger.warning("COULD NOT READ FILE: " + item.get_file_path())
                    originalFileHashes[
                        relpath(item.get_file_path(), start=args.root)] = \
                        ["ERROR", "ERROR"]
            for entry in originalFileHashes:
                f.write(entry + "\t" + originalFileHashes[entry][0] + "\t" +
                        originalFileHashes[entry][1] + "\n")
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    parser = ArgumentParser(description="{description}".format(
                                description=__description__),
                            epilog="{copyright}; ".format(
                                copyright=__copyright__) +
                            "written by {author} <{email}>.".format(
                                author=__author__, email=__email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=__file__))
    parser.add_argument('--db_url', help="Enter a db url", action='store')
    parser.add_argument('--root', help="Enter the root of the repository",
                        action='store')
    parser.add_argument('--object_pattern',
                        help="Enter the regex pattern to match an object",
                        action='store')
    parser.add_argument('--page_pattern',
                        help="Enter the regex pattern to match a page",
                        action='store')
    parser.add_argument('accessions', nargs="*", action='store',
                        help="Enter 1 or more accession identifiers to process")
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        # Neither -b nor -d was passed, so log_level is None
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    db = Database(args.db_url, ['record', 'file'])

    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)

    class File(db.base):
        __table__ = Table('file', db.metadata, autoload=True)

    query = db.session.query(File).filter(File.accession.in_(args.accessions))
    if args.root:
        batch = Batch(args.root, query=query)
        items = batch.find_items(from_db=True)
        batch.set_items(items)
    else:
        raise ValueError("need to include a root")

    try:
        all_objects = []
        for item in batch.get_items():
            accession = item.find_file_accession()
            item.set_accession(accession)
            canon = item.find_canonical_filepath()
            item.set_canonical_filepath(canon)
            search_pattern = item.find_matching_object_pattern(
                re_compile(r"(mvol)/(\w{4})/(\w{4})/(\w{4})/" +
                           r"(mvol)-(\w{4})-(\w{4})-(\w{4})"))
            if search_pattern.status:
                potential_identifier = '-'.join(search_pattern.data.groups())
                is_an_object_already_present = [
                    x for x in all_objects
                    if x.identifier == potential_identifier]
                if is_an_object_already_present:
                    logger.debug("found this id already")
                else:
                    logger.debug("this id is new!")
                    new_object = DigitalObject(potential_identifier)
                    all_objects.append(new_object)
                logger.debug(potential_identifier)
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    parser = ArgumentParser(description="{description}".format(
                                description=__description__),
                            epilog="{copyright}; ".format(
                                copyright=__copyright__) +
                            "written by {name} ".format(name=__author__) +
                            " <{email}> ".format(email=__email__) +
                            "University of Chicago")
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=argv[0]))
    parser.add_argument("location_root",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path",
                        help="Enter a directory that you need to work on",
                        action='store')
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        # Neither -b nor -d was passed, so log_level is None
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        b = Batch(args.location_root, directory=args.directory_path)
        generator_object = b.find_items(from_directory=True)
        logger.debug(generator_object)
        b.set_items(generator_object)
        # Emit one SQL transaction covering every readable file in the batch
        stdout.write("begin transaction;\n")
        for a_file in b.get_items():
            if a_file.test_readability():
                file_hash = a_file.find_hash_of_file(sha256)
                mime = a_file.find_file_mime_type()
                size = a_file.find_file_size()
                accession = a_file.find_file_accession()
                a_file.set_file_mime_type(mime)
                a_file.set_file_size(size)
                a_file.set_hash(file_hash)
                a_file.set_accession(accession)
                out_string = "insert into file (filepath,accession," + \
                             "mimetype,size,checksum) values (" + \
                             "\"{path}\",\"{accession}\",\"{mimetype}\"".format(
                                 path=a_file.filepath,
                                 accession=a_file.get_accession(),
                                 mimetype=a_file.get_file_mime_type()) + \
                             ",{filesize},\"{filehash}\");\n".format(
                                 filesize=a_file.get_file_size(),
                                 filehash=a_file.get_hash())
                stdout.write(out_string)
            else:
                logger.error("{path} could not be read".format(
                    path=a_file.filepath))
        stdout.write("commit;\n")
        return 0
    except KeyboardInterrupt:
        logger.warning("Program aborted manually")
        return 131
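# Shape of the SQL emitted by the loop above, one insert per readable file,
# wrapped in a single transaction (the values here are made up for
# illustration):
#
#   begin transaction;
#   insert into file (filepath,accession,mimetype,size,checksum) values
#   ("mvol/0001/0002/0003/0003_0001.tif","acc123","image/tiff",204800,"ab12...");
#   commit;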
def main():
    parser = ArgumentParser(description="{description}".format(
                                description=__description__),
                            epilog="Copyright University of Chicago; " +
                            "written by {author} ".format(author=__author__) +
                            " <{email}> University of Chicago".format(
                                email=__email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=argv[0]))
    parser.add_argument("-o", "--object_level",
                        help="Enter the level at which object starts",
                        type=int, action='store')
    parser.add_argument("-r", "--root",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path",
                        help="Enter a directory that you need to work on",
                        action='store')
    parser.add_argument('pattern', help="Enter a pattern to filter files with",
                        action="store")
    global args
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        # Neither -b nor -d was passed, so log_level is None
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    current_date = datetime.now()
    isof_current_date = current_date.strftime("%Y-%m-%dT%H:%M:%S")
    sixty_days_ago_date = current_date - timedelta(days=60)
    isof_sixty_days_ago_date = sixty_days_ago_date.strftime("%Y-%m-%dT%H:%M:%S")

    db = Database("sqlite:////media/repo/repository/databases/" +
                  "official/repositoryAccessions.db.new",
                  tables_to_bind=['record'])

    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)

    b = Batch(args.root, directory=args.directory_path)
    difference_in_path = relpath(args.directory_path, args.root)
    query = db.session.query(Record.createdate).filter(
        Record.receipt == difference_in_path)
    createdate = query.first()[0]
    items = b.find_items(from_directory=True,
                         filterable=re_compile(args.pattern))
    b.set_items(items)

    try:
        generated_data = evaluate_items(b, createdate)
        count = 0
        objects = {}
        descriptive_metadata = '.dc.xml$'
        representation_file = '.pdf$'
        mets_file = '.mets.xml$'
        file_definers = ['dc.xml', 'ALTO', 'TIFF', 'JPEG', 'pdf', 'mets.xml',
                         r'\d{4}.txt']
        file_definer_sequences = ['ALTO', 'TIFF', 'JPEG']
        page_number_pattern = r'_(\w{4})'

        # Group the files into objects keyed on an identifier derived from
        # the first object_level path components
        for n in generated_data:
            id_parts = args.pattern.split('/')
            id_parts_enumerated = [x for x in range(args.object_level)]
            id_part_values = [n.canonical_filepath.split('/')[x]
                              for x in id_parts_enumerated]
            identifier = "-".join(id_part_values)
            to_add = None
            for p in file_definers:
                if p in n.canonical_filepath:
                    to_add = n
                    break
            if to_add:
                if objects.get(identifier):
                    objects.get(identifier).append(n)
                else:
                    objects[identifier] = [n]
            else:
                logger.error("{fpath} in {id} could not be matched".format(
                    fpath=n.canonical_filepath, id=identifier))

        for k, v in objects.items():
            logger.error(k)
            k_identifier = k
            k_id_part_values = k.split('-')
            logger.info(k_id_part_values)
            k_id_directory = '/'.join(k_id_part_values)

            # Check each page-numbered sequence (ALTO/TIFF/JPEG) for gaps
            for p in file_definer_sequences:
                sequence = sorted(
                    [(int(re_compile(page_number_pattern).
                          search(x.canonical_filepath).group(1).lstrip('0')),
                      x.canonical_filepath)
                     for x in v if p in x.canonical_filepath])
                known_complete_page_range = [x for x in
                                             range(sequence[-1][0])][1:]
                what_is_actually_present = [x[0] for x in sequence]
                if set(known_complete_page_range) - \
                        set(what_is_actually_present):
                    difference = list(set(known_complete_page_range) -
                                      set(what_is_actually_present))
                    l = [str(x) for x in list(difference)]
                    logger.error("The sequence part {part} ".format(part=p) +
                                 "is missing pages {pages}".format(
                                     pages=','.join(l)))
            for p in file_definers:
                seek = [x for x in v if p in x.canonical_filepath]
                if len(seek) == 0:
                    logger.error("{identifier}".format(
                        identifier=k_identifier) +
                        " missing part {part}".format(part=p))

            ldrurl = LDRURL(join(k_id_directory, k_identifier))
            piurl = PIURL("dig/campub", join(k_id_directory, k_identifier))
            rightsurl = RightsURL()
            repurl = URL("http://repository.lib.uchicago.edu/")
            collectionurl = URL("ead/ICU.SPCL.CAMPUB")
            dcfile = [x for x in v if '.dc.xml' in x.canonical_filepath][0]
            proxy = Proxy(join(dcfile.accession, dcfile.dirhead,
                               dcfile.canonical_filepath))
            pdffile = [x for x in v if '.pdf' in x.canonical_filepath][0]
            jpegfile = [x for x in v if 'JPEG' in x.canonical_filepath and
                        '_0001' in x.canonical_filepath][0]
            pdfresource = WebResource(join(pdffile.accession, pdffile.dirhead,
                                           pdffile.canonical_filepath))
            metsfile = [x for x in v
                        if '.mets.xml' in x.canonical_filepath][0]
            metsresource = RDFSResource(join(metsfile.accession,
                                             metsfile.dirhead,
                                             metsfile.canonical_filepath))
            pages = set([int(re_compile(r'_(\w{4}).*').
                             search(basename(x.canonical_filepath)).
                             group(1).lstrip('0'))
                         for x in v if re_compile(r'_\w{4}.*').
                         search(basename(x.canonical_filepath))])
            numpages = list(pages)[-1]

            providedcho = ProvidedCHO(k_id_directory)
            aggregation = Aggregation(k_id_directory)
            rem = ResourceMap(k_id_directory)

            proxy.add_statement("dc:format", TextValue(dcfile.mimetype))
            proxy.add_statement("ore:proxyFor", URL(providedcho.subject))
            proxy.add_statement("ore:proxyIn", URL(aggregation.subject))
            stdout.write(str(proxy))

            providedcho.add_statement("dc:coverage", TextValue("Chicago"))
            providedcho.add_statement("dc:date", DateValue(dcfile.date))
            providedcho.add_statement("edm:year",
                                      DateValue(dcfile.date.split('-')[0]))
            providedcho.add_statement("dc:description",
                                      TextValue(dcfile.description))
            providedcho.add_statement("dc:identifier",
                                      TextValue(dcfile.identifier))
            providedcho.add_statement("dc:language", TextValue("en"))
            providedcho.add_statement("dc:rights", rightsurl)
            providedcho.add_statement("dc:title", TextValue(dcfile.title))
            providedcho.add_statement("dc:type", TextValue("text"))
            providedcho.add_statement("edm:type", TextValue("TEXT"))
            providedcho.add_statement("dc:description",
                                      URL(aggregation.subject))
            providedcho.add_statement("dcterms:isPartOf", collectionurl)

            rem.add_statement("dcterms:created", DateValue(createdate))
            rem.add_statement("dcterms:creator", repurl)
            rem.add_statement("ore:describes", URL(aggregation.subject))
            stdout.write(str(rem))

            aggregation.add_statement("edm:aggregatedCHO",
                                      URL(providedcho.subject))
            aggregation.add_statement(
                "edm:dataProvider",
                TextValue("University of Chicago Library"))
            aggregation.add_statement("edm:isShownAt", piurl)
            aggregation.add_statement("edm:isShownBy",
                                      URL(join(pdffile.accession,
                                               pdffile.dirhead,
                                               pdffile.canonical_filepath)))
            aggregation.add_statement("edm:object",
                                      URL(join(jpegfile.accession,
                                               jpegfile.dirhead,
                                               jpegfile.canonical_filepath)))
            aggregation.add_statement(
                "edm:provider",
                TextValue("University of Chicago Library"))
            aggregation.add_statement("dc:rights", rightsurl)
            aggregation.add_statement("ore:isDescribedBy", URL(rem.subject))
            stdout.write(str(aggregation))

            metsresource.add_statement("dc:format",
                                       TextValue(metsfile.mimetype))
            stdout.write(str(metsresource))

            pdfresource.add_statement("dcterms:isFormatOf", ldrurl.subject)
            pdfresource.add_statement("premis:objectIdentifierType",
                                      TextValue("ARK"))
            pdfresource.add_statement("premis:objectIdentifierValue",
                                      URL(pdfresource.subject))
            pdfresource.add_statement("dc:format", TextValue(pdffile.mimetype))
            pdfresource.add_statement("premis:objectCategory",
                                      TextValue("file"))
            pdfresource.add_statement("premis:compositionLevel",
                                      IntegerValue(0))
            pdfresource.add_statement("premis:messageDigestAlgorithm",
                                      TextValue("SHA-256"))
            pdfresource.add_statement("premis:messageDigest",
                                      TextValue(pdffile.checksum))
            pdfresource.add_statement("premis:messageDigestOriginator",
                                      TextValue("/sbin/sha256"))
            pdfresource.add_statement("premis:size",
                                      IntegerValue(pdffile.file_size))
            pdfresource.add_statement("premis:formatName",
                                      TextValue(pdffile.mimetype))
            pdfresource.add_statement("premis:originalName",
                                      TextValue(pdffile.canonical_filepath))
            pdfresource.add_statement("premis:eventIdentifierType",
                                      TextValue("ARK"))
            pdfresource.add_statement("premis:eventIdentifierValue",
                                      TextValue(pdffile.accession))
            pdfresource.add_statement("premis:eventType",
                                      TextValue("creation"))
            pdfresource.add_statement("premis:eventDateTime",
                                      DateValue(createdate))
            stdout.write(str(pdfresource))

            # Emit per-page records, chaining each page to the next in sequence
            all_pages = range(1, numpages + 1)
            for n in all_pages:
                if n != all_pages[-1]:
                    next_page = n + 1
                    canonical_next_page = '0' * (4 - len(str(next_page))) + \
                        str(next_page)
                    canonical_next_page_name = join(
                        k_id_directory,
                        k_identifier + '_' + canonical_next_page)
                else:
                    next_page = None
                canonical_page = ('0' * (4 - len(str(n)))) + str(n)
                canonical_page_file_name = k_identifier + '_' + canonical_page
                page_name = join(k_id_directory, canonical_page_file_name)
                logger.info(page_name)
                providedcho.add_statement("dcterms:hasPart",
                                          "<{url}>".format(url=page_name))
                tiffile = [x for x in v if 'TIFF' in x.canonical_filepath and
                           str('_' + canonical_page) in x.canonical_filepath][0]
                ocrfile = [x for x in v if 'ALTO' in x.canonical_filepath and
                           str('_' + canonical_page) in x.canonical_filepath][0]
                jpegfile = [x for x in v if 'JPEG' in x.canonical_filepath and
                            str('_' + canonical_page) in x.canonical_filepath][0]
                page_providedcho = ProvidedCHO(page_name)
                page_aggregation = Aggregation(page_name)
                page_rem = ResourceMap(page_name)
                page_webresource = WebResource(join(
                    tiffile.accession, tiffile.dirhead,
                    tiffile.canonical_filepath))
                page_jpeg = RDFSResource(join(
                    jpegfile.accession, jpegfile.dirhead,
                    jpegfile.canonical_filepath))
                page_ocr = RDFSResource(join(
                    ocrfile.accession, ocrfile.dirhead,
                    ocrfile.canonical_filepath))

                page_providedcho.add_statement(
                    "dc:description",
                    "<{url}>".format(url=join(ocrfile.accession,
                                              ocrfile.dirhead,
                                              ocrfile.canonical_filepath)))
                page_providedcho.add_statement("dc:language", TextValue("en"))
                page_providedcho.add_statement("dc:rights", rightsurl)
                page_providedcho.add_statement("dc:type", TextValue("Text"))
                page_providedcho.add_statement("edm:type", TextValue("TEXT"))
                page_providedcho.add_statement(
                    "dc:title",
                    TextValue("Page {number}".format(number=str(n))))
                page_providedcho.add_statement("dcterms:isPartOf",
                                               URL(providedcho.subject))
                if next_page:
                    page_providedcho.add_statement(
                        "edm:isNextInSequence",
                        URL(join("/", canonical_next_page_name)))
                stdout.write(str(page_providedcho))

                page_aggregation.add_statement("edm:aggregatedCHO",
                                               URL(page_providedcho.subject))
                page_aggregation.add_statement(
                    "edm:dataProvider",
                    TextValue("University of Chicago Library"))
                page_aggregation.add_statement("edm:isShownBy",
                                               URL(page_webresource.subject))
                page_aggregation.add_statement("edm:object",
                                               URL(page_jpeg.subject))
                page_aggregation.add_statement(
                    "edm:provider",
                    TextValue("University of Chicago Library"))
                page_aggregation.add_statement("edm:rights",
                                               URL(rightsurl.subject))
                page_aggregation.add_statement("ore:isDescribedBy",
                                               URL(page_rem.subject))
                stdout.write(str(page_aggregation))

                page_rem.add_statement("dc:created", DateValue(createdate))
                page_rem.add_statement("dcterms:creator", URL(repurl.subject))
                stdout.write(str(page_rem))

                page_webresource.add_statement("mix:fileSize",
                                               IntegerValue(tiffile.file_size))
                page_webresource.add_statement("mix:formatName",
                                               TextValue(tiffile.mimetype))
                # Optional MIX fields are emitted only when present on the record
                if getattr(tiffile, 'mixchecksum', None):
                    page_webresource.add_statement(
                        "mix:messageDigestAlgorithm", TextValue("MD5"))
                    page_webresource.add_statement(
                        "mix:messageDigest", TextValue(tiffile.mixchecksum))
                if getattr(tiffile, 'imageheight', None):
                    page_webresource.add_statement(
                        "mix:imageHeight",
                        IntegerValue(int(tiffile.imageheight)))
                if getattr(tiffile, 'imagewidth', None):
                    page_webresource.add_statement(
                        "mix:imageWidth",
                        IntegerValue(int(tiffile.imagewidth)))
                if getattr(tiffile, 'bitspersample', None):
                    page_webresource.add_statement(
                        "mix:bitsPerSample",
                        TextValue(tiffile.bitspersample))
                stdout.write(str(page_webresource))

                page_jpegresource = RDFSResource(join(
                    jpegfile.accession, jpegfile.dirhead,
                    jpegfile.canonical_filepath))
                page_ocrresource = RDFSResource(join(
                    ocrfile.accession, ocrfile.dirhead,
                    ocrfile.canonical_filepath))
                page_ocrresource.add_statement("dc:format",
                                               TextValue(ocrfile.mimetype))
                stdout.write(str(page_ocrresource))
                page_jpegresource.add_statement("dc:format",
                                                TextValue(jpegfile.mimetype))
                stdout.write(str(page_jpegresource))

            stdout.write(str(providedcho))
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    parser = ArgumentParser(description="{description}".format(
                                description=__description__),
                            epilog="Copyright University of Chicago; " +
                            "written by {author} ".format(author=__author__) +
                            " <{email}> University of Chicago".format(
                                email=__email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=argv[0]))
    parser.add_argument("-o", "--object_level",
                        help="Enter the level at which object starts",
                        type=int, action='store')
    parser.add_argument("-r", "--root",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path",
                        help="Enter a directory that you need to work on",
                        action='store')
    parser.add_argument('pattern', help="Enter a pattern to filter files with",
                        action="store")
    global args
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        # Neither -b nor -d was passed, so log_level is None
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    current_date = datetime.now()
    isof_current_date = current_date.strftime("%Y-%m-%dT%H:%M:%S")
    sixty_days_ago_date = current_date - timedelta(days=60)
    isof_sixty_days_ago_date = sixty_days_ago_date.strftime("%Y-%m-%dT%H:%M:%S")

    db = Database("sqlite:////media/repo/repository/databases/" +
                  "official/repositoryAccessions.db.new",
                  tables_to_bind=['record'])

    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)

    b = Batch(args.root, directory=args.directory_path)
    difference_in_path = relpath(args.directory_path, args.root)
    query = db.session.query(Record.createdate).filter(
        Record.receipt == difference_in_path)
    createdate = query.first()[0]
    items = b.find_items(from_directory=True,
                         filterable=re_compile(args.pattern))
    b.set_items(items)

    try:
        generated_data = evaluate_items(b, createdate)
        count = 0
        objects = {}
        descriptive_metadata = '.dc.xml$'
        representation_file = '.pdf$'
        file_definers = ['dc.xml', 'ALTO', 'TIFF', 'JPEG', 'pdf', 'mets.xml',
                         r'\d{4}.txt']
        file_definer_sequences = ['ALTO', 'TIFF', 'JPEG']
        page_number_pattern = r'_(\w{4})'

        # Group the files into objects keyed on an identifier derived from
        # the first object_level path components
        for n in generated_data:
            id_parts = args.pattern.split('/')
            id_parts_enumerated = [x for x in range(args.object_level)]
            id_part_values = [n.canonical_filepath.split('/')[x]
                              for x in id_parts_enumerated]
            identifier = "-".join(id_part_values)
            to_add = None
            for p in file_definers:
                if p in n.canonical_filepath:
                    to_add = n
                    break
            if to_add:
                if objects.get(identifier):
                    objects.get(identifier).append(n)
                else:
                    objects[identifier] = [n]
            else:
                logger.error("{fpath} in {id} could not be matched".format(
                    fpath=n.canonical_filepath, id=identifier))

        for k, v in objects.items():
            # Check each page-numbered sequence (ALTO/TIFF/JPEG) for gaps
            for p in file_definer_sequences:
                sequence = sorted(
                    [(int(re_compile(page_number_pattern).
                          search(x.canonical_filepath).group(1).lstrip('0')),
                      x.canonical_filepath)
                     for x in v if p in x.canonical_filepath])
                known_complete_page_range = [x for x in
                                             range(sequence[-1][0])][1:]
                what_is_actually_present = [x[0] for x in sequence]
                if set(known_complete_page_range) - \
                        set(what_is_actually_present):
                    difference = list(set(known_complete_page_range) -
                                      set(what_is_actually_present))
                    l = [str(x) for x in list(difference)]
                    logger.error("The sequence part {part} ".format(part=p) +
                                 "is missing pages {pages}".format(
                                     pages=','.join(l)))
            for p in file_definers:
                seek = [x for x in v if p in x.canonical_filepath]
                if len(seek) == 0:
                    logger.error("{identifier}".format(identifier=k) +
                                 " missing part {part}".format(part=p))
            i = make_identifier(id_part_values, v)
            metadata = [x for x in v
                        if re_compile(descriptive_metadata).
                        search(x.canonical_filepath)][0]
            representation = [x for x in v
                              if re_compile(representation_file).
                              search(x.canonical_filepath)][0]
            providedcho = make_providedcho(i, metadata)
            print(providedcho)
            aggregation = make_aggregation(i, representation)
            print(aggregation)
            logger.info(i)
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="A command line utility for staging "
                            "physical media",
                            epilog="Copyright University of Chicago; " +
                            "written by " + __author__ + " " + __email__)
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument('-b', '--verbose', help="set verbose logging",
                        action='store_const', dest='log_level',
                        const=INFO, default=INFO)
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level',
                        const=DEBUG)
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        dest="log_loc")
    parser.add_argument('--log_verb',
                        help="Set a separate verbosity for the log written "
                        "to disk, if desired",
                        dest="log_verb", default=None)
    parser.add_argument("dest_root", help="Enter the destination root path",
                        action='store')
    parser.add_argument("containing_folder",
                        help="The name of the containing folder on disk "
                        "(prefix+number)",
                        action='store')
    parser.add_argument("--rehash",
                        help="Disregard any existing previously generated "
                        "hashes, recreate them on this run",
                        action="store_true")
    args = parser.parse_args()

    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    ch.setLevel(args.log_level)
    logger.setLevel('DEBUG')
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        # The staging area is expected to contain exactly one EAD folder
        # holding exactly one accession-number folder
        shouldBeEAD = getImmediateSubDirs(args.dest_root)
        assert len(shouldBeEAD) == 1
        shouldBeAccNo = getImmediateSubDirs(join(args.dest_root, shouldBeEAD[0]))
        assert len(shouldBeAccNo) == 1
        stageRoot = join(join(args.dest_root, shouldBeEAD[0]), shouldBeAccNo[0])
        destinationAdminRoot = join(stageRoot, 'admin/')
        destinationDataRoot = join(stageRoot, 'data/')
        containing_folder = args.containing_folder
        destinationAdminFolder = join(destinationAdminRoot, containing_folder)
        destinationDataFolder = join(destinationDataRoot, containing_folder)

        stagingDebugLog = FileHandler(join(destinationAdminFolder, 'log.txt'))
        stagingDebugLog.setFormatter(log_format)
        stagingDebugLog.setLevel('DEBUG')
        logger.addHandler(stagingDebugLog)

        logger.debug("Creating batch from moved files.")
        movedFiles = Batch(args.dest_root, directory=destinationDataFolder)
        existingMovedFileHashes = {}
        movedFileHashes = {}
        logger.info("Hashing copied files.")
        # Reuse hashes from a previous run unless --rehash was given
        if exists(join(destinationAdminFolder, 'fixityInStaging.txt')):
            with open(join(destinationAdminFolder, 'fixityInStaging.txt'), 'r') as f:
                if not args.rehash:
                    for line in f.readlines():
                        splitLine = line.split('\t')
                        if splitLine[1] != "ERROR":
                            existingMovedFileHashes[splitLine[0]] = \
                                [splitLine[1], splitLine[2].rstrip('\n')]
        with open(join(destinationAdminFolder, 'fixityInStaging.txt'), 'a') as f:
            for item in movedFiles.find_items(from_directory=True):
                if item.test_readability():
                    item.set_root_path(destinationDataFolder)
                    if relpath(item.get_file_path(),
                               start=item.get_root_path()) not in \
                            existingMovedFileHashes:
                        item.set_sha256(item.find_sha256_hash())
                        item.set_md5(item.find_md5_hash())
                        movedFileHashes[
                            relpath(item.get_file_path(),
                                    start=destinationDataFolder)] = \
                            [item.get_sha256(), item.get_md5()]
                else:
                    logger.warning("COULD NOT READ FILE: " + item.get_file_path())
                    movedFileHashes[
                        relpath(item.get_file_path(),
                                start=destinationDataFolder)] = \
                        ["ERROR", "ERROR"]
            for entry in movedFileHashes:
                f.write(entry + "\t" + movedFileHashes[entry][0] + '\t' +
                        movedFileHashes[entry][1] + '\n')
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131