def main():
    """Group the files of the requested accessions into digital objects.

    Parses the command line, configures the module-level ``logger``, builds
    a query over the ``file`` table restricted to the positional
    ``accessions`` and hands it to a ``Batch`` rooted at ``--root``.  Each
    item of the batch gets its accession and canonical file path set and is
    matched against the object pattern; one ``DigitalObject`` is created per
    distinct identifier.

    Side effects:
        Rebinds the module-level ``logger`` name and attaches handlers to it.

    Returns:
        0 when the batch was processed, 131 when interrupted from the
        keyboard.

    Raises:
        ValueError: if ``--root`` was not supplied.
    """
    global logger

    parser = ArgumentParser(
        description="{description}".format(description=__description__),
        epilog="{copyright}; written by {author} <{email}>.".format(
            copyright=__copyright__, author=__author__, email=__email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # '-verbose' (single dash) is kept for backward compatibility;
    # '--verbose' is the conventional spelling.
    parser.add_argument('-b', '-verbose', '--verbose',
                        help="set verbose logging",
                        action='store_const', dest='log_level', const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level', const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=__file__))
    parser.add_argument('--db_url', help="Enter a db url", action='store')
    parser.add_argument('--root', help="Enter the root of the repository",
                        action='store')
    parser.add_argument('--object_pattern',
                        help="Enter the regex pattern to match an object",
                        action='store')
    parser.add_argument('--page_pattern',
                        help="Enter the regex pattern to match a page",
                        action='store')
    parser.add_argument('accessions', nargs="*", action='store',
                        help="Enter 1 or more accession "
                             "identifiers to process")
    args = parser.parse_args()

    # Logging: always to the console, optionally also to a file.
    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    # Neither -b nor -d given leaves log_level at None; default to INFO.
    logger.setLevel(INFO if args.log_level is None else args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    db = Database(args.db_url, ['record', 'file'])

    # Both classes are declared against the reflected tables even though
    # only File is queried below.
    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)

    class File(db.base):
        __table__ = Table('file', db.metadata, autoload=True)

    query = db.session.query(File).filter(
        File.accession.in_(args.accessions))

    if not args.root:
        raise ValueError("need to include a root")
    batch = Batch(args.root, query=query)
    batch.set_items(batch.find_items(from_db=True))

    # Compiled once, outside the loop.  --object_pattern overrides the
    # built-in pattern when given (it used to be parsed and then ignored).
    # Raw strings keep '\w' from being read as a string escape.
    object_pattern = re_compile(
        args.object_pattern or
        (r"(mvol)/(\w{4})/(\w{4})/(\w{4})/"
         r"(mvol)-(\w{4})-(\w{4})-(\w{4})"))

    try:
        all_objects = []
        for item in batch.get_items():
            item.set_accession(item.find_file_accession())
            item.set_canonical_filepath(item.find_canonical_filepath())
            search_pattern = item.find_matching_object_pattern(
                object_pattern)
            if search_pattern.status == True:
                # NOTE(review): every capture group is joined, so the
                # default pattern yields the directory parts followed by
                # the file-name parts -- confirm this is the intended id.
                potential_identifier = '-'.join(
                    search_pattern.data.groups())
                already_present = any(
                    known.identifier == potential_identifier
                    for known in all_objects)
                if already_present:
                    logger.debug("found this id already")
                else:
                    logger.debug("this id is new!")
                    all_objects.append(DigitalObject(potential_identifier))
                logger.debug(potential_identifier)
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    """Write ORE/EDM/PREMIS statements for each object found in a directory.

    The command line supplies the repository root (``-r``), the directory to
    work on, a regular expression used to filter its files, and the number
    of leading path segments (``-o``) that make up an object identifier.
    Files yielded by ``evaluate_items`` are grouped by that identifier.
    Every group is checked for gaps in its ALTO/TIFF/JPEG page sequences and
    for missing parts.  Its records are then written to standard output in
    this order: proxy, resource map, aggregation, METS resource, PDF
    resource, the per-page records, and finally the provided CHO.

    Side effects:
        Rebinds the module-level ``args`` and ``logger`` names and attaches
        handlers to the logger.

    Returns:
        0 on completion, 131 when interrupted from the keyboard.
    """
    global args, logger

    parser = ArgumentParser(
        description="{description}".format(description=__description__),
        epilog="Copyright University of Chicago; "
               "written by {author}  <{email}> University of Chicago".format(
                   author=__author__, email=__email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # '-verbose' (single dash) is kept for backward compatibility;
    # '--verbose' is the conventional spelling.
    parser.add_argument('-b', '-verbose', '--verbose',
                        help="set verbose logging",
                        action='store_const', dest='log_level', const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level', const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=argv[0]))
    parser.add_argument("-o", "--object_level",
                        help="Enter the level at which object starts",
                        type=int, action='store')
    parser.add_argument("-r", "--root",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path",
                        help="Enter a directory that you need to work on ",
                        action='store')
    parser.add_argument('pattern',
                        help="Enter a pattern to filter files with",
                        action="store")
    args = parser.parse_args()

    # Logging: always to the console, optionally also to a file.
    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    # Neither -b nor -d given leaves log_level at None; default to INFO.
    logger.setLevel(INFO if args.log_level is None else args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    db = Database("sqlite:////media/repo/repository/databases/"
                  "official/repositoryAccessions.db.new",
                  tables_to_bind=['record'])

    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)

    b = Batch(args.root, directory=args.directory_path)
    difference_in_path = relpath(args.directory_path, args.root)
    query = db.session.query(Record.createdate).filter(
        Record.receipt == difference_in_path)
    # NOTE(review): first() presumably returns None when no record has this
    # receipt, which makes the subscript raise TypeError -- confirm whether
    # a friendlier error is wanted.
    createdate = query.first()[0]
    items = b.find_items(from_directory=True,
                         filterable=re_compile(args.pattern))
    b.set_items(items)

    # The definers are regular expressions (the last one only makes sense
    # as one), so they are searched, not tested with ``in``.  For the
    # literal entries a regex search accepts everything the former
    # substring test accepted.
    file_definers = ['dc.xml', 'ALTO', 'TIFF', 'JPEG', 'pdf', 'mets.xml',
                     r'\d{4}.txt']
    definer_matchers = [(p, re_compile(p)) for p in file_definers]
    file_definer_sequences = ['ALTO', 'TIFF', 'JPEG']
    page_number_pattern = re_compile(r'_(\w{4})')

    def resource_path(a_file):
        """Join accession, dirhead and canonical path of *a_file*."""
        return join(a_file.accession, a_file.dirhead,
                    a_file.canonical_filepath)

    def first_with(files, *needles):
        """Return the first file whose canonical path contains every needle.

        Raises IndexError when no file qualifies.
        """
        return [x for x in files
                if all(s in x.canonical_filepath for s in needles)][0]

    try:
        generated_data = evaluate_items(b, createdate)

        # --- group the files by object identifier -----------------------
        objects = {}
        for n in generated_data:
            path_parts = n.canonical_filepath.split('/')
            identifier = "-".join(
                [path_parts[x] for x in range(args.object_level)])
            if any(matcher.search(n.canonical_filepath)
                   for _, matcher in definer_matchers):
                objects.setdefault(identifier, []).append(n)
            else:
                logger.error("{fpath} in {id} could not be matched".format(
                    fpath=n.canonical_filepath, id=identifier))

        for k, v in objects.items():
            # Logged at error level so the identifier precedes the
            # sequence messages below, which do not name the object.
            logger.error(k)
            k_identifier = k
            k_id_part_values = k.split('-')
            logger.info(k_id_part_values)
            k_id_directory = '/'.join(k_id_part_values)
            object_path = join(k_id_directory, k_identifier)

            # --- completeness checks ------------------------------------
            for p in file_definer_sequences:
                sequence = sorted(
                    (int(page_number_pattern.
                         search(x.canonical_filepath).group(1)),
                     x.canonical_filepath)
                    for x in v if p in x.canonical_filepath)
                if not sequence:
                    # No file of this kind at all; the missing-part check
                    # below reports it.
                    continue
                known_complete_page_range = set(range(1, sequence[-1][0]))
                what_is_actually_present = {x[0] for x in sequence}
                difference = sorted(known_complete_page_range -
                                    what_is_actually_present)
                if difference:
                    logger.error(
                        "The sequence part {part} "
                        "is missing pages {pages}".format(
                            part=p,
                            pages=','.join(str(x) for x in difference)))
            for p, matcher in definer_matchers:
                if not any(matcher.search(x.canonical_filepath) for x in v):
                    logger.error("{identifier} missing part {part}".format(
                        identifier=k_identifier, part=p))

            # --- object level resources ---------------------------------
            ldrurl = LDRURL(object_path)
            piurl = PIURL("dig/campub", object_path)
            rightsurl = RightsURL()
            repurl = URL("http://repository.lib.uchicago.edu/")
            collectionurl = URL("ead/ICU.SPCL.CAMPUB")
            dcfile = first_with(v, '.dc.xml')
            proxy = Proxy(resource_path(dcfile))
            pdffile = first_with(v, '.pdf')
            jpegfile = first_with(v, 'JPEG', '_0001')
            pdfresource = WebResource(resource_path(pdffile))
            metsfile = first_with(v, '.mets.xml')
            metsresource = RDFSResource(resource_path(metsfile))

            # Highest page number present in any file name.  A set has no
            # defined order, so max() is used, not the "last" element.
            page_matches = (page_number_pattern.search(
                basename(x.canonical_filepath)) for x in v)
            pages = {int(m.group(1)) for m in page_matches if m}
            numpages = max(pages)

            providedcho = ProvidedCHO(k_id_directory)
            aggregation = Aggregation(k_id_directory)
            rem = ResourceMap(k_id_directory)

            proxy.add_statement("dc:format", TextValue(dcfile.mimetype))
            proxy.add_statement("ore:proxyFor", URL(providedcho.subject))
            proxy.add_statement("ore:proxyIn", URL(aggregation.subject))
            stdout.write(str(proxy))

            providedcho.add_statement("dc:coverage", TextValue("Chicago"))
            providedcho.add_statement("dc:date", DateValue(dcfile.date))
            providedcho.add_statement(
                "edm:year", DateValue(dcfile.date.split('-')[0]))
            providedcho.add_statement("dc:description",
                                      TextValue(dcfile.description))
            providedcho.add_statement("dc:identifier",
                                      TextValue(dcfile.identifier))
            providedcho.add_statement("dc:language", TextValue("en"))
            providedcho.add_statement("dc:rights", rightsurl)
            providedcho.add_statement("dc:title", TextValue(dcfile.title))
            providedcho.add_statement("dc:type", TextValue("text"))
            providedcho.add_statement("edm:type", TextValue("TEXT"))
            providedcho.add_statement("dc:description",
                                      URL(aggregation.subject))
            providedcho.add_statement("dcterms:isPartOf", collectionurl)

            rem.add_statement("dcterms:created", DateValue(createdate))
            rem.add_statement("dcterms:creator", repurl)
            rem.add_statement("ore:describes", URL(aggregation.subject))
            stdout.write(str(rem))

            aggregation.add_statement("edm:aggregatedCHO",
                                      URL(providedcho.subject))
            aggregation.add_statement(
                "edm:dataProvider",
                TextValue("University of Chicago Library"))
            aggregation.add_statement("edm:isShownAt", piurl)
            aggregation.add_statement("edm:isShownBy",
                                      URL(resource_path(pdffile)))
            aggregation.add_statement("edm:object",
                                      URL(resource_path(jpegfile)))
            aggregation.add_statement(
                "edm:provider", TextValue("University of Chicago Library"))
            aggregation.add_statement("dc:rights", rightsurl)
            aggregation.add_statement("ore:isDescribedBy", URL(rem.subject))
            stdout.write(str(aggregation))

            metsresource.add_statement("dc:format",
                                       TextValue(metsfile.mimetype))
            stdout.write(str(metsresource))

            pdfresource.add_statement("dcterms:isFormatOf", ldrurl.subject)
            pdfresource.add_statement("premis:objectIdentifierType",
                                      TextValue("ARK"))
            pdfresource.add_statement("premis:objectIdentifierValue",
                                      URL(pdfresource.subject))
            pdfresource.add_statement("dc:format",
                                      TextValue(pdffile.mimetype))
            pdfresource.add_statement("premis:objectCategory",
                                      TextValue("file"))
            pdfresource.add_statement("premis:compositionLevel",
                                      IntegerValue(0))
            pdfresource.add_statement("premis:messageDigestAlgorithm",
                                      TextValue("SHA-256"))
            pdfresource.add_statement("premis:messageDigest",
                                      TextValue(pdffile.checksum))
            pdfresource.add_statement("premis:messageDigestOriginator",
                                      TextValue("/sbin/sha256"))
            pdfresource.add_statement("premis:size",
                                      IntegerValue(pdffile.file_size))
            pdfresource.add_statement("premis:formatName",
                                      TextValue(pdffile.mimetype))
            pdfresource.add_statement("premis:originalName",
                                      TextValue(pdffile.canonical_filepath))
            pdfresource.add_statement("premis:eventIdentifierType",
                                      TextValue("ARK"))
            pdfresource.add_statement("premis:eventIdentifierValue",
                                      TextValue(pdffile.accession))
            pdfresource.add_statement("premis:eventType",
                                      TextValue("creation"))
            pdfresource.add_statement("premis:eventDateTime",
                                      DateValue(createdate))
            stdout.write(str(pdfresource))

            # --- page level resources -----------------------------------
            for page in range(1, numpages + 1):
                # Every page but the last points at its successor.
                next_page = page + 1 if page != numpages else None
                if next_page:
                    canonical_next_page_name = join(
                        k_id_directory,
                        k_identifier + '_' + str(next_page).zfill(4))
                canonical_page = str(page).zfill(4)
                page_marker = '_' + canonical_page
                page_name = join(k_id_directory,
                                 k_identifier + page_marker)
                logger.info(page_name)
                providedcho.add_statement(
                    "dcterms:hasPart", "<{url}>".format(url=page_name))

                tiffile = first_with(v, 'TIFF', page_marker)
                ocrfile = first_with(v, 'ALTO', page_marker)
                page_jpegfile = first_with(v, 'JPEG', page_marker)

                page_providedcho = ProvidedCHO(page_name)
                page_aggregation = Aggregation(page_name)
                page_rem = ResourceMap(page_name)
                page_webresource = WebResource(resource_path(tiffile))
                page_jpeg = RDFSResource(resource_path(page_jpegfile))

                page_providedcho.add_statement(
                    "dc:description",
                    "<{url}>".format(url=resource_path(ocrfile)))
                page_providedcho.add_statement("dc:language",
                                               TextValue("en"))
                page_providedcho.add_statement("dc:rights", rightsurl)
                page_providedcho.add_statement("dc:type", TextValue("Text"))
                page_providedcho.add_statement("edm:type",
                                               TextValue("TEXT"))
                page_providedcho.add_statement(
                    "dc:title",
                    TextValue("Page {number}".format(number=str(page))))
                page_providedcho.add_statement("dcterms:isPartOf",
                                               URL(providedcho.subject))
                if next_page:
                    page_providedcho.add_statement(
                        "edm:isNextInSequence",
                        URL(join("/", canonical_next_page_name)))
                stdout.write(str(page_providedcho))

                page_aggregation.add_statement(
                    "edm:aggregatedCHO", URL(page_providedcho.subject))
                page_aggregation.add_statement(
                    "edm:dataProvider",
                    TextValue("University of Chicago Library"))
                page_aggregation.add_statement(
                    "edm:isShownBy", URL(page_webresource.subject))
                page_aggregation.add_statement("edm:object",
                                               URL(page_jpeg.subject))
                page_aggregation.add_statement(
                    "edm:provider",
                    TextValue("University of Chicago Library"))
                page_aggregation.add_statement("edm:rights",
                                               URL(rightsurl.subject))
                page_aggregation.add_statement("ore:isDescribedBy",
                                               URL(page_rem.subject))
                stdout.write(str(page_aggregation))

                page_rem.add_statement("dc:created", DateValue(createdate))
                page_rem.add_statement("dcterms:creator",
                                       URL(repurl.subject))
                stdout.write(str(page_rem))

                page_webresource.add_statement(
                    "mix:fileSize", IntegerValue(tiffile.file_size))
                page_webresource.add_statement(
                    "mix:formatName", TextValue(tiffile.mimetype))
                # The technical metadata below is optional on a TIFF file.
                if getattr(tiffile, 'mixchecksum', None):
                    page_webresource.add_statement(
                        "mix:messageDigestAlgorithm", TextValue("MD5"))
                    page_webresource.add_statement(
                        "mix:messageDigest",
                        TextValue(tiffile.mixchecksum))
                if getattr(tiffile, 'imageheight', None):
                    page_webresource.add_statement(
                        "mix:imageHeight",
                        IntegerValue(int(tiffile.imageheight)))
                if getattr(tiffile, 'imagewidth', None):
                    page_webresource.add_statement(
                        "mix:imageWidth",
                        IntegerValue(int(tiffile.imagewidth)))
                if getattr(tiffile, 'bitspersample', None):
                    page_webresource.add_statement(
                        "mix:bitsPerSample",
                        TextValue(tiffile.bitspersample))
                stdout.write(str(page_webresource))

                page_jpegresource = RDFSResource(
                    resource_path(page_jpegfile))
                page_ocrresource = RDFSResource(resource_path(ocrfile))
                page_ocrresource.add_statement("dc:format",
                                               TextValue(ocrfile.mimetype))
                stdout.write(str(page_ocrresource))
                page_jpegresource.add_statement(
                    "dc:format", TextValue(page_jpegfile.mimetype))
                stdout.write(str(page_jpegresource))

            # Written last: the page loop adds the dcterms:hasPart entries.
            stdout.write(str(providedcho))
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    """Print SQL ``insert`` statements for every readable file in a batch.

    Parses the command line, configures the module-level ``logger`` and
    builds a ``Batch`` from the two positional arguments.  For each readable
    item the SHA-256 hash, mime type, size and accession are computed,
    stored on the item, and emitted as one ``insert into file`` statement.
    The statements are wrapped in ``begin transaction;`` / ``commit;`` on
    standard output.  Unreadable items are logged as errors and skipped.

    Side effects:
        Rebinds the module-level ``logger`` name and attaches handlers to it.

    Returns:
        0 on completion, 131 when interrupted from the keyboard (in which
        case no ``commit;`` has been written).
    """
    global logger

    def sql_text(value):
        """Render *value* as a single-quoted SQL string literal.

        Embedded single quotes are doubled, so a quote inside a path can
        neither terminate the literal nor inject further SQL.
        """
        return "'{0}'".format("{0}".format(value).replace("'", "''"))

    parser = ArgumentParser(
        description="{description}".format(description=__description__),
        epilog="{copyright}; written by {name}  <{email}> "
               "University of Chicago".format(copyright=__copyright__,
                                              name=__author__,
                                              email=__email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # '-verbose' (single dash) is kept for backward compatibility;
    # '--verbose' is the conventional spelling.
    parser.add_argument('-b', '-verbose', '--verbose',
                        help="set verbose logging",
                        action='store_const', dest='log_level', const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level', const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=argv[0]))
    parser.add_argument("location_root",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path",
                        help="Enter a directory that you need to work on ",
                        action='store')
    args = parser.parse_args()

    # Logging: always to the console, optionally also to a file.
    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    # Neither -b nor -d given leaves log_level at None; default to INFO.
    logger.setLevel(INFO if args.log_level is None else args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        b = Batch(args.location_root, directory=args.directory_path)
        generator_object = b.find_items(from_directory=True)
        logger.debug(generator_object)
        b.set_items(generator_object)
        stdout.write("begin transaction;\n")
        for a_file in b.get_items():
            if not a_file.test_readability():
                logger.error("{path} could not be read".format(
                    path=a_file.filepath))
                continue
            file_hash = a_file.find_hash_of_file(sha256)
            mime = a_file.find_file_mime_type()
            size = a_file.find_file_size()
            accession = a_file.find_file_accession()
            a_file.set_file_mime_type(mime)
            a_file.set_file_size(size)
            a_file.set_hash(file_hash)
            a_file.set_accession(accession)
            # Text values are escaped single-quoted literals; the size is
            # emitted bare, as a number.
            stdout.write(
                "insert into file (filepath,accession,"
                "mimetype,size,checksum) values ("
                "{path},{accession},{mimetype},"
                "{filesize},{filehash});\n".format(
                    path=sql_text(a_file.filepath),
                    accession=sql_text(a_file.get_accession()),
                    mimetype=sql_text(a_file.get_file_mime_type()),
                    filesize=a_file.get_file_size(),
                    filehash=sql_text(a_file.get_hash())))
        stdout.write("commit;\n")
        return 0
    except KeyboardInterrupt:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Program aborted manually")
        return 131
def main():
    """Print the provided-CHO and aggregation records of each object.

    The command line supplies the repository root (``-r``), the directory to
    work on, a regular expression used to filter its files, and the number
    of leading path segments (``-o``) that make up an object identifier.
    Files yielded by ``evaluate_items`` are grouped by that identifier.
    Every group is checked for gaps in its ALTO/TIFF/JPEG page sequences and
    for missing parts.  Its ``.dc.xml`` and ``.pdf`` files are then passed
    to ``make_providedcho`` and ``make_aggregation``, whose results are
    printed.

    Side effects:
        Rebinds the module-level ``args`` and ``logger`` names and attaches
        handlers to the logger.

    Returns:
        0 on completion, 131 when interrupted from the keyboard.
    """
    global args, logger

    parser = ArgumentParser(
        description="{description}".format(description=__description__),
        epilog="Copyright University of Chicago; "
               "written by {author}  <{email}> University of Chicago".format(
                   author=__author__, email=__email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # '-verbose' (single dash) is kept for backward compatibility;
    # '--verbose' is the conventional spelling.
    parser.add_argument('-b', '-verbose', '--verbose',
                        help="set verbose logging",
                        action='store_const', dest='log_level', const=INFO)
    parser.add_argument('-d', '--debugging', help="set debugging logging",
                        action='store_const', dest='log_level', const=DEBUG)
    parser.add_argument('-l', '--log_loc', help="save logging to a file",
                        action="store_const", dest="log_loc",
                        const='./{progname}.log'.format(progname=argv[0]))
    parser.add_argument("-o", "--object_level",
                        help="Enter the level at which object starts",
                        type=int, action='store')
    parser.add_argument("-r", "--root",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path",
                        help="Enter a directory that you need to work on ",
                        action='store')
    parser.add_argument('pattern',
                        help="Enter a pattern to filter files with",
                        action="store")
    args = parser.parse_args()

    # Logging: always to the console, optionally also to a file.
    log_format = Formatter("[%(levelname)s] %(asctime)s = %(message)s",
                           datefmt="%Y-%m-%dT%H:%M:%S")
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    # Neither -b nor -d given leaves log_level at None; default to INFO.
    logger.setLevel(INFO if args.log_level is None else args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    db = Database("sqlite:////media/repo/repository/databases/"
                  "official/repositoryAccessions.db.new",
                  tables_to_bind=['record'])

    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)

    b = Batch(args.root, directory=args.directory_path)
    difference_in_path = relpath(args.directory_path, args.root)
    query = db.session.query(Record.createdate).filter(
        Record.receipt == difference_in_path)
    # NOTE(review): first() presumably returns None when no record has this
    # receipt, which makes the subscript raise TypeError -- confirm whether
    # a friendlier error is wanted.
    createdate = query.first()[0]
    items = b.find_items(from_directory=True,
                         filterable=re_compile(args.pattern))
    b.set_items(items)

    descriptive_metadata = re_compile('.dc.xml$')
    representation_file = re_compile('.pdf$')
    # The definers are regular expressions (the last one only makes sense
    # as one), so they are searched, not tested with ``in``.  For the
    # literal entries a regex search accepts everything the former
    # substring test accepted.
    file_definers = ['dc.xml', 'ALTO', 'TIFF', 'JPEG', 'pdf', 'mets.xml',
                     r'\d{4}.txt']
    definer_matchers = [(p, re_compile(p)) for p in file_definers]
    file_definer_sequences = ['ALTO', 'TIFF', 'JPEG']
    page_number_pattern = re_compile(r'_(\w{4})')

    try:
        generated_data = evaluate_items(b, createdate)

        # --- group the files by object identifier -----------------------
        objects = {}
        # The path segments behind each identifier are remembered so the
        # per-object loop below uses the object's own values, not whatever
        # the last grouped file left behind.
        id_parts_by_identifier = {}
        for n in generated_data:
            path_parts = n.canonical_filepath.split('/')
            id_part_values = [path_parts[x]
                              for x in range(args.object_level)]
            identifier = "-".join(id_part_values)
            if any(matcher.search(n.canonical_filepath)
                   for _, matcher in definer_matchers):
                objects.setdefault(identifier, []).append(n)
                id_parts_by_identifier.setdefault(identifier,
                                                  id_part_values)
            else:
                logger.error("{fpath} in {id} could not be matched".format(
                    fpath=n.canonical_filepath, id=identifier))

        for k, v in objects.items():
            # --- completeness checks ------------------------------------
            for p in file_definer_sequences:
                sequence = sorted(
                    (int(page_number_pattern.
                         search(x.canonical_filepath).group(1)),
                     x.canonical_filepath)
                    for x in v if p in x.canonical_filepath)
                if not sequence:
                    # No file of this kind at all; the missing-part check
                    # below reports it.
                    continue
                known_complete_page_range = set(range(1, sequence[-1][0]))
                what_is_actually_present = {x[0] for x in sequence}
                difference = sorted(known_complete_page_range -
                                    what_is_actually_present)
                if difference:
                    logger.error(
                        "The sequence part {part} "
                        "is missing pages {pages}".format(
                            part=p,
                            pages=','.join(str(x) for x in difference)))
            for p, matcher in definer_matchers:
                if not any(matcher.search(x.canonical_filepath) for x in v):
                    logger.error("{identifier} missing part {part}".format(
                        identifier=k, part=p))

            # --- records ------------------------------------------------
            i = make_identifier(id_parts_by_identifier[k], v)
            # [0] raises IndexError when the object has no such file.
            metadata = [x for x in v if descriptive_metadata.
                        search(x.canonical_filepath)][0]
            representation = [x for x in v if representation_file.
                              search(x.canonical_filepath)][0]
            providedcho = make_providedcho(i, metadata)
            print(providedcho)
            aggregation = make_aggregation(i, representation)
            print(aggregation)
            logger.info(i)
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131