def clobber_document(params, opts):
    """Delete the stored record for a document once the request is confirmed.

    The document is looked up with ``openn_db.get_doc(params)``.  A document
    whose ``is_online`` flag is set is never deleted.  Otherwise ``opts.yes``
    proceeds without asking, ``opts.no`` cancels, and when neither is set the
    user is prompted through ``handle_yes_no_input``.

    Raises OPennException when the document is on-line or ``opts.no`` is set.
    """
    doc = openn_db.get_doc(params)

    # Hand the values to the logger instead of formatting eagerly: the old
    # ``msg % doc.id`` supplied one value for three placeholders (TypeError),
    # and its ``>=`` level guard skipped the message for a DEBUG-level logger.
    logger.info(
        "Preparing to clobber document id: %d, repository: %s, base_dir: %s",
        doc.id, doc.collection, doc.base_dir)

    if doc.is_online:
        msg = "Clobber requested, but refusing to delete record "
        msg += "for document on-line at: %s" % (doc.package_dir, )
        raise OPennException(msg)

    if opts.yes is True:
        logger.info("Deleting existing document.")
    elif opts.no is True:
        raise OPennException("User canceled clobber; no changes made")
    else:
        question = "Proceed with clobber and delete this document?"
        yes_response = "OK. Deleting existing document."
        no_response = "User canceled clobber; no changes made."
        # Per the original note, this raises unless the user enters yes.
        handle_yes_no_input(question, yes_response, no_response)

    # We only get here if it's ok to proceed
    doc.delete()
def redo_document(doc, opts):
    """Remove all image records of ``doc`` once the request is confirmed.

    An on-line document (``doc.is_online``) is refused unless the environment
    variable ``OPENN_REDO_OVERRIDE_ONLINE_HALT`` is set to ``true`` (compared
    case-insensitively).  Otherwise ``opts.yes`` proceeds without asking,
    ``opts.no`` cancels, and when neither is set the user is prompted through
    ``handle_yes_no_input``.

    Raises OPennException when the redo is refused or ``opts.no`` is set.
    """
    # Hand the values to the logger instead of formatting eagerly: the old
    # ``msg % doc.id`` supplied one value for three placeholders (TypeError),
    # and its ``>=`` level guard skipped the message for a DEBUG-level logger.
    logger.info(
        "Preparing to redo document id: %d, repository: %s, base_dir: %s",
        doc.id, doc.collection, doc.base_dir)

    override = str(os.getenv('OPENN_REDO_OVERRIDE_ONLINE_HALT', None)).lower()
    if doc.is_online and override != 'true':
        msg = "Redo requested, but refusing to redo record "
        msg += "for document on-line at: %s" % (doc.package_dir, )
        raise OPennException(msg)

    if opts.yes is True:
        logger.info("Removing images from existing document.")
    elif opts.no is True:
        raise OPennException("User canceled redo; no changes made")
    else:
        question = "Proceed with redo and delete all images?"
        yes_response = "OK. Removing images from existing document."
        no_response = "User canceled redo; no changes made."
        # Per the original note, this raises unless the user enters yes.
        handle_yes_no_input(question, yes_response, no_response)

    # We only get here if it's ok to proceed
    doc.image_set.all().delete()
def bibid_filename(self):
    """Return the path of ``bibid.txt`` inside ``self.source_dir``.

    Raises OPennException when either the source directory or the
    ``bibid.txt`` file inside it does not exist.
    """
    source_dir = self.source_dir
    if os.path.exists(source_dir):
        candidate = os.path.join(source_dir, 'bibid.txt')
        if os.path.exists(candidate):
            return candidate
        raise OPennException("Could not find bibid.txt: %s" % candidate)
    raise OPennException("Could not find source_dir: %s" % source_dir)
def get_config_dict(self, tag):
    """Return the single entry of ``self._configs`` whose 'tag' equals ``tag``.

    Raises OPennException when no entry, or more than one entry, carries
    the tag.
    """
    matches = [cfg for cfg in self._configs if cfg['tag'] == tag]
    count = len(matches)
    if count == 0:
        raise OPennException("Unknown tag: '%s'" % (tag,))
    if count > 1:
        msg = "Invalid repositories config: more than one has tag '%s'"
        raise OPennException(msg % (tag,))
    return matches[0]
def regen_partial_tei(self, doc, **kwargs):
    """Rebuild the partial TEI for ``doc`` from files in ``METADATA_DIR``.

    Keyword arguments:
      METADATA_DIR -- required; directory holding the metadata files.

    Files looked for in METADATA_DIR and copied into ``self.source_dir``
    when present:

      - pages.xlsx     required
      - marc.xml       required unless bibid.txt present
      - holdingid.txt  optional; may be required for Penn MSS (with BibID
                       in TEI)

    bibid.txt is ignored; the BibID should be in the existing TEI.

    Raises OPennException when METADATA_DIR is missing, blank or does not
    exist, or when there is no MARC file and the saved TEI has no BibID.
    """
    data_dir = kwargs.get('METADATA_DIR', None)
    if data_dir is None or data_dir.strip() == '':
        raise OPennException("Missing required METADATA_DIR")
    if not os.path.exists(data_dir):
        raise OPennException("Cannot find METADATA_DIR: '%s'" % (data_dir,))

    for name in ('pages.xlsx', 'marc.xml', 'holdingid.txt'):
        full_path = os.path.abspath(os.path.join(data_dir, name))
        dest = os.path.abspath(os.path.join(self.source_dir, name))
        # Nothing to copy when the file is absent or already in place.
        if os.path.exists(full_path) and full_path != dest:
            shutil.copyfile(full_path, dest)

    tei = OPennTEI(doc.tei_xml)
    bibid = tei.bibid

    # make sure we have the marc.xml file
    if not os.path.exists(self.marc_xml):
        if bibid is None:
            # The original built this exception without raising it, so the
            # run carried on without any MARC data.
            raise OPennException(
                "Saved TEI lacks BibID; required MARC file missing: '%s'"
                % (self.marc_xml,))
        # NOTE(review): IDs not matching NEW_BIBID_RE are wrapped as
        # 99<id>3503681, presumably a newer ID format -- confirm.
        if not self.NEW_BIBID_RE.match(bibid):
            bibid = '99%s3503681' % (str(bibid),)
        self.write_xml(bibid, self.marc_xml)

    # create pages.xml from the page.xlsx
    self.write_openn_xml(self.openn_xml_path)
    # fake the pih.xml by merging pages.xml with marc.xml (from above)
    self.write_pih_xml()
    self.save_rights_data()
    partial_tei_xml = self.gen_partial_tei()
    self.write_partial_tei(self.source_dir, partial_tei_xml)
    self.validate_partial_tei()
    self.stage_marc_xml()

    self.add_removal(self.pih_filename)
    self.add_removal(self.bibid_filename)
    self.add_removal(self.holdingid_filename)
    self.add_removal(self.openn_xml_path)
    self.add_removal(self.xlsx_path)
    self.add_removal(os.path.join(self.source_dir, 'sha1manifest.txt'))
def check_file_names(self, expected):
    """Check that every file name in ``expected`` exists in ``self.source_dir``.

    Raises OPennException when ``expected`` is empty, or when one or more of
    the named files is missing (the message lists all missing names).
    """
    if not expected:
        # The original message interpolated ``pih_xml``, a name that is not
        # defined in this function, so this path died with NameError.  The
        # source directory is the location known here.
        raise OPennException(
            "Penn in Hand XML lists no files: see %s" % self.source_dir)

    missing = [name for name in expected
               if not os.path.exists(os.path.join(self.source_dir, name))]
    if missing:
        smiss = ', '.join(missing)
        raise OPennException("Expected images are missing from %s: %s"
                             % (self.source_dir, smiss))
def main(cmdline=None):
    """op-info

    Expects two arguments, a repository name and a package directory, and
    runs ``validate`` on them.  Returns 0 for a valid package directory and
    1 when validation reports errors; any exception is passed to
    ``parser.error``.
    """
    status = 0
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    setup_logger()
    # Kept from the original; the messages below go through the ``logging``
    # module functions, not this logger.
    logger = logging.getLogger(__name__)

    try:
        if len(args) != 2:
            raise OPennException("Wrong number of arguments")

        repo_name, pkg_dir = args
        errors = validate(repo_name, pkg_dir)
        if len(errors) == 0:
            logging.info("Valid package directory: %s" % (pkg_dir,))
        else:
            logging.error("Errors found checking package directory: %s"
                          % (pkg_dir,))
            for problem in errors:
                logging.error(problem)
            status = 1
    except (OPennException, Exception) as ex:
        # NOTE(review): if this is an optparse parser, error() exits and the
        # assignment below is never reached -- confirm.
        parser.error(str(ex))
        status = 4

    return status
def make_readme_html(readme, opts):
    """Create the HTML page for ``readme`` unless it is skipped.

    The readme is looked up with ``find_readme``; the page is created only
    when ``is_makeable(page, opts)`` is true and ``opts.dry_run`` is false.

    Raises OPennException when the readme is unknown or its template cannot
    be found.
    """
    try:
        readme_dict = find_readme(readme)
        if readme_dict is None:
            raise OPennException("Unknown readme file: %s" % (readme, ))

        page = Page(readme, site_dir(), title=readme_dict['title'])
        if is_makeable(page, opts):
            logging.info("Creating page: %s", page.outfile_path())
            if not opts.dry_run:
                page.create_pages()
        else:
            logging.info("Skipping page: %s", page.outfile_path())
    except TemplateDoesNotExist:
        msg = "Could not find template: %s" % (readme, )
        raise OPennException(msg)
def validate_workbook(self):
    """Validate the metadata workbook found at ``self.xlsx_path``.

    Raises OPennException when the workbook file does not exist; otherwise
    delegates to ``self.workbook().validate()``.
    """
    if os.path.exists(self.xlsx_path):
        self.workbook().validate()
        return
    raise OPennException(
        'Cannot find required metadata workbook: %s' % (self.xlsx_path))
def check_valid(self):
    """
    Confirm that the source dir has a data directory, PARTIAL_TEI.xml,
    and file_list.json

    Each entry of ``PackageDir._required_paths`` maps a display name to the
    attribute of ``self`` holding the path to check.  Raises OPennException
    for the first path that does not exist.
    """
    required = PackageDir._required_paths
    for label in required:
        location = getattr(self, required[label])
        if os.path.exists(location):
            continue
        raise OPennException("No %s found in %s" % (label, self.source_dir))
def copy_current_manifest(doc, source_dir):
    """Make sure a manifest for ``doc`` is present in ``source_dir``.

    Sources are tried in order and the first hit wins:

      1. ``manifest-sha1.txt`` already present in ``source_dir``;
      2. ``doc.manifest_path`` under ``$OPENN_SITE_DIR``;
      3. ``doc.manifest_path`` under ``$OPENN_STAGING_DIR``;
      4. a download from ``https://<settings.OPENN_HOST>/<doc.manifest_path>``.

    Raises KeyError when a needed environment variable is unset and
    OPennException when the download answers 404; other HTTP errors
    propagate unchanged.
    """
    dest_path = os.path.join(source_dir, 'manifest-sha1.txt')
    if os.path.exists(dest_path):
        logger.info("Manifest found in source_dir: %s", dest_path)
        return

    # Read each variable only when its turn comes, so the staging variable
    # is required only if the site copy is missing (as before).
    for env_name in ('OPENN_SITE_DIR', 'OPENN_STAGING_DIR'):
        local_manifest = os.path.join(os.environ[env_name], doc.manifest_path)
        if os.path.exists(local_manifest):
            logger.info("Copying manifest from %s", local_manifest)
            dest = os.path.join(source_dir, os.path.basename(local_manifest))
            shutil.copyfile(local_manifest, dest)
            return

    url = "https://%s/%s" % (settings.OPENN_HOST, doc.manifest_path)
    logger.info("Downloading manifest from %s", url)
    try:
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # The original never closed the HTTP response.
            response.close()
        with open(dest_path, 'w+') as f:
            f.write(data)
    except urllib2.HTTPError as ex:
        if ex.getcode() == 404:
            raise OPennException("Manifest not found at %s" % (url, ))
        # Bare raise keeps the original traceback (``raise ex`` drops it
        # on Python 2).
        raise
def prep_class_parameter(self, name):
    """Return the entry ``name`` of ``self.prep_class_params()``.

    Raises OPennException, with the whole parameter dict dumped as JSON in
    the message, when the lookup raises KeyError.
    """
    try:
        value = self.prep_class_params()[name]
    except KeyError:
        template = "Cannot find prep_class_parameter '%s' in dict %s"
        known = json.dumps(self.prep_class_params())
        raise OPennException(template % (name, known))
    return value
def holdingid_filename(self):
    """Return the path of ``holdingid.txt`` in ``self.source_dir``, or None.

    None is returned when the file does not exist.  Raises OPennException
    when the source directory itself does not exist.
    """
    source_dir = self.source_dir
    if not os.path.exists(source_dir):
        raise OPennException("Could not find source_dir: %s" % source_dir)

    candidate = os.path.join(source_dir, 'holdingid.txt')
    return candidate if os.path.exists(candidate) else None
def validate(self):
    """Run ``self.package_validation`` (when set) against ``self.source_dir``.

    Raises OPennException listing every reported error, one per line, when
    the validator returns any.  Does nothing when no validator is set.
    """
    validator = self.package_validation
    problems = validator.validate(self.source_dir) if validator else []
    if len(problems) > 0:
        header = 'Invalid package directory: %s' % (self.source_dir, )
        raise OPennException('\n'.join([header] + problems))
def _get_prep_config_dict(self, tag):
    """Return the entry of ``self._prep_configs`` stored under ``tag``.

    Raises OPennException, naming the tags from ``self.prep_config_tags()``,
    when the tag is not present.
    """
    try:
        config = self._prep_configs[tag]
    except KeyError:
        template = "Could not find prep_config_dict for tag '%s' (known: %s)"
        raise OPennException(template % (tag, self.prep_config_tags()))
    return config
def folder(self):
    """Return ``long_id()`` of the object returned by ``self.repository()``.

    Raises OPennException when ``self.repository()`` returns None.
    """
    repo = self.repository()
    if repo is None:
        template = "RepositoryWrapper with tag '%s' is not in db; has no folder"
        raise OPennException(template % self.tag)
    return repo.long_id()
def regen_partial_tei(self, doc, **kwargs):
    """Rebuild the partial TEI from ``openn_metadata.xlsx`` in METADATA_DIR.

    Keyword arguments:
      METADATA_DIR -- required; directory holding ``openn_metadata.xlsx``.

    The workbook is copied to ``self.xlsx_path`` (unless it is already that
    file), the OPenn XML and partial TEI are regenerated, page count and
    serial numbers are checked against ``doc``, the TEI is validated and
    the workbook archived.

    Raises OPennException when METADATA_DIR is not supplied.
    """
    data_dir = kwargs.get('METADATA_DIR', None)
    if data_dir is None:
        template = 'METADATA_DIR is required to update TEI (document ID: %d)'
        raise OPennException(template % (self.document.id,))

    # copy the xlsx file into the source_dir as openn_metadata.xlsx
    workbook_src = os.path.abspath(
        os.path.join(data_dir, 'openn_metadata.xlsx'))
    workbook_dest = os.path.abspath(self.xlsx_path)
    if workbook_src != workbook_dest:
        shutil.copyfile(workbook_src, workbook_dest)

    self.write_openn_xml(self.openn_xml_path())
    tei_text = self.gen_partial_tei()
    self.write_partial_tei(self.source_dir, tei_text)

    self.check_page_count(self.openn_xml_path(), doc)
    self.update_serial_numbers(self.openn_xml_path(), doc)
    self.validate_partial_tei()
    self.archive_xlsx()

    # Register the intermediate files for removal.
    self.add_removal(self.openn_xml_path())
    self.add_removal(self.xlsx_path)
def get_bibid(self):
    """Read the BibID from the file at ``self.bibid_filename`` and return it.

    Surrounding whitespace is stripped.  An ID longer than seven digits is
    returned unchanged; a shorter one is wrapped as ``99<id>3503681``.
    NOTE(review): the wrapper presumably converts an old-style ID to a
    newer format -- confirm.

    Raises OPennException when the content is not made of digits only.
    """
    # Use a context manager so the file handle is closed promptly.
    with open(self.bibid_filename) as handle:
        bibid = handle.read().strip()

    if not re.match(r'\d+$', bibid):
        raise OPennException(
            "Bad BibID; expected only digits; found: '%s'" % bibid)

    if len(bibid) > 7:
        return bibid
    return '99%s3503681' % (str(bibid),)
def main(cmdline=None):
    """op-prep main

    Arguments: PREP_CONFIG_TAG DOC_ID [METADATA_DIR].  Looks up the prep
    configuration and the document, creates the output directory
    ``<opts.out_dir>/<doc.base_dir>`` and calls ``OPennPrep().update_tei``.

    An OPennException is reported through ``parser.error``.
    """
    status = 0
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if len(args) < 2 or len(args) > 3:
        parser.error('Wrong number of arguments')

    # Prep config is required, b/c only some prep methods implement
    # TEI regeneration.
    prep_config_tag = args[0]
    doc_id = args[1]
    metadata_dir = args[2] if len(args) > 2 else None

    setup_logger()
    logger = logging.getLogger(__name__)

    try:
        prep_config = get_prep_config(prep_config_tag)
        doc = Document.objects.get(pk=doc_id)
        output_dir = os.path.join(opts.out_dir, doc.base_dir)
        if os.path.exists(output_dir):
            raise OPennException(
                "Output directory already exists: %s" % (output_dir,))

        kwargs = {}
        if metadata_dir is not None:
            if not os.path.exists(metadata_dir):
                raise OPennException(
                    "Cannot find METADATA_DIR: '%s'" % (metadata_dir,))
            kwargs['METADATA_DIR'] = metadata_dir

        # Create the directory only after every input check has passed.
        # Creating it first (as before) left an empty directory behind when
        # METADATA_DIR was wrong, and the retry then failed with
        # "Output directory already exists".
        os.mkdir(output_dir)

        OPennPrep().update_tei(output_dir, doc, prep_config, **kwargs)
    except OPennException as ex:
        if opts.verbose:
            opfunc.print_exc()
        status = 4
        # NOTE(review): if this is an optparse parser, error() exits, so the
        # return below is not reached on failure -- confirm.
        parser.error(str(ex))

    return status
def get_tei(self, document):
    """Return an ``OPennTEI`` built from ``document.tei_xml``.

    An OPennException raised while building it is logged through
    ``self.logger`` and re-raised as a new OPennException carrying the
    document id and base_dir, the original exception and its text.
    """
    try:
        tei = OPennTEI(document.tei_xml)
    except OPennException as oex:
        details = (document.id, document.base_dir)
        msg = "Error processing document: id: %d, base_dir: '%s'" % details
        self.logger.error(msg)
        raise OPennException(msg, oex, str(oex))
    return tei
def image_types(self):
    """Return the 'image_types' entry of ``self._repo_prep_dict``.

    Raises OPennException, with the dict dumped as JSON in the message,
    when the key is absent.
    """
    try:
        return self._repo_prep_dict['image_types']
    except KeyError:
        # The message used to name 'image_type', which is not the key that
        # is actually looked up.
        msg = "Cannot find required PREP_CONFIG parameter 'image_types'"
        msg += " in dict %s"
        msg = msg % (json.dumps(self._repo_prep_dict), )
        raise OPennException(msg)
def get_method_config(self, tag):
    """Return the first entry of ``self._prep_methods`` whose 'tag' is ``tag``.

    Raises OPennException, listing ``self.known_tags()``, when no entry
    matches.
    """
    candidates = (cfg for cfg in self._prep_methods
                  if cfg.get('tag', False) == tag)
    found = next(candidates, None)
    if found is not None:
        return found

    template = "Could not find prep method for tag '%s' (known %s)"
    raise OPennException(template % (tag, ', '.join(self.known_tags())))
def validate(self):
    """Validate every sheet, then fail if metadata errors were recorded.

    Raises OPennException whose message names ``self.xlsx_path`` and lists
    ``self.metadata_errors()``, one per line.
    """
    for sheet in self.sheets():
        sheet.validate()

    if not self.has_metadata_errors():
        return

    header = "Errors found in metadata for workbook: %s" % (self.xlsx_path, )
    raise OPennException('\n'.join([header] + self.metadata_errors()))
def prep_source_dir_arg(source_dir):
    """Return ``source_dir`` without trailing slashes after checking it exists.

    When the argument, ignoring surrounding whitespace, ends with '/', the
    whitespace and all trailing slashes are removed; a path made only of
    slashes becomes '/'.  Any other argument is returned unchanged.

    Raises OPennException when the resulting path does not exist.
    """
    stripped = source_dir.strip()
    if stripped.endswith('/'):
        # The original tested the stripped text but sliced the unstripped
        # one, so 'dir/ ' lost its space instead of its slash, and '/'
        # collapsed to the empty string.
        source_dir = stripped.rstrip('/') or '/'

    if not os.path.exists(source_dir):
        msg = "SOURCE_DIR does not exist: %s" % source_dir
        raise OPennException(msg)
    return source_dir
def validate(self):
    """Check unique and required fields of the configurations.

    Raises OPennException with one indented line per problem, under the
    heading "Errors found in configurations:", when either check reports
    anything.
    """
    problems = []
    problems.extend(self.validate_unique_fields())
    problems.extend(self.validate_required_fields())
    if len(problems) == 0:
        return

    report = ["Errors found in configurations:"]
    report.extend(" %s" % (item) for item in problems)
    raise OPennException("\n".join(report))
def validate_file_lists(self):
    """Validate each sheet's file list, then fail if file errors were recorded.

    Raises OPennException whose message names ``self.xlsx_path`` and lists
    ``self.file_errors()``, one per line.
    """
    for sheet in self.sheets():
        sheet.validate_file_lists()

    if not self.has_file_errors():
        return

    header = "Errors found checking files in workbook: %s" % (self.xlsx_path, )
    raise OPennException('\n'.join([header] + self.file_errors()))
def _sha1_hexdigest(path):
    """Return the SHA-1 hex digest of the bytes of the file at ``path``."""
    digest = hashlib.sha1()
    with open(path, 'rb') as handle:
        digest.update(handle.read())
    return digest.hexdigest()


def rewrite_manifest(doc, source_dir):
    """Refresh the TEI (and MARC) digests in ``manifest-sha1.txt``.

    The manifest in ``source_dir`` is rewritten line by line: the entry for
    ``data/<doc.tei_basename>`` gets the current SHA-1 of that file, the
    entry for ``data/marc.xml`` gets a fresh digest when that file exists,
    and every other entry is written back unchanged.  Lines that do not
    split into a digest and a file name are dropped.

    Raises OPennException when the TEI file is missing or when the manifest
    already holds the current TEI digest.
    """
    manifest_path = os.path.join(source_dir, 'manifest-sha1.txt')
    tei_rel_path = os.path.join('data', doc.tei_basename)
    tei_full_path = os.path.join(source_dir, tei_rel_path)
    marc_rel_path = os.path.join('data', 'marc.xml')
    marc_full_path = os.path.join(source_dir, marc_rel_path)

    if not os.path.exists(tei_full_path):
        raise OPennException("No TEI file found at %s" % (tei_full_path, ))
    tei_digest = _sha1_hexdigest(tei_full_path)

    # Must be bound even without a marc.xml: the rewrite loop below tests
    # it, and the original hit an unbound local when the manifest listed
    # data/marc.xml but the file was absent.
    marc_digest = None
    if os.path.exists(marc_full_path):
        marc_digest = _sha1_hexdigest(marc_full_path)

    with open(manifest_path, "r") as manifest:
        lines = manifest.readlines()

    # make sure we need to update the manifest; the path is escaped so that
    # characters such as '.' only match themselves
    tei_line_re = re.compile(
        r'^%s +%s' % (tei_digest, re.escape(tei_rel_path)))
    for line in lines:
        if tei_line_re.search(line):
            raise OPennException("Manifest already up-to-date")

    with open(manifest_path, "w") as manifest:
        for line in lines:
            parts = re.split(r'\s+', line.strip(), 1)
            if len(parts) < 2:
                continue
            listed = parts[1]
            if listed == tei_rel_path:
                manifest.write("%s %s\n" % (tei_digest, tei_rel_path))
            elif listed == marc_rel_path and marc_digest is not None:
                manifest.write("%s %s\n" % (marc_digest, marc_rel_path))
                logger.info('Writing marc_digest: %s' % (marc_digest, ))
            else:
                manifest.write(line)
def validate_source_dir(prep_method, source_dir):
    """Validate ``source_dir`` using the prep method's package validations.

    Nothing is checked when ``prep_method.package_validations()`` returns
    None.  Otherwise a ``PackageValidation`` is built from those parameters
    and OPennException is raised listing every error it reports, one per
    line.
    """
    params = prep_method.package_validations()
    if params is None:
        return

    problems = PackageValidation(**params).validate(source_dir)
    if len(problems) > 0:
        header = 'Invalid package directory: %s' % (source_dir, )
        raise OPennException('\n'.join([header] + problems))
def gen_partial_tei(self):
    """Run ``op-gen-tei`` on the OPenn XML and return what it prints.

    The command receives ``self.openn_xml_path()`` and ``self._xsl`` as
    arguments.  Raises OPennException, carrying the command's stderr, when
    it exits with a non-zero status.
    """
    command = ['op-gen-tei', self.openn_xml_path(), self._xsl]
    proc = subprocess.Popen(command,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode == 0:
        return stdout_data
    raise OPennException("TEI Generation failed: %s" % stderr_data)
def repository(self):
    """Return the ``Repository`` whose tag is ``self.tag()``.

    The object is fetched once and kept in ``self._repository``.  Raises
    OPennException, listing the tags of all repositories, when none has
    that tag.
    """
    if self._repository is not None:
        return self._repository

    try:
        self._repository = Repository.objects.get(tag=self.tag())
    except Repository.DoesNotExist:
        known = ', '.join([repo.tag for repo in Repository.objects.all()])
        raise OPennException(
            "Could not find repository for tag: %s; repos: %s"
            % (self.tag(), known))
    return self._repository