class PdfToTextTest(unittest.TestCase):
    """Tests for pdf_to_text extraction, both from a local file and from
    a Fedora datastream."""

    # directory containing test fixture files, relative to this module
    fixture_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fixtures')
    pdf_filepath = os.path.join(fixture_dir, 'test.pdf')
    # expected text content of the fixture PDF
    pdf_text = 'This is a short PDF document to use for testing.'

    def setUp(self):
        """Ingest the fixture PDF into Fedora as a TestPdfObject."""
        self.repo = Repository(settings.FEDORA_ROOT, settings.FEDORA_USER, settings.FEDORA_PASSWORD)
        with open(self.pdf_filepath) as pdf:
            self.pdfobj = self.repo.get_object(type=TestPdfObject)
            self.pdfobj.label = 'eulindexer test pdf object'
            self.pdfobj.pdf.content = pdf
            self.pdfobj.save()

    def tearDown(self):
        """Remove the ingested test object from the repository."""
        self.repo.purge_object(self.pdfobj.pid)

    def test_file(self):
        # extract text from a pdf from a file on the local filesystem
        text = pdf_to_text(open(self.pdf_filepath, 'rb'))
        self.assertEqual(self.pdf_text, text)

    def test_object_datastream(self):
        # extract text from a pdf datastream in fedora
        pdfobj = self.repo.get_object(self.pdfobj.pid, type=TestPdfObject)
        text = pdf_to_text(pdfobj.pdf.content)
        self.assertEqual(self.pdf_text, text)
class Command(BaseCommand):
    '''This command runs through all the articles and makes sure that
    journal titles and publishers match against Sherpa Romeo.'''
    args = "[netid netid ...]"
    help = __doc__
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Fixed all caps title in articles'),
    )

    def handle(self, *args, **options):
        """Attach every article to the OpenEmory collection and add an
        ark identifier to its DC, saving each modified object."""
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = Repository(settings.FEDORA_ROOT,
                               username=settings.FEDORA_MANAGEMENT_USER,
                               password=settings.FEDORA_PASSWORD)
        pid_set = self.repo.get_objects_with_cmodel(
            Publication.ARTICLE_CONTENT_MODEL, type=Publication)
        coll = self.repo.get_object(pid=settings.PID_ALIASES['oe-collection'])

        try:
            articles = Paginator(pid_set, 100)
        except Exception as e:
            # NOTE: format the exception itself instead of e.message --
            # e.message is not set on all exception types
            self.output(0, "Error paginating items: %s " % e)

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue
                    # report progress at high verbosity instead of the
                    # bare debug print statements the original left in
                    self.output(2, "Processing %s" % article.pid)
                    article.collection = coll
                    ark_uri = '%sark:/25593/%s' % (settings.PIDMAN_HOST,
                                                   article.pid.split(':')[1])
                    article.dc.content.identifier_list.extend([ark_uri])
                    article.save()
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e))

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
def get_queryset(self):
    """Look up the requested volume and return its solr page results."""
    repo = Repository(request=self.request)
    self.repo = repo
    # store the volume for use in get_context_data
    volume = repo.get_object(self.kwargs['pid'], type=Volume)
    self.vol = volume
    if not (volume.exists and volume.is_a_volume):
        raise Http404
    return volume.find_solr_pages()
def main(argv): pids = [] s = solr.SolrConnection('%s/solr' % HOST) repo = Repository(root='%s/fedora/' % HOST, username='******' % fedoraUser, password='******' % fedoraPass) results = repo.risearch.sparql_query('PREFIX dime: <http://dimenovels.org/ontology#> select ?pid where {?pid <fedora-rels-ext:isMemberOfCollection> <info:fedora/dimenovels:fame> . OPTIONAL { ?pid dime:IsCopyOf ?copy } FILTER (! BOUND(?copy)) }') for row in results: for k, v in row.items(): pids.append(v.replace('info:fedora/', '')) with open('C:/Users/a1691506/Desktop/ffw_editions.csv', mode='r') as infile: reader = csv.reader(infile) editionDict = {rows[0]:rows[1] for rows in reader} for p in pids: print "Processing %s" % p try: response = s.query('PID:"%s"' % p) for hit in response.results: number = hit['mods_series_number_ms'][0].split(' ')[1] editionUri = editionDict[number] obj = repo.get_object(p) obj.add_relationship('http://dimenovels.org/ontology#IsCopyOf', editionUri) obj.save() except: print "%s failed. Check it!" % p continue
def process(self, input):
    """Save the input image into the configured fedora datastream.

    Returns the input unchanged so downstream nodes can keep working.
    """
    if input is None:
        return

    params = self._params
    repo = Repository(params.get("url"), params.get("username"),
                      params.get("password"))
    fmt_upper = params.get("format").upper()
    try:
        buf = StringIO()
        Image.fromarray(input).save(buf, fmt_upper)
    except IOError:
        raise exceptions.NodeError(
            "Error obtaining image buffer in format: %s" % fmt_upper, self)
    proxy_class = get_fedora_proxy_class(params.get("dsid"))
    obj = repo.get_object(params.get("pid"), type=proxy_class)
    obj.DATASTREAM.content = buf
    obj.DATASTREAM.label = "Test Ingest Datastream 1"
    obj.DATASTREAM.mimetype = "image/%s" % params.get("format")
    obj.save()
    return input
def get(self, request):
    """unAPI handler: list available formats, or return one format for
    a specific item when both id and format are supplied."""
    context = {}
    item_id = request.GET.get('id', None)
    fmt = request.GET.get('format', None)
    if item_id is None:
        # no item requested: advertise formats for all items
        # NOTE: if multiple classes, should be able to combine the formats
        context['formats'] = Volume.unapi_formats
    else:
        context['id'] = item_id
        repo = Repository(request=self.request)
        # generalized class-based view would need probably a get-item method
        # for repo objects, could use type-inferring repo variant
        obj = repo.get_object(item_id, type=Volume)
        formats = obj.unapi_formats
        if fmt is None:
            # list formats available for this item only
            context['formats'] = formats
        else:
            requested = formats[fmt]
            # return the requested serialization for this item directly
            method = getattr(obj, requested['method'])
            return HttpResponse(method(), content_type=requested['type'])
    # NOTE: doesn't really even need to be a template, could be generated
    # with eulxml just as easily if that simplifies reuse
    return render(request, 'books/unapi_format.xml', context,
                  content_type='application/xml')
def volume_modified(request, pid):
    'last modification time for a single volume'
    solr = solr_interface()
    results = solr.query(content_model=VolumeV1_0.VOLUME_CONTENT_MODEL,
                         pid=pid).sort_by('-timestamp').field_limit('timestamp')
    # NOTE: the solr index timestamp is used instead of the object's own
    # modification time: if an object's index has changed it may have
    # been modified, and a volume's index timestamp is also updated
    # when pages are added
    latest_note = None
    # for a logged-in user, the page should appear modified whenever
    # the visible annotation count changes
    if request.user.is_authenticated():
        # initializing the volume here is cheap: no api calls are made;
        # the object is only needed for its uri / associated annotations
        vol = Repository().get_object(pid, type=Volume)
        latest_note = vol.annotations().visible_to(request.user) \
                         .last_created_time()
    solrtime = results[0]['timestamp'] if results.count() else None
    return solrtimestamp_or_datetime(solrtime, latest_note)
def remove_test_objects(self):
    """Purge leftover test objects before or after running tests.

    NOTE: expects to be called only when FEDORA_PIDSPACE has been
    switched to a test pidspace; uses test fedora credentials when set.
    """
    test_root = getattr(settings, 'FEDORA_TEST_ROOT', None)
    test_user = getattr(settings, 'FEDORA_TEST_USER', None)
    test_pass = getattr(settings, 'FEDORA_TEST_PASSWORD', None)
    repo = Repository(root=test_root, username=test_user, password=test_pass)
    removed = 0
    for obj in repo.find_objects(pid__contains='%s:*' % settings.FEDORA_PIDSPACE):
        # if objects are unexpectedly not being cleaned up, logging the
        # pid may help isolate which test created the leftovers
        try:
            repo.purge_object(obj.pid, "removing test object")
            # NOTE: not displaying label because we may not have permission to access it
            logger.info('Purged test object %s' % obj.pid)
            removed += 1
        except RequestFailed:
            logger.warn('Error purging test object %s' % obj.pid)
    if removed:
        print >> sys.stderr, "Removed %s test object(s) with pidspace %s" \
            % (removed, settings.FEDORA_PIDSPACE)
def purge_item(item_id): repo = Repository() pid = 'pitt:%s' % (item_id,) objs = repo.find_objects(pid__contains=pid) for o in objs: repo.purge_object(o.pid) print '%s purged' % (o.pid,)
def download_file(pid, dsid):
    """Download a datastream into a named tempfile, showing progress."""
    repo = Repository(testsettings.FEDORA_ROOT_NONSSL, testsettings.FEDORA_USER,
                      testsettings.FEDORA_PASSWORD)
    ds = repo.get_object(pid).getDatastreamObject(dsid)
    widgets = ['Download: ',
               progressbar.widgets.Percentage(), ' ',
               progressbar.widgets.Bar(), ' ',
               progressbar.widgets.ETA(), ' ',
               progressbar.widgets.FileTransferSpeed()]
    # initial progressbar size is the datastream size; actual transfer is
    # slightly larger because of multipart boundary content
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=ds.size).start()
    # stream the content into a tempfile
    tmpfile = tempfile.NamedTemporaryFile(prefix='%s-%s_' % (pid, dsid),
                                          delete=False)
    print('writing to ', tmpfile.name)
    bytes_read = 0
    try:
        for chunk in ds.get_chunked_content():
            bytes_read += len(chunk)
            pbar.update(bytes_read)
            tmpfile.write(chunk)
    except Exception:
        raise
def datastream_etag(request, pid, dsid, type=None, repo=None,
                    accept_range_request=False, **kwargs):
    """Method suitable for use as an etag function with
    :class:`django.views.decorators.http.condition`.  Takes the same
    arguments as :meth:`~eulfedora.views.raw_datastream`.
    """
    # if a range is requested and it is not for the entire file,
    # do *NOT* return an etag
    range_header = request.META.get("HTTP_RANGE", None)
    if accept_range_request and range_header and range_header != "bytes=1-":
        return None
    try:
        if repo is None:
            repo = Repository()
        opts = {"type": type} if type is not None else {}
        ds = repo.get_object(pid, **opts).getDatastreamObject(dsid)
        if ds and ds.exists and ds.checksum_type != "DISABLED":
            return ds.checksum
    except RequestFailed:
        pass
    return None
def setUp(self):
    """Ingest one text file object and one image object for view tests."""
    # instantiate repo_admin the first time we run, after the test settings are in place
    if self.repo_admin is None:
        self.repo_admin = Repository(
            username=getattr(settings, 'FEDORA_TEST_USER', None),
            password=getattr(settings, 'FEDORA_TEST_PASSWORD', None))
    self.client = Client()

    # create a file object to edit
    with open(self.ingest_fname) as ingest_f:
        file_obj = self.repo_admin.get_object(type=FileObject)
        file_obj.dc.content.title = file_obj.label = 'Test file object'
        file_obj.dc.content.date = '2011'
        file_obj.master.content = ingest_f
        file_obj.master.label = 'hello-world.txt'
        file_obj.master.checksum = self.ingest_md5sum
        file_obj.save()
        self.obj = file_obj
    self.edit_url = reverse('file:edit', kwargs={'pid': self.obj.pid})
    self.download_url = reverse('file:download', kwargs={'pid': self.obj.pid})
    self.view_url = reverse('file:view', kwargs={'pid': self.obj.pid})

    # create a image object for testing
    with open(self.image_fname) as ingest_f:
        image_obj = self.repo_admin.get_object(type=FileObject)
        image_obj.dc.content.title = image_obj.label = 'Test file object'
        image_obj.master.content = ingest_f
        image_obj.master.label = 'test.jpg'
        image_obj.master.checksum = self.image_md5sum
        image_obj.save()
        self.imgobj = image_obj

    self.pids = [self.obj.pid, self.imgobj.pid]
def postcard_image(request, pid, size):
    '''Link to postcard image in requested size.

    :param pid: postcard object pid
    :param size: size to return, one of thumbnail, medium, or large
    '''
    # NOTE: formerly this served out actual image content, via fedora
    # dissemination & djatoka; images now use an IIIF image server, so
    # permanent redirects are issued here for the benefit of search
    # engines or indexes referencing the old urls
    try:
        obj = Repository().get_object(pid, type=ImageObject)
        if not obj.exists:
            raise Http404
        if size == 'thumbnail':
            url = obj.thumbnail_url
        elif size == 'medium':
            url = obj.medium_img_url
        elif size == 'large':
            url = obj.large_img_url
        return HttpResponsePermanentRedirect(url)
    except RequestFailed:
        raise Http404
def handle(self, *pids, **options):
    """Update pidman ARK targets for book pages, repointing targets from
    the old /books/pages/ url scheme to each page's current absolute url.

    Tracks updated / error / not-found counts and supports clean
    interruption via SIGINT.
    """
    # bind a handler for interrupt signal
    signal.signal(signal.SIGINT, self.interrupt_handler)
    verbosity = int(options.get('verbosity', self.v_normal))
    repo = Repository()
    try:
        pidman = DjangoPidmanRestClient()
    except Exception as err:
        # error if pid manager config options not in localsettings
        raise CommandError(err)
    old_page_target = '%s/books/pages/' % Site.objects.get_current().domain
    search_args = {'type': 'ark', 'target': old_page_target, 'count': 10}
    # get a small result set to retrieve the total
    results = pidman.search_pids(**search_args)
    total = results['results_count']
    # then set a larger page size for actual processing
    search_args['count'] = 100
    if verbosity >= self.v_normal:
        print 'Found %d total page ARKs with targets to be updated' % total
    pbar = ProgressBar(widgets=[Percentage(), ' (', Counter(), ')', Bar(), ETA()],
                       maxval=total).start()
    self.stats = defaultdict(int)
    # set of ark pids already handled; also drives the progress bar
    self.processed = set()
    for ark in self.get_search_results(pidman, search_args):
        self.processed.add(ark['pid'])
        # get fedora pid from target uri
        target_uri = ark['targets'][0]['target_uri']
        baseurl, pid = target_uri.rstrip('/').rsplit('/', 1)
        try:
            page = repo.get_object(pid, type=Page)
            # this should probably only happen in dev/qa
            if not page.exists:
                if verbosity > self.v_normal:
                    self.stderr.write('Page %s does not exist' % pid)
                self.stats['notfound'] += 1
            else:
                # check if volume exists?
                pidman.update_ark_target(ark['pid'], target_uri=page.absolute_url)
                self.stats['updated'] += 1
        except RequestFailed as rf:
            print 'Error accessing %s: %s' % (pid, rf)
            self.stats['error'] += 1
        pbar.update(len(self.processed))
        # stop processing promptly when the user hit ctrl-c
        if self.interrupted:
            break
    if not self.interrupted:
        pbar.finish()
    # summarize
    self.stderr.write('Updated %(updated)d, %(error)d error(s), %(notfound)d not found' \
        % self.stats)
def datastream_etag(request, pid, dsid, type=None, repo=None,
                    accept_range_request=False, **kwargs):
    '''Method suitable for use as an etag function with
    :class:`django.views.decorators.http.condition`.  Takes the same
    arguments as :meth:`~eulfedora.views.raw_datastream`.
    '''
    # if a range is requested and it is not for the entire file,
    # do *NOT* return an etag
    if accept_range_request:
        requested_range = request.META.get('HTTP_RANGE', None)
        if requested_range and requested_range != 'bytes=1-':
            return None
    try:
        repository = repo if repo is not None else Repository()
        if type is not None:
            obj = repository.get_object(pid, type=type)
        else:
            obj = repository.get_object(pid)
        ds = obj.getDatastreamObject(dsid)
        if ds and ds.exists and ds.checksum_type != 'DISABLED':
            return ds.checksum
    except RequestFailed:
        pass
    return None
def main(argv):
    """Generate a plain-text FULL_TEXT datastream for every Philologic object.

    Iterates all objects with the niu-objects content model, strips markup
    from each OBJ datastream, and saves the result as a FULL_TEXT datastream.
    """
    # Make Fedora connection
    repo = Repository(root='http://localhost:8080/fedora/', username='******', password='******')
    # Retrieve pids using content model
    philologic_pids = repo.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel')
    # Loop through Philologic pids and retrieve each object
    for p in philologic_pids:
        print 'Processing %s' % p
        # Extract the text from the OBJ datastream
        philologic = p.getDatastreamObject('OBJ').content
        text = strip_tags(philologic)
        # Add (or replace) the FULL_TEXT datastream with the stripped text
        full_text = p.getDatastreamObject('FULL_TEXT')
        full_text.label = 'Full text'
        full_text.mimetype = 'text/plain'
        full_text.versionable = True
        full_text.state = 'A'
        full_text.checksum_type = 'MD5'
        full_text.content = text
        full_text.save()
class ThesisBase(unittest.TestCase):
    """Base class for testing the functionality of the ETD Django
    application."""

    def __init__(self, *args, **kwargs):
        unittest.TestCase.__init__(self, *args, **kwargs)
        # pids ingested by a test; purged again in tearDown
        self.fedora_fixtures_ingested = []
        self.pidspace = FEDORA_PIDSPACE

    def setUp(self):
        """Create an `eulfedora` Repository instance for testing the
        basic functionality of ingesting a thesis object into Fedora."""
        self.repo = Repository()
        # self.repo = Repository(FEDORA_ROOT,FEDORA_USER,FEDORA_PASSWORD)
        self.repo.risearch.RISEARCH_FLUSH_ON_QUERY = True

    def tearDown(self):
        """Removes test objects from the repository"""
        for pid in self.fedora_fixtures_ingested:
            try:
                self.repo.purge_object(pid)
            except RequestFailed as rf:
                logger.warn('Error purging test object %s in tear down:%s'
                            % (pid, rf))
def _load_postcard(self, label, description, subjects, filename):
    '''Create a postcard object and load to fedora.

    :param label: object label and dc:title
    :param description: object dc:description
    :param subjects: list of subjects to be set in dc:subject
    :param filename: filename for image content, assumed relative to
        current directory
    '''
    # NOTE: image object init here somewhat redundant with current
    # postcard ingest logic
    repo = Repository()
    obj = repo.get_object(type=ImageObject)
    obj.label = label
    obj.owner = settings.FEDORA_OBJECT_OWNERID
    obj.dc.content.title = obj.label
    obj.dc.content.description_list.extend(description)
    obj.dc.content.subject_list.extend(subjects)
    # common DC for all postcards
    obj.dc.content.type = 'image'
    # FIXME: configure this somewhere?
    obj.dc.content.relation_list.extend([settings.RELATION,
                                         'http://beck.library.emory.edu/greatwar/'])
    # add relation to postcard collection
    obj.rels_ext.content.add((
        URIRef(obj.uri),
        URIRef(MEMBER_OF_COLLECTION),
        URIRef(PostcardCollection.get().uri)
    ))
    # BUGFIX: use a context manager so the image file handle is closed
    # after save, instead of leaking one open file per postcard loaded
    with open(path.join(fixture_path, filename)) as image_file:
        # set file as content of image datastream
        obj.image.content = image_file
        obj.save()
    self.postcards.append(obj)
def browse(request):
    "Browse postcards and display thumbnail images."
    repo = Repository()
    repo.default_object_type = ImageObject
    per_page = 15
    context = {}
    search_opts = postcard_search_opts().copy()
    if 'subject' in request.GET:
        context['subject'] = request.GET['subject']
        search_opts['subject'] = request.GET['subject']
    postcards = repo.find_objects(**search_opts)
    paginator = Paginator(list(postcards), per_page)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    # an out-of-range page request (e.g. 9999) gets the last page of results
    try:
        postcard_page = paginator.page(page)
    except (EmptyPage, InvalidPage):
        postcard_page = paginator.page(paginator.num_pages)
    context['postcards_paginated'] = postcard_page
    return render(request, 'postcards/browse.html', context)
def download_file(pid, dsid):
    """Stream a fedora datastream into a local tempfile with a progress bar."""
    repo = Repository(testsettings.FEDORA_ROOT_NONSSL,
                      testsettings.FEDORA_USER,
                      testsettings.FEDORA_PASSWORD)
    obj = repo.get_object(pid)
    ds = obj.getDatastreamObject(dsid)
    bar_widgets = [
        'Download: ', progressbar.widgets.Percentage(), ' ',
        progressbar.widgets.Bar(), ' ', progressbar.widgets.ETA(), ' ',
        progressbar.widgets.FileTransferSpeed()
    ]
    # initial size comes from the datastream; the transfer will be slightly
    # larger because of multipart boundary content
    progress = progressbar.ProgressBar(widgets=bar_widgets,
                                       max_value=ds.size).start()
    # download content to a tempfile
    tmpfile = tempfile.NamedTemporaryFile(prefix='%s-%s_' % (pid, dsid),
                                          delete=False)
    print('writing to ', tmpfile.name)
    total_bytes = 0
    try:
        for chunk in ds.get_chunked_content():
            total_bytes += len(chunk)
            progress.update(total_bytes)
            tmpfile.write(chunk)
    except Exception:
        raise
def main(argv):
    """Report Philologic objects that reference missing ARTFL figures.

    Writes phil_doc.csv (pid, missing-image count, image ids) for every
    object whose OBJ datastream mentions ARTFL-figure-missing, plus
    phil_image.csv with per-image occurrence counts.
    """
    repo = Repository(root='%s/fedora/' % HOST, username='******' % fedoraUser, password='******' % fedoraPass)
    philologic_pids = repo.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel')
    phil_doc = open('phil_doc.csv', 'w')
    # all missing-image ids found, across every object
    image_ids = []
    # image id -> occurrence count
    d = defaultdict(int)
    for p in philologic_pids:
        philologic = p.getDatastreamObject('OBJ').content
        substring = 'ARTFL-figure-missing'
        if substring in philologic:
            print 'Processing %s' % p
            images = []
            image_count = 0
            # re-fetch the raw OBJ content over http with basic auth so
            # it can be parsed with BeautifulSoup
            url = '%s/fedora/objects/%s/datastreams/OBJ/content' % (HOST, p)
            passwordManager = urllib2.HTTPPasswordMgrWithDefaultRealm()
            fedoraAdmin = "%s/fedora" % HOST
            passwordManager.add_password(None, fedoraAdmin, fedoraUser, fedoraPass)
            handler = urllib2.HTTPBasicAuthHandler(passwordManager)
            fedoraOpener = urllib2.build_opener(handler)
            soup = BeautifulSoup(fedoraOpener.open(url), 'html.parser')
            # each missing figure is marked with a span carrying its sysid
            spans = soup.find_all('span', 'ARTFL-figure-missing')
            for span in spans:
                image = span['sysid']
                images.append(image)
                image_count += 1
            image_ids.extend(images)
            images_string = ';'.join(images)
            phil_doc.write('%s,%s,%s\n' % (p, image_count, images_string))
            print 'Successfully processed %s' % p
    # tally how many times each image id occurred overall
    for i in image_ids:
        d[i] += 1
    with open('phil_image.csv', 'w') as outfile:
        phil_image = csv.writer(outfile)
        for key, value in d.items():
            phil_image.writerow([key, value])
    phil_doc.close()
class DatastreamXml():
    """Utility for fetching, replacing, and parsing XML datastreams on a
    single fedora object."""

    def __init__(self, pid, repocls=None, server="Development"):
        """
        :param pid: pid of the fedora object to operate on
        :param repocls: optional existing Repository instance to reuse
        :param server: config name used to look up connection settings
            when no repository is supplied
        """
        self.repo = repocls
        # BUGFIX: the original tested the undefined name `repo`; fall
        # back to building a connection only when none was passed in
        # (repo.Get_Configs presumably comes from a project-level
        # `repo` module -- TODO confirm)
        if not self.repo:
            username, password, root = repo.Get_Configs(server)
            self.repo = Repository(root=root, username=username,
                                   password=password)
        self.pid = pid
        self.GetObject()

    def GetObject(self):
        """Fetch and cache the digital object for self.pid."""
        self.digital_object = self.repo.get_object(self.pid)

    def ReplaceDs(self, dsid, xml_path):
        """Replace datastream `dsid` with the xml file at `xml_path`."""
        self.dsid = dsid
        self.xml_path = xml_path
        xml_object = self._MakeXmlObject()
        digital_object = self.repo.get_object(self.pid)
        datastream = DatastreamObject(digital_object, self.dsid)
        datastream.content = xml_object
        # BUGFIX: the original referenced the undefined name
        # `new_datastream` and passed two arguments to str.join,
        # which takes a single iterable
        datastream.label = "_".join([self.pid.replace(":", ""), dsid])
        datastream.save()

    def _MakeXmlObject(self):
        # parse the replacement xml from disk into an xmlmap object
        return xmlmap.load_xmlobject_from_file(self.xml_path)

    def GetMarcDs(self):
        """Return the MARCXML datastream parsed as an lxml element tree."""
        self.marcxml_object = self.digital_object.getDatastreamObject(
            "MARCXML")
        self.marcxml_content = self.marcxml_object.content.serialize()
        self.marc_tree = etree.fromstring(self.marcxml_content)
        return self.marc_tree
def main(argv):
    """Write a csv of per-object datastream totals plus a running
    repository-wide size, ordered by object creation date."""
    csvfile = open("C:/Users/a1691506/Desktop/repo_size.csv", 'wb')
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    repo = Repository(root='%s/fedora/' % HOST, username='******' % fedoraUser, password='******' % fedoraPass)
    risearch = repo.risearch
    # every object in the repository with its creation date, oldest first
    query = 'select ?pid ?date where {?pid <fedora-model:hasModel> <info:fedora/fedora-system:FedoraObject-3.0> ; <fedora-model:createdDate> ?date . } ORDER BY ASC(?date)'
    pids = risearch.find_statements(query, language='sparql', type='tuples', flush=None)
    # running total across the whole repository
    repo_size = 0
    for dictionary in pids:
        p = dictionary['pid']
        pid = p.replace('info:fedora/', '')
        dateCreated = dictionary['date']
        # total size of all datastreams on this one object
        total_size = 0
        obj = repo.get_object(pid)
        datastreams = obj.ds_list
        for datastream in datastreams:
            ds = obj.getDatastreamObject(datastream)
            size = ds.size
            total_size += size
        repo_size += total_size
        print "Total size for %s: %s" % (pid, total_size)
        csvwriter.writerow([pid, dateCreated, total_size, repo_size])
def main(argv): repo = Repository(root='%s/fedora/' % HOST, username='******' % fedoraUser, password='******' % fedoraPass) risearch = repo.risearch query = 'select ?pid where {?pid <fedora-view:disseminates> ?ds . ?pid <fedora-model:hasModel> <info:fedora/islandora:pageCModel> . ?ds <fedora-view:disseminationType> <info:fedora/*/PDF>}' pids = risearch.find_statements(query, language='sparql', type='tuples', flush=None) #total = 0 for dictionary in pids: for key in dictionary: p = dictionary[key] pid = p.replace('info:fedora/', '') obj = repo.get_object(pid) pdf = obj.getDatastreamObject("PDF") #size = pdf.size #total += size obj.api.purgeDatastream(pid, "PDF") obj.save() print "Purged PDF for %s" % pid
def get_object(self, queryset=None):
    """Fetch the Volume named by the url pid kwarg; 404 when it is
    missing or not actually a volume."""
    # kwargs are set based on configured url pattern
    repo = Repository(request=self.request)
    volume = repo.get_object(self.kwargs['pid'], type=Volume)
    if not (volume.exists and volume.is_a_volume):
        raise Http404
    return volume
def setUp(self):
    """Ingest the fixture PDF as a TestPdfObject before each test."""
    self.repo = Repository(settings.FEDORA_ROOT, settings.FEDORA_USER,
                           settings.FEDORA_PASSWORD)
    with open(self.pdf_filepath) as pdf:
        pdf_obj = self.repo.get_object(type=TestPdfObject)
        pdf_obj.label = 'eulindexer test pdf object'
        pdf_obj.pdf.content = pdf
        pdf_obj.save()
        self.pdfobj = pdf_obj
def rdf_profile(request, username):
    '''Profile information comparable to the human-readable content
    returned by :meth:`profile`, but in RDF format.'''
    # retrieve user & publications - same logic as profile above
    user, userprofile = _get_profile_user(username)
    articles = userprofile.recent_articles(limit=10)
    # build an rdf graph with information about author & publications
    rdf = RdfGraph()
    for prefix, ns in ns_prefixes.iteritems():
        rdf.bind(prefix, ns)
    author_node = BNode()
    profile_uri = URIRef(request.build_absolute_uri(reverse('accounts:profile', kwargs={'username': username})))
    profile_data_uri = URIRef(request.build_absolute_uri(reverse('accounts:profile-data', kwargs={'username': username})))
    # author information
    rdf.add((profile_uri, FOAF.primaryTopic, author_node))
    rdf.add((author_node, RDF.type, FOAF.Person))
    rdf.add((author_node, FOAF.nick, Literal(user.username)))
    rdf.add((author_node, FOAF.publications, profile_uri))
    # ESD (directory) data may be unavailable; fall back to django user info
    try:
        esd_data = userprofile.esd_data()
    except EsdPerson.DoesNotExist:
        esd_data = None
    if esd_data:
        rdf.add((author_node, FOAF.name, Literal(esd_data.directory_name)))
    else:
        rdf.add((author_node, FOAF.name, Literal(user.get_full_name())))
    # only expose contact details when directory data is not suppressed
    if esd_data and not userprofile.suppress_esd_data:
        # email is published only as a sha1 hash, per FOAF convention
        mbox_sha1sum = hashlib.sha1(esd_data.email).hexdigest()
        rdf.add((author_node, FOAF.mbox_sha1sum, Literal(mbox_sha1sum)))
        if esd_data.phone:
            rdf.add((author_node, FOAF.phone, URIRef('tel:' + esd_data.phone)))
    # TODO: use ESD profile data where appropriate
    # (and honor internet/directory suppressed, suppression override)
    # article information
    repo = Repository(request=request)
    for record in articles:
        obj = repo.get_object(record['pid'], type=Publication)
        # blank node per publication: the info:fedora/ uri is not public
        obj_node = BNode()
        # relate to author
        rdf.add((author_node, FRBR.creatorOf, obj_node))
        rdf.add((author_node, FOAF.made, obj_node))
        # add object rdf
        rdf += obj.as_rdf(node=obj_node)
    response = HttpResponse(rdf.serialize(), content_type='application/rdf+xml')
    response['Content-Location'] = profile_data_uri
    return response
def file(request, pid):
    """Serve the object's file datastream as an attachment download."""
    dsid = FileObject.file.id
    repo = Repository()
    obj = repo.get_object(pid, type=FileObject)
    # use the dc title as the download filename
    download_name = os.path.basename(obj.dc.content.title)
    headers = {
        'Content-Disposition': "attachment; filename=%s" % download_name,
    }
    return raw_datastream(request, pid, dsid, type=FileObject,
                          headers=headers)
def all():
    """
    Returns all collections in the repository as
    :class:`~genrepo.collection.models.CollectionObject`
    """
    return Repository().get_objects_with_cmodel(
        CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)
def rdfxml(request, aggId):
    """Serve the aggregation's rdfxml datastream as an attachment download."""
    dsid = AggregationObject.rdfxml.id
    repo = Repository()
    obj = repo.get_object(aggId, type=AggregationObject)
    # download filename is taken from the dc title
    attachment_name = os.path.basename(obj.dc.content.title)
    extra_headers = {
        'Content-Disposition': "attachment; filename=%s" % attachment_name,
    }
    return raw_datastream(request, aggId, dsid, type=AggregationObject,
                          headers=extra_headers)
def get_object(self, queryset=None):
    """Return the Collection for the configured url pid kwarg, or 404."""
    # kwargs are set based on configured url pattern
    repo = Repository(request=self.request)
    collection = repo.get_object(self.kwargs['pid'], type=Collection)
    # if pid doesn't exist or isn't a collection, 404
    if not collection.exists or not collection.has_requisite_content_models:
        raise Http404
    return collection
def datastream_lastmodified(request, pid, dsid, type):
    """Creation/last-modified date for a datastream, or None on failure."""
    repo = Repository()
    try:
        ds = repo.get_object(pid, type=type).getDatastreamObject(dsid)
        if ds and ds.exists:
            return ds.created
    except RequestFailed:
        # fall through and return None when fedora can't be reached
        pass
def __init__(self, pid, repocls=None, server="Development"):
    """
    :param pid: pid of the fedora object to wrap
    :param repocls: optional existing Repository instance to reuse
    :param server: config name for connection settings when no
        repository is supplied
    """
    self.repo = repocls
    # BUGFIX: the original tested the undefined name `repo` here, so the
    # fallback could never be selected correctly; test the supplied value
    # (repo.Get_Configs presumably comes from a project-level `repo`
    # module -- TODO confirm)
    if not self.repo:
        username, password, root = repo.Get_Configs(server)
        self.repo = Repository(root=root, username=username,
                               password=password)
    self.pid = pid
    self.GetObject()
def all():
    """
    Returns all collections in the repository as
    :class:`~genrepo.collection.models.CollectionObject`
    """
    repository = Repository()
    collections = repository.get_objects_with_cmodel(
        CollectionObject.COLLECTION_CONTENT_MODEL,
        type=CollectionObject)
    return collections
def view_postcard_large(request, pid):
    '''View a large image of postcard with title only.'''
    repo = Repository()
    try:
        card = repo.get_object(pid, type=ImageObject)
        # access object label to trigger 404 before we get to the template
        card.label
        return render(request, 'postcards/view_postcard_large.html',
                      {'card': card})
    except RequestFailed:
        raise Http404
def get_object(self, queryset=None):
    """Return the Volume for the url pid kwarg; 404 unless it exists,
    is a volume, and has tei."""
    # kwargs are set based on configured url pattern
    repo = Repository(request=self.request)
    volume = repo.get_object(self.kwargs['pid'], type=Volume)
    # 404 if object doesn't exist, isn't a volume, or doesn't have tei
    if not (volume.exists and volume.is_a_volume and volume.has_tei):
        raise Http404
    # NOTE: not currently an error if volume doesn't have any
    # annotations, but export is probably not meaningful
    return volume
def members(self):
    '''Return all Fedora objects in the repository that are related
    to the current collection via isMemberOfCollection.'''
    # FIXME: loses repo permissions/credentials here...
    repo = Repository()
    member_pids = repo.risearch.get_subjects(relsext.isMemberOfCollection,
                                             self.uri)
    # for now, just returning as generic DigitalObject instances
    # TODO: should we restrict to accessible objects only?
    # (requires passing correct credentials through...)
    for member_pid in member_pids:
        yield repo.get_object(member_pid)
def view_collection(request, pid):
    """view an existing
    :class:`~genrepo.collection.models.CollectionObject` identified by pid.
    """
    repo = Repository(request=request)
    collection = repo.get_object(pid, type=CollectionObject)
    # 404 when the object does not exist or the current user lacks
    # permission to see that it exists
    if not collection.exists:
        raise Http404
    return render(request, "collection/view.html", {"obj": collection})
def test_init_retries(self):
    """Repository retry configuration: default, explicit, and None."""
    # default
    default_repo = Repository('http://fedo.ra', 'user', 'passwd')
    self.assertEqual(Repository.retries, default_repo.retries)
    # number specified
    five_repo = Repository('http://fedo.ra', 'user', 'passwd', retries=5)
    self.assertEqual(5, five_repo.retries)
    # No retries specified
    none_repo = Repository('http://fedo.ra', 'user', 'passwd', retries=None)
    self.assertEqual(None, none_repo.retries)
def view_collection(request, pid):
    '''view an existing
    :class:`~genrepo.collection.models.CollectionObject` identified by pid.
    '''
    repository = Repository(request=request)
    coll_obj = repository.get_object(pid, type=CollectionObject)
    # 404 if the object does not exist or the current user doesn't
    # have permission to see that it exists
    if not coll_obj.exists:
        raise Http404
    return render(request, 'collection/view.html', {'obj': coll_obj})
def setUp(self):
    """Create and ingest a SimpleDigitalObject to exercise the views."""
    # load test object to test views with
    repo = Repository()
    self.obj = repo.get_object(type=SimpleDigitalObject)
    self.obj.dc.content.title = 'test object for generic views'
    self.obj.text.content = 'sample plain-text content'
    img_file = os.path.join(settings.FEDORA_FIXTURES_DIR, 'test.png')
    # BUGFIX: open the png in binary mode -- reading image bytes in text
    # mode corrupts the content (and fails under python 3); matches the
    # sibling setUp that already uses mode='rb'
    self.obj.image.content = open(img_file, mode='rb')
    # force datastream checksums so we can test response headers
    for ds in [self.obj.dc, self.obj.rels_ext, self.obj.text, self.obj.image]:
        ds.checksum_type = 'MD5'
    self.obj.save()
def handle(self, *args, **kwargs): verbosity = kwargs.get('verbosity', self.v_normal) # pids specified on command-line take precedence pids = kwargs.get('pids', []) repo = Repository() # if no pids were specified, find all AFFs if not pids: objs = repo.get_objects_with_cmodel(DiskImage.DISKIMAGE_CONTENT_MODEL, type=DiskImage) for obj in objs: # objects found by risearch *should* exist, but # just in case of discrepancies (hopefully only in QA), # ignore non-existent objects if not obj.exists: self.stderr.write(self.style.WARNING('%s does not exist' % obj.pid)) continue # check premis for to find Disk Images in AFF format; # exclude any that have already been migrated if obj.provenance.exists: premis = obj.provenance.content if premis.object and premis.object.format \ and premis.object.format.name == 'AFF' \ and not obj.migrated: pids.append(obj.pid) # create a celery result set and queue conversion of each pid requested # or found in fedora migration_tasks = celery.result.ResultSet([]) for pid in pids: migration_tasks.add(migrate_aff_diskimage.delay(pid)) # wait for tasks to complete while migration_tasks.waiting(): try: migration_tasks.join() except Exception: # exceptions from tasks gets propagated here, but ignore # them and report based on success/failure pass print '%d migrations completed, %s failures' % \ (migration_tasks.completed_count(), 'some' if migration_tasks.failed() else 'no') for result in migration_tasks.results: if result.state == celery.states.FAILURE: print 'Error: %s' % result.result else: print 'Success: %s' % result.result
def handle(self, *args, **options):
    """Harvest records into OpenEmory: process the ids given as
    arguments (or search for articles when none are given) and print a
    summary of totals, skips, errors, and creations when done.
    """
    self.oe_user = User.objects.get(username='******')
    self.options = options
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # create language code list by name
    l = language_codes()
    self.lang_codes = dict((v, k) for k, v in l.items())

    # counters
    self.counts = defaultdict(int)

    # check required options
    if not self.options['username']:
        raise CommandError('Username is required')
    else:
        # prompt interactively when no password was supplied
        if not self.options['password'] or self.options['password'] == '':
            self.options['password'] = getpass("Password for %s:" % self.options['username'])

    # connection to repository
    self.repo = Repository(username=self.options['username'],
                           password=self.options['password'])

    try:
        # if ids specified, use that list
        if len(args) != 0:
            ids = list(args)
            # TODO symplectic query here
            for id in ids:
                self.counts['total'] += 1
                self.output(1, "Processing %s" % id)
                self.symplectic_to_oe_by_id(id)
        else:
            # search for Articles
            # TODO symplectic query here
            articles = []
    except Exception as e:
        # print_exc() writes the traceback itself and returns None, so
        # wrapping it in print would emit a spurious "None" line
        traceback.print_exc()
        raise CommandError('Error getting ids (%s)' % e.message)

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Created: %s\n" % self.counts['created'])
def setUp(self):
    """Build a fixture object with title, text, and image content for
    exercising the generic views."""
    repository = Repository()
    self.obj = repository.get_object(type=SimpleDigitalObject)
    self.obj.dc.content.title = 'test object for generic views'
    self.obj.text.content = 'sample plain-text content'
    image_path = os.path.join(settings.FEDORA_FIXTURES_DIR, 'test.png')
    self.obj.image.content = open(image_path, mode='rb')
    # checksums are forced so the response-header tests have values to
    # compare against
    datastreams = (self.obj.dc, self.obj.rels_ext,
                   self.obj.text, self.obj.image)
    for datastream in datastreams:
        datastream.checksum_type = 'MD5'
    self.obj.save()
def test_index_data(self):
    """Exercise the index_data view: IP restriction, JSON response
    content, basic-auth credential pass-through, and 404 for a missing
    pid."""
    # fixture object to request index data for
    fedora = Repository()
    obj = fedora.get_object(type=SimpleObject)
    obj.label = 'test object'
    obj.owner = 'tester'
    obj.save()
    self.pids.append(obj.pid)

    # a request IP outside the configured list must be forbidden
    with override_settings(EUL_INDEXER_ALLOWED_IPS=['0.13.23.134']):
        resp = index_data(self.request, obj.pid)
        self.assertEqual(
            403, resp.status_code,
            'Expected %s but returned %s for index_data view with request IP not in configured list'
            % (403, resp.status_code))

    # a request IP inside the configured list must succeed
    with override_settings(EUL_INDEXER_ALLOWED_IPS=[self.request_ip]):
        resp = index_data(self.request, obj.pid)
        self.assertEqual(
            200, resp.status_code,
            'Expected %s but returned %s for index_data view'
            % (200, resp.status_code))
        self.assertEqual(
            'application/json', resp['Content-Type'],
            'Expected %s but returned %s for mimetype on index_data view'
            % ('application/json', resp['Content-Type']))
        payload = json.loads(resp.content.decode('utf-8'))
        self.assertEqual(
            obj.index_data(), payload,
            'Response content loaded from JSON should be equal to object indexdata')

    # basic-auth credentials should be handed through to the repository
    testuser, testpass = '******', 'testpass'
    token = base64.b64encode(
        force_bytes('%s:%s' % (testuser, testpass)))
    self.request.META['HTTP_AUTHORIZATION'] = 'Basic %s' % force_text(token)
    with patch('eulfedora.indexdata.views.TypeInferringRepository') as typerepo:
        typerepo.return_value.get_object.return_value.index_data.return_value = {}
        index_data(self.request, obj.pid)
        typerepo.assert_called_with(username=testuser, password=testpass)

    # non-existent pid should generate a 404
    self.assertRaises(Http404, index_data, self.request, 'bogus:testpid')
def process_items(self):
    """Look up each requested item id in the DigWF API, verify that the
    item and the Fedora REST API are reachable, and create a bagit bag
    for each single match found."""
    digwf_api = Client(self.options.digwf_url)
    repo = Repository(self.options.fedora_url)
    for item_id in self.options.item_ids:
        try:
            result = digwf_api.get_items(item_id=item_id)
        except requests.exceptions.HTTPError as err:
            print('Domokun Connection Error! Unable to query DigWF REST API for %s: %s' % (
                item_id, err))
            continue

        try:
            # connectivity check only; the response is not used
            requests.head(self.options.fedora_url)
        except requests.ConnectionError:
            print('Fedora Connection Error! Unable to query Fedora REST API')
            continue

        if result.count == 1:
            item = result.items[0]
            print('Found item %s (pid %s, control key %s, marc %s)' %
                  (item_id, item.pid or '-', item.control_key, item.marc_path))
            try:
                repo.get_object(pid=item.pid)
            except requests.exceptions.HTTPError as err:
                print('Fedora Connection Error! Unable to query Fedora REST API for %s: %s' % (
                    item.pid, err))
                continue
        elif result.count == 0:
            print('No item found for this item id %s' % item_id)
            continue
        else:
            # shouldn't get more than one match when looking up by
            # item id, but just in case
            print('Error! DigWF returned %d matches for this item id %s' %
                  (result.count, item_id))
            continue

        # returns a bagit bag object.
        newbag = LsdiBaggee(item, repo).create_bag(self.options.output)
        # generate source organization summary for this bag
        # self.load_source_summary(newbag)
        print('Bag created at %s' % newbag)
def object_tags(request, pid):
    '''View and update private tags on a particular
    :class:`~eulfedora.models.DigitalObject` (stored in the database by
    way of :class:`~openemory.accounts.models.Bookmark`).

    On an HTTP GET, returns a JSON list of the tags for the specified
    object, or 404 if the object has not been tagged.

    On an HTTP PUT, replaces any existing tags with tags parsed from the
    body of the request via :meth:`taggit.utils.parse_tags` (the same
    logic :mod:`taggit` applies to keyword and phrase tags on forms),
    then returns the updated tag list as JSON.  A PUT for a pid that
    does not exist in Fedora returns a 404 error.
    '''
    # lookup/create arguments shared by the GET and PUT branches
    lookup_opts = {'user': request.user, 'pid': pid}
    status_code = 200  # default; becomes 201 when a new bookmark is created

    if request.method == 'PUT':
        # refuse to tag objects that do not exist in the repository.
        # NOTE: this will 404 if a bookmark is created and an object
        # subsequently is removed or otherwise becomes unavailable
        repository = Repository(request=request)
        fedora_obj = repository.get_object(pid)
        # if this fedora API call becomes expensive, may want to
        # consider querying Solr instead
        if not fedora_obj.exists:
            raise Http404
        bookmark, created = Bookmark.objects.get_or_create(**lookup_opts)
        if created:
            # should we return 201 when creating a new bookmark ?
            status_code = 201
        bookmark.tags.set(*parse_tags(request.read()))
        # fall through to the shared response handling below

    if request.method == 'GET':
        bookmark = get_object_or_404(Bookmark, **lookup_opts)

    # GET or successful PUT: respond with the current tag list
    tags = [tag.name for tag in bookmark.tags.all()]
    return HttpResponse(json_serializer.encode(tags),
                        status=status_code, mimetype='application/json')
def curl_download_file(pid, dsid):
    """Download a fedora datastream via pycurl with a progress bar,
    writing the content to a named temp file (which is left on disk,
    since the point is to inspect the downloaded content)."""
    repo = Repository(testsettings.FEDORA_ROOT_NONSSL, testsettings.FEDORA_USER,
                      testsettings.FEDORA_PASSWORD)
    obj = repo.get_object(pid)
    ds = obj.getDatastreamObject(dsid)

    tmpfile = tempfile.NamedTemporaryFile(prefix='%s-%s_' % (pid, dsid),
                                          delete=False)
    print('writing to ', tmpfile.name)
    widgets = [
        'Download: ', progressbar.widgets.Percentage(), ' ',
        progressbar.widgets.Bar(), ' ', progressbar.widgets.ETA(), ' ',
        progressbar.widgets.FileTransferSpeed()
    ]
    # set initial progressbar size based on file; will be slightly larger because
    # of multipart boundary content
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=ds.size).start()

    def progress(dl_total, dl, up_total, up):
        # update current status
        pbar.update(dl)

    c = pycurl.Curl()
    auth = base64.b64encode(
        force_bytes("%s:%s" % (testsettings.FEDORA_USER,
                               testsettings.FEDORA_PASSWORD)))
    headers = {'Authorization': 'Basic %s' % force_text(auth)}
    c.setopt(pycurl.VERBOSE, 1)
    c.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])
    # /objects/{pid}/datastreams/{dsID}/content ? [asOfDateTime] [download]
    c.setopt(c.URL, '%sobjects/%s/datastreams/%s/content' %
             (testsettings.FEDORA_ROOT_NONSSL, pid, dsid))
    c.setopt(c.WRITEFUNCTION, tmpfile.write)
    c.setopt(c.XFERINFOFUNCTION, progress)
    c.setopt(c.NOPROGRESS, False)
    try:
        c.perform()
        # HTTP response code, e.g. 200.
        print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print('Time: %f' % c.getinfo(c.TOTAL_TIME))
    finally:
        # release the curl handle and flush/close the temp file even if
        # the transfer raises; the file itself is kept (delete=False)
        c.close()
        tmpfile.close()
def cascade_updated_articles(self):
    '''Reindex all articles associated with faculty who have been
    updated (either article-indexed person data has changed or a
    previously-indexed faculty member is no longer in ESD).
    '''
    # collect unique article pids across all updated faculty so each
    # article is only reindexed once
    updated_articles = set()
    for username in self.updated_faculty:
        for article in self.articles_by_faculty(username):
            updated_articles.add(article['pid'])

    repo = Repository()
    for pid in updated_articles:
        if self.verbosity >= self.v_all:
            # parenthesized print with %-formatting produces the same
            # output as the old print-statement form and is valid in
            # both python 2 and 3
            print('Indexing article %s' % pid)
        article = repo.get_object(pid, type=Article)
        self.solr.add(article.index_data())