def main(argv):
    """Report the missing-figure images referenced by Philologic objects.

    Every Fedora object with the ``niu-objects:cmodel`` content model whose
    OBJ datastream contains ``ARTFL-figure-missing`` gets one row in
    ``phil_doc.csv``: pid, number of missing-figure spans, and the
    ';'-joined ``sysid`` values of those spans.  A tally of how often each
    image id was seen across all objects is then written to
    ``phil_image.csv``.

    :param argv: command-line arguments (not used by this function)
    """
    # Pass the credentials straight through; formatting them into a string
    # without a conversion specifier raises TypeError.
    repo = Repository(root='%s/fedora/' % HOST, username=fedoraUser, password=fedoraPass)

    philologic_pids = repo.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel')

    # The authenticated opener does not depend on the object being
    # processed, so build it once instead of once per object.
    password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_manager.add_password(None, "%s/fedora" % HOST, fedoraUser, fedoraPass)
    fedora_opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(password_manager))

    marker = 'ARTFL-figure-missing'
    image_counts = defaultdict(int)

    # The context manager closes the report even if an object fails midway.
    with open('phil_doc.csv', 'w') as phil_doc:
        for p in philologic_pids:
            # cheap substring test first; only matching objects are parsed
            if marker not in p.getDatastreamObject('OBJ').content:
                continue

            print('Processing %s' % p)

            url = '%s/fedora/objects/%s/datastreams/OBJ/content' % (HOST, p)
            soup = BeautifulSoup(fedora_opener.open(url), 'html.parser')

            images = [span['sysid'] for span in soup.find_all('span', marker)]
            for image in images:
                image_counts[image] += 1

            phil_doc.write('%s,%s,%s\n' % (p, len(images), ';'.join(images)))

            print('Successfully processed %s' % p)

    with open('phil_image.csv', 'w') as outfile:
        phil_image = csv.writer(outfile)
        for key, value in image_counts.items():
            phil_image.writerow([key, value])
def main(argv):
    """Populate the FULL_TEXT datastream of every Philologic object.

    For each object with the ``niu-objects:cmodel`` content model, the
    content of its OBJ datastream is passed through ``strip_tags`` and the
    result is saved as the object's FULL_TEXT datastream.

    :param argv: command-line arguments (not used by this function)
    """
    # Connect to Fedora
    fedora = Repository(root='http://localhost:8080/fedora/', username='******', password='******')

    # Retrieve the objects by content model and handle each one in turn
    for obj in fedora.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel'):
        print('Processing %s' % obj)

        # Extract the text from the OBJ markup
        plain_text = strip_tags(obj.getDatastreamObject('OBJ').content)

        # Configure the FULL_TEXT datastream, then store the text in it
        datastream = obj.getDatastreamObject('FULL_TEXT')
        for attribute, setting in (
                ('label', 'Full text'),
                ('mimetype', 'text/plain'),
                ('versionable', True),
                ('state', 'A'),
                ('checksum_type', 'MD5'),
        ):
            setattr(datastream, attribute, setting)
        datastream.content = plain_text
        datastream.save()
class Command(BaseCommand):
    ''' This command runs through all the articles, assigns each one to the
    oe-collection and adds its ARK URI to the Dublin Core identifiers.
    '''
    args = "[netid netid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Process the articles without saving any changes'),
        )

    def handle(self, *args, **options):
        '''Process every article found by the article content model.

        Articles that do not exist are skipped; errors on a single article
        are reported and processing continues with the next one.  With
        ``--noact`` the changes are made in memory only and never saved.
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #connection to repository
        self.repo = Repository(settings.FEDORA_ROOT, username=settings.FEDORA_MANAGEMENT_USER, password=settings.FEDORA_PASSWORD)
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)
        coll = self.repo.get_object(pid=settings.PID_ALIASES['oe-collection'])
        try:
            articles = Paginator(pid_set, 100)
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))
            # nothing to iterate over; carrying on would only hit a
            # NameError on the unbound paginator
            return

        #process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue

                    print(coll)
                    print(article.pid)
                    article.collection = coll
                    # the part of the pid after the namespace prefix is
                    # used as the ARK name
                    ark_uri = '%sark:/25593/%s' % (settings.PIDMAN_HOST, article.pid.split(':')[1])
                    article.dc.content.identifier_list.extend([ark_uri])
                    # honour --noact: the option was declared but ignored
                    if not options.get('noact'):
                        article.save()

                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
Example #4
0
 def all():
     """
     Look up every collection in the repository by its content model; the
     results are initialized as
     :class:`~genrepo.collection.models.CollectionObject`
     """
     return Repository().get_objects_with_cmodel(
         CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)
Example #5
0
 def all():
     """
     Query the repository for all objects with the collection content
     model, initialized as
     :class:`~genrepo.collection.models.CollectionObject`
     """
     repository = Repository()
     collection_cmodel = CollectionObject.COLLECTION_CONTENT_MODEL
     return repository.get_objects_with_cmodel(collection_cmodel,
                                               type=CollectionObject)
Example #6
0
    def handle(self, *args, **kwargs):
        """Queue AFF disk images for migration and report the outcome.

        Pids passed in the ``pids`` option take precedence.  When none are
        given, every DiskImage object whose premis object format name is
        'AFF' and that is not already migrated is selected.  One
        ``migrate_aff_diskimage`` task is queued per pid; once no task is
        waiting any more, a summary and one line per task are printed.
        """
        # pids specified on command-line take precedence.  Copy into a new
        # list: the option may be None or a tuple, and the discovery loop
        # below appends to it (which must not modify the caller's object).
        pids = list(kwargs.get('pids') or [])
        repo = Repository()
        # if no pids were specified, find all AFFs
        if not pids:
            objs = repo.get_objects_with_cmodel(DiskImage.DISKIMAGE_CONTENT_MODEL,
                type=DiskImage)
            for obj in objs:
                # objects found by risearch *should* exist, but
                # just in case of discrepancies (hopefully only in QA),
                # ignore non-existent objects
                if not obj.exists:
                    self.stderr.write(self.style.WARNING('%s does not exist' % obj.pid))
                    continue

                # check premis to find Disk Images in AFF format;
                # exclude any that have already been migrated
                if not obj.provenance.exists:
                    continue
                premis = obj.provenance.content
                if premis.object and premis.object.format \
                                 and premis.object.format.name == 'AFF' \
                                 and not obj.migrated:
                    pids.append(obj.pid)

        # create a celery result set and queue conversion of each pid requested
        # or found in fedora
        migration_tasks = celery.result.ResultSet([])
        for pid in pids:
            migration_tasks.add(migrate_aff_diskimage.delay(pid))

        # wait for tasks to complete
        while migration_tasks.waiting():
            try:
                migration_tasks.join()
            except Exception:
                # exceptions from tasks get propagated here, but ignore
                # them and report based on success/failure
                pass

        print('%d migrations completed, %s failures' %
              (migration_tasks.completed_count(),
               'some' if migration_tasks.failed() else 'no'))

        for result in migration_tasks.results:
            if result.state == celery.states.FAILURE:
                print('Error: %s' % result.result)
            else:
                print('Success: %s' % result.result)
Example #7
0
    def handle(self, *args, **kwargs):
        """Print pid and content label of every AD1 disk image.

        The image format is read from the premis object format name of each
        DiskImage object.  Objects without one are reported on stderr when
        verbosity is at least normal.
        """
        verbosity = kwargs.get('verbosity', self.v_normal)
        disk_images = Repository().get_objects_with_cmodel(
            DiskImage.DISKIMAGE_CONTENT_MODEL, type=DiskImage)
        for disk_image in disk_images:
            # the premis object format is what distinguishes AD1 disk images
            format_name = None
            if disk_image.provenance.exists:
                premis = disk_image.provenance.content
                if premis.object and premis.object.format:
                    format_name = premis.object.format.name

            if format_name == 'AD1':
                print('%s %s' % (disk_image.pid, disk_image.content.label))
            elif format_name is None and verbosity >= self.v_normal:
                self.stderr.write('Warning: %s has no premis object format' % disk_image.pid)
Example #8
0
    def handle(self, *args, **options):
        """Add articles to the oe-collection and prepare them for OAI.

        Works on the pids given as positional arguments, or on every object
        with the article content model when none are given.  Each existing
        article is assigned to the collection, gets an OAI item id if it is
        published, and has ``_prep_dc_for_oai`` applied; it is saved unless
        the ``noact`` option is set.  A summary of the counters is written
        at the end.

        :raises CommandError: if no username is given or the pid list
            cannot be retrieved
        """
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #counters
        counts = defaultdict(int)

        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        #connection to repository
        repo = Repository(username=options['username'],
                          password=options['password'])

        coll = repo.get_object(pid=settings.PID_ALIASES['oe-collection'])

        #if pids specified, use that list
        try:
            if args:
                pid_set = [repo.get_object(pid=p, type=Article) for p in args]
            else:
                #search for Articles.
                pid_set = repo.get_objects_with_cmodel(
                    Article.ARTICLE_CONTENT_MODEL, Article)
        except Exception as e:
            raise CommandError('Error getting pid list (%s)' % e.message)

        try:
            articles = Paginator(pid_set, 20)
            counts['total'] = articles.count
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))
            counts['errors'] += 1
            # without a usable paginator there is nothing to process; skip
            # the loop (instead of failing on an unbound name) and still
            # write the summary
            articles = None

        #process all Articles
        for p in (articles.page_range if articles is not None else []):
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(
                            1, "Skipping %s because pid does not exist" %
                            article.pid)
                        counts['skipped'] += 1
                        continue

                    self.output(0, "Processing %s" % article.pid)

                    # Add to collection
                    article.collection = coll
                    self.output(
                        1, "Adding %s to collection %s" %
                        (article.pid, coll.pid))
                    counts['collection'] += 1

                    # Add itemID for OAI
                    if article.is_published:
                        article.oai_itemID = "oai:ark:/25593/%s" % article.noid
                        self.output(1, "Adding itemID to %s" % article.pid)
                        counts['itemId'] += 1

                    # Modify DC NS
                    article._prep_dc_for_oai()
                    self.output(
                        1, "Modified DC namespaces for %s" % (article.pid))
                    counts['DC'] += 1

                    # save article
                    if not options['noact']:
                        article.save()
                except Exception as e:
                    self.output(
                        0, "Error processing pid: %s : %s " %
                        (article.pid, e.message))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Added to collection: %s\n" % counts['collection'])
        self.stdout.write("Added itemID: %s\n" % counts['itemId'])
        self.stdout.write("Modified DC NS: %s\n" % counts['DC'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])
    def handle(self, *args, **options):
        """Set the content mimetype of Video objects to ``video/quicktime``.

        The pids are read one per line from the file named by the ``file``
        option; if the file lists no pids, every object with the video
        content model is processed instead.  Objects are saved unless the
        ``noact`` option is set.  A summary of the counters is written at
        the end.

        :raises CommandError: if username or file is missing, or the pids
            cannot be retrieved
        """
        #counters
        counts = defaultdict(int)

        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        if not options['file']:
            raise CommandError('File is required')

        with open(options['file'], 'r') as myfile:
            # ignore blank lines and surrounding whitespace so they are not
            # mistaken for pids
            pids = [line.strip() for line in myfile.read().splitlines()
                    if line.strip()]

        #connection to repository
        repo = Repository(username=options['username'], password=options['password'])

        try:
            #if pids specified, use that list
            if pids:
                pid_set = [repo.get_object(pid=p, type=Video) for p in pids]
            else:
                #search for Videos
                pid_set = repo.get_objects_with_cmodel(Video.VIDEO_CONTENT_MODEL, Video)
        except Exception as e:
            raise CommandError('Error gettings pids (%s)' % e.message)

        try:
            objects = Paginator(pid_set, 20)
            counts['total'] = objects.count
        except Exception as e:
            self.output("Error paginating items: : %s " % (e.message))
            counts['errors'] += 1
            # without a usable paginator there is nothing to process; skip
            # the loop (instead of failing on an unbound name) and still
            # write the summary
            objects = None

        #process all Objects
        for p in (objects.page_range if objects is not None else []):
            try:
                objs = objects.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output("Error getting page: %s : %s " % (p, e.message))
                counts['errors'] += 1
                continue
            for a in objs:
                try:
                    if not a.exists:
                        self.output("Skipping %s because pid does not exist" % a.pid)
                        counts['skipped'] += 1
                        continue

                    self.output("Processing %s" % a.pid)
                    a.content.mimetype = 'video/quicktime'

                    # save object
                    if not options['noact']:
                        a.save("cleanup mimetype")
                        self.output("SAVED %s" % a.pid)
                        counts['saved'] += 1
                    counts['processed'] += 1
                except Exception as e:
                    self.output("Error processing pid: %s : %s " % (a.pid, e.message))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Total number processed: %s\n" % counts['processed'])
        self.stdout.write("Total number saved: %s\n" % counts['saved'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])
Example #10
0
    def handle(self, *args, **options):
        """Add articles to the oe-collection and prepare them for OAI.

        Positional arguments are treated as article pids; without any, all
        objects with the article content model are selected.  For every
        existing article the collection is set, an OAI item id is added when
        the article is published, and ``_prep_dc_for_oai`` is called.  The
        article is saved unless the ``noact`` option is set.  The counters
        are written out as a summary when processing ends.

        :raises CommandError: if no username is given or the pid list
            cannot be retrieved
        """
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #counters
        counts = defaultdict(int)

        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        #connection to repository
        repo = Repository(username=options['username'], password=options['password'])

        coll = repo.get_object(pid=settings.PID_ALIASES['oe-collection'])

        #if pids specified, use that list
        try:
            if args:
                pid_set = [repo.get_object(pid=p, type=Article) for p in args]
            else:
                #search for Articles.
                pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL, Article)
        except Exception as e:
            raise CommandError('Error getting pid list (%s)' % e.message)

        try:
            articles = Paginator(pid_set, 20)
            counts['total'] = articles.count
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))
            counts['errors'] += 1
            # pagination failed, so there are no pages to walk; fall through
            # to the summary rather than crashing on an unbound name
            articles = None

        #process all Articles
        page_numbers = articles.page_range if articles is not None else []
        for p in page_numbers:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(1, "Skipping %s because pid does not exist" % article.pid)
                        counts['skipped'] += 1
                        continue

                    self.output(0, "Processing %s" % article.pid)

                    # Add to collection
                    article.collection = coll
                    self.output(1, "Adding %s to collection %s" % (article.pid, coll.pid))
                    counts['collection'] += 1

                    # Add itemID for OAI
                    if article.is_published:
                        article.oai_itemID = "oai:ark:/25593/%s" % article.noid
                        self.output(1, "Adding itemID to %s" % article.pid)
                        counts['itemId'] += 1

                    # Modify DC NS
                    article._prep_dc_for_oai()
                    self.output(1, "Modified DC namespaces for %s" % (article.pid))
                    counts['DC'] += 1

                    # save article
                    if not options['noact']:
                        article.save()
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Added to collection: %s\n" % counts['collection'])
        self.stdout.write("Added itemID: %s\n" % counts['itemId'])
        self.stdout.write("Modified DC NS: %s\n" % counts['DC'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])
Example #11
0
    def handle(self, *pids, **options):
        """Write a CSV report of current and rewritten ARK targets.

        For every ScannedVolume object and each of its pages, the first
        target URI registered in pidman is looked up and a rewritten URI is
        computed by a fixed series of substring replacements.  Both are
        logged to a timestamped CSV file under ``ecds/``.  No target is
        updated in pidman; that step is still a TODO below.

        A pid whose pidman lookup fails is logged with the error text and
        skipped.
        """
        # testPid
        # settings.PIDMAN_HOST = 'https://testpid.library.emory.edu/'  # the web root where we'll ask for pids
        # settings.PIDMAN_USER = ''
        # settings.PIDMAN_PASSWORD = ''
        # settings.PIDMAN_DOMAIN = 'https://testpid.library.emory.edu/domains/18/'  # default domain (e.g. when minting pids)

        # prodPid
        # PIDMAN_HOST = 'https://pidqas.library.emory.edu/'

        # (old, new) replacements for book targets.  They are applied in
        # this order, and later entries operate on the output of earlier
        # ones, so the order must not be changed.
        book_rewrites = (
            ("emory%3A", "emory:"),
            ("readux%3A", "emory:"),
            ("readux:", "emory:"),
            ("webprd001.library.emory.edu/readux", "testreadux.ecds.emory.edu"),
            ("webprd001.library.emory.edu/", "testreadux.ecds.emory.edu/"),
            ("/readux/", "/"),
        )
        # replacements for page targets, applied (in order) after the
        # encoded pid placeholder has been filled in
        page_rewrites = (
            ("readux:", "emory:"),
            ("readux%3A", "emory:"),
            ("/readux/", "/"),
            ("webprd001.library.emory.", "testreadux.ecds.emory."),
        )

        # get a pidman client
        client = DjangoPidmanRestClient()

        # testFedora
        repo = Repository(settings.FEDORA_ROOT, username=settings.FEDORA_MANAGEMENT_USER, password=settings.FEDORA_MANAGEMENT_PASSWORD)

        # prodFedora
        #repo = Repository('https://fedora.library.emory.edu:8443/fedora/', username='******', password='******')

        vol_list = repo.get_objects_with_cmodel('info:fedora/emory-control:ScannedVolume-1.0')

        found_msg = "Found " + str(len(vol_list)) + " books."
        print(found_msg)

        # Get a file logger; the context manager closes it even when an
        # unexpected error aborts the run
        filename = "ecds/" + str(datetime.datetime.now().strftime("%I-%M-%S %B-%d-%Y")) + ".csv"
        with open(filename, 'w+') as f:
            # report all books
            f.write(found_msg + "\n")

            # report titles
            f.write("TYPE,PID,NOID,O_URI,N_URI,PAGE,POST_URI,\n")

            # go over all books
            for vol in vol_list:
                volDobj = repo.get_object(vol.pid.rstrip(), type=ScannedVolume)

                # get attributes
                pid = volDobj.pid
                noid = pid.split(":")[1]
                try:
                    pidmanObj = client.get_pid("ark", noid)
                except Exception as e:
                    f.write(str(pid) + "\n" + str(e))
                    continue  # continue to the next item
                oriTargetUri = pidmanObj["targets"][0]["target_uri"]

                # str.replace leaves the string alone when the substring is
                # absent, so no containment check is needed first
                newTargetUri = oriTargetUri
                for old, new in book_rewrites:
                    newTargetUri = newTargetUri.replace(old, new)
                newTargetUri = unicode(newTargetUri)

                total_pages = len(volDobj.pageDObjs)

                # log attributes
                f.write(", ".join(["BOOK", str(pid), str(noid), str(oriTargetUri),
                                   str(newTargetUri), str(total_pages)]) + ", \n")

                # report attributes
                print("BOOK - " + str(pid) + " - " + str(total_pages) + " pages")

                #TODO update target
                # if newTargetUri != oriTargetUri:
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri)
                #     updated_target_uri = response["target_uri"]
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri, qualifier="PDF")
                #     updated_pdf_target_uri = response["target_uri"]
                #     f.write(str(updated_target_uri) + ", ")
                #     f.write(str(updated_pdf_target_uri) + ", ")

                # update pages; the counter also advances for pages that are
                # skipped because their pidman lookup failed
                for page_count, p in enumerate(volDobj.get_pages(), 1):
                    # Get all relevant attributes
                    pid = p
                    noid = pid.split(":")[1]
                    try:
                        pidmanObj = client.get_pid("ark", noid)
                    except Exception as e:
                        f.write(str(pid) + "\n" + str(e))
                        continue  # continue to the next item
                    oriTargetUri = pidmanObj["targets"][0]["target_uri"]
                    newTargetUri = unicode(oriTargetUri)

                    # fill in the URL-encoded "readux:{%PID%}" placeholder
                    # with the page pid before the generic rewrites run
                    newTargetUri = newTargetUri.replace("readux%3A%7B%25PID%25%7D", pid)
                    for old, new in page_rewrites:
                        newTargetUri = newTargetUri.replace(old, new)
                    newTargetUri = unicode(newTargetUri)

                    # Log attributes
                    f.write(", ".join(["page", str(pid), str(noid), str(oriTargetUri),
                                       str(newTargetUri), str(page_count)]) + ", ")

                    try:
                        print(str(page_count) + "/" + str(total_pages) + " - " + str(noid) + " - page update")
                        #TODO update target
                        # if newTargetUri != oriTargetUri:
                        #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri)
                        #     updated_target_uri = response["target_uri"]
                        #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri, qualifier="PDF")
                        #     updated_pdf_target_uri = response["target_uri"]
                        #     f.write(str(noid) + " - page success" + ", ")
                        #     f.write(str(noid) + " - page pdf success" + ", ")
                    except Exception:
                        # narrowed from a bare except so KeyboardInterrupt
                        # and SystemExit are no longer swallowed
                        print(str(page_count) + "/" + str(total_pages) + " - " + str(noid) + " - page fail")
                        f.write(str(noid) + " - page fail" + ", ")

                    f.write("\n")

                f.write("\n")
Example #12
0
    def handle(self, *args, **options):
        """Generate division, author and/or lead reports over all articles.

        At least one of the ``div``, ``author`` and ``lead`` options must be
        set.  Every existing article is passed to the matching
        ``division`` / ``author`` / ``lead`` method, and the collected
        counts are then written to ``division_report.csv``,
        ``author_report.csv`` and ``lead_report.csv`` respectively.

        :raises CommandError: if none of the report options is selected or
            no username is given
        """
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # check required options
        if (not options['div']) and (not options['author']) and (not options['lead']):
            raise CommandError('At least one of the options div, author or lead is required')
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        #connection to repository
        repo = Repository(username=options['username'], password=options['password'])
        pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL, Article)

        try:
            articles = Paginator(pid_set, 100)
            self.counts['total'] = articles.count
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))
            self.counts['errors'] += 1
            # no usable paginator: skip processing instead of failing on an
            # unbound name; reports and summary are still written
            articles = None

        #process all Articles
        for p in (articles.page_range if articles is not None else []):
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                self.counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        self.counts['skipped'] += 1
                        continue

                    self.output(2, "Processing %s" % article.pid)
                    if options['div']:
                        self.division(article)
                    if options['author']:
                        self.author(article)
                    if options['lead']:
                        self.lead(article)
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                    self.counts['errors'] += 1

        # write files; each report is opened in a context manager so it is
        # flushed and closed as soon as it has been written
        if options['div']:
            with open("division_report.csv", 'w') as report:
                writer = csv.writer(report)
                writer.writerow(['Division', 'Count'])
                for key, count in self.div_counts.items():
                    writer.writerow([key, count])

        if options['author']:
            with open("author_report.csv", 'w') as report:
                writer = csv.writer(report)
                writer.writerow(['Author', 'Division', 'Department', 'Count'])
                for netid, count in self.author_counts.items():
                    try:
                        person = User.objects.get(username=netid).get_profile().esd_data()
                        writer.writerow([person.directory_name, person.division_name, person.department_shortname, count])
                    except (User.DoesNotExist, UserProfile.DoesNotExist, EsdPerson.DoesNotExist):
                        self.output(0, "At least one part (User, Profile, ESD) for netid  %s could not be found" % netid)

        if options['lead']:
            with open("lead_report.csv", 'w') as report:
                writer = csv.writer(report)
                writer.writerow(['Division', 'Count'])
                for key, count in self.lead_counts.items():
                    writer.writerow([key, count])

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
Example #13
0
    def handle(self, *args, **options):
        '''Check Articles against Symplectic Elements and process the new ones.

        Selected Articles that do not exist in Fedora, have no title or are
        not published are skipped.  Unless ``options['force']`` is set, each
        remaining Article is looked up in Symplectic, first by PMC id (when it
        has one) and then by title; when a match is found the Article is
        skipped and, if ``options['rel']`` is set, its relations are processed
        against the matching Symplectic record.  Everything else is handed to
        ``process_article`` and ``process_relations``.  A summary of the
        counters is written to stdout at the end.

        :param args: optional pids; when none are given every object with the
            Article content model is selected
        :param options: command options; ``verbosity``, ``force`` and ``rel``
            are read here and the whole dict is passed on to the
            ``process_*`` helpers
        :raises CommandError: if the pid list cannot be retrieved or the
            selected objects cannot be paginated
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #counters
        self.counts = defaultdict(int)

        #connection to repository
        repo = Repository(username=settings.FEDORA_MANAGEMENT_USER, password=settings.FEDORA_MANAGEMENT_PASSWORD)

        #Symplectic-Elements setup
        self.session = requests.Session()
        self.session.auth = (settings.SYMPLECTIC_USER, settings.SYMPLECTIC_PASSWORD)
        # NOTE(review): certificate verification is switched off for every
        # request made through this session -- confirm this is intended
        self.session.verify = False
        self.session.stream = True
        self.session.headers.update({'Content-Type': 'text/xml'})

        self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publications")
        self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publication/records/manual")
        self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "relationships")

        #if pids specified, use that list
        try:
            if args:
                pid_set = [repo.get_object(pid=p, type=Article) for p in args]
            else:
                #search for Articles.
                pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL, Article)
        except Exception as e:
            raise CommandError('Error getting pid list (%s)' % e.message)

        try:
            articles = Paginator(pid_set, 20)
            self.counts['total'] = articles.count
        except Exception as e:
            # without a usable paginator there is nothing to iterate over, so
            # stop with a clean command error instead of failing in the loop
            raise CommandError('Error paginating items (%s)' % e.message)

        #process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                self.counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(1, "Skipping %s because pid does not exist" % article.pid)
                        self.counts['skipped'] += 1
                        continue

                    title_info = article.descMetadata.content.title_info
                    title = title_info.title if (title_info and title_info.title) else None
                    if title is None or title == '':
                        self.output(1, "Skipping %s because OE Title does not exist" % (article.pid))
                        self.counts['skipped'] += 1
                        continue

                    if not article.is_published:
                        self.output(1, "Skipping %s because pid is not published" % article.pid)
                        self.counts['skipped'] += 1
                        continue

                    # try to detect article by PMC
                    if article.pmcid and not options['force']:
                        response = self.session.get(self.pub_query_url, params={'query': 'external-identifiers.pmc="PMC%s"' % article.pmcid, 'detail': 'full'})
                        entries = load_xmlobject_from_string(response.raw.read(), OESympImportArticle).entries
                        self.output(2, "Query for PMC Match: GET %s %s" % (response.url, response.status_code))
                        if response.status_code == 200:
                            if len(entries) >= 1:
                                self.output(1, "Skipping %s because PMC PMC%s already exists" % (article.pid, article.pmcid))
                                self.counts['skipped'] += 1

                                if options['rel']:
                                    symp_pub, relations = article.as_symp(source=entries[0].source, source_id=entries[0].source_id)
                                    self.process_relations(entries[0].source_id, relations, options)
                                    sleep(1)
                                continue
                            # no PMC hit: fall through to the title check
                        else:
                            # a failed request may carry no entry at all, so
                            # only quote the first title when there is one
                            detail = entries[0].title if len(entries) >= 1 else ''
                            self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, detail))
                            self.counts['skipped'] += 1
                            continue

                    # try to detect article by Title if it does not have PMC
                    if not options['force']:
                        response = self.session.get(self.pub_query_url, params={'query': 'title~"%s"' % title, 'detail': 'full'})
                        entries = load_xmlobject_from_string(response.raw.read(), OESympImportArticle).entries
                        self.output(2, "Query for Title Match: GET %s %s" % (response.url, response.status_code))
                        if response.status_code == 200:
                            # Account for multiple results: every candidate is
                            # compared and logged, and the first one that
                            # matches is remembered so relations are attached
                            # to that record rather than to whichever result
                            # happens to come first.
                            matched = None
                            for entry in entries:
                                # 90 is handed to percent_match -- presumably
                                # the minimum percentage that counts as a match
                                success, percent = percent_match(title, entry.title, 90)
                                self.output(1, "Percent Title Match '%s' '%s' %s " % (title, entry.title, percent))
                                if success and matched is None:
                                    matched = entry
                            if matched is not None:
                                self.output(1, "Skipping %s because Title \"%s\" already exists" % (article.pid, title))
                                self.counts['skipped'] += 1

                                # update relations if rel is set
                                if options['rel']:
                                    symp_pub, relations = article.as_symp(source=matched.source, source_id=matched.source_id)
                                    self.process_relations(matched.source_id, relations, options)
                                    sleep(1)
                                continue
                        else:
                            detail = entries[0].title if len(entries) >= 1 else ''
                            self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, detail))
                            self.counts['skipped'] += 1
                            continue

                    # Process article and relations
                    symp_pub, relations = article.as_symp()
                    self.process_article(article.pid, symp_pub, options)
                    self.process_relations(article.pid, relations, options)
                    sleep(1)

                except Exception as e:
                    # one bad article must not stop the run: report it, dump
                    # the traceback and carry on with the next one
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                    import traceback
                    traceback.print_exc()
                    self.counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
        self.stdout.write("Articles Processed: %s\n" % self.counts['articles_processed'])
        self.stdout.write("Relations Processed: %s\n" % self.counts['relations_processed'])
Example #14
0
    def handle(self, *args, **options):
        '''Check Articles against Symplectic Elements and process the new ones.

        Articles that do not exist in Fedora, lack a title or are not
        published are skipped.  Unless ``options['force']`` is set, the
        remaining Articles are searched for in Symplectic by PMC id (when
        present) and then by title; a hit means the Article is skipped, and
        with ``options['rel']`` set its relations are processed against the
        Symplectic record that matched.  Articles with no hit are handed to
        ``process_article`` and ``process_relations``.  The counters are
        summarized on stdout when the run finishes.

        :param args: optional pids; with no pids every object that has the
            Article content model is selected
        :param options: command options; ``verbosity``, ``force`` and ``rel``
            are read here and the dict is forwarded to the ``process_*``
            helpers
        :raises CommandError: if the pid list cannot be retrieved or the
            selected objects cannot be paginated
        '''
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #counters
        self.counts = defaultdict(int)

        #connection to repository
        repo = Repository(username=settings.FEDORA_MANAGEMENT_USER,
                          password=settings.FEDORA_MANAGEMENT_PASSWORD)

        #Symplectic-Elements setup
        self.session = requests.Session()
        self.session.auth = (settings.SYMPLECTIC_USER,
                             settings.SYMPLECTIC_PASSWORD)
        # NOTE(review): certificate verification is disabled for all requests
        # sent through this session -- confirm this is intended
        self.session.verify = False
        self.session.stream = True
        self.session.headers.update({'Content-Type': 'text/xml'})

        self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                        "publications")
        self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                         "publication/records/manual")
        self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                              "relationships")

        #if pids specified, use that list
        try:
            if args:
                pid_set = [repo.get_object(pid=p, type=Article) for p in args]
            else:
                #search for Articles.
                pid_set = repo.get_objects_with_cmodel(
                    Article.ARTICLE_CONTENT_MODEL, Article)
        except Exception as e:
            raise CommandError('Error getting pid list (%s)' % e.message)

        try:
            articles = Paginator(pid_set, 20)
            self.counts['total'] = articles.count
        except Exception as e:
            # the loop below cannot run without a working paginator, so abort
            # with a command error rather than failing on the first page
            raise CommandError('Error paginating items (%s)' % e.message)

        #process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                self.counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(
                            1, "Skipping %s because pid does not exist" %
                            article.pid)
                        self.counts['skipped'] += 1
                        continue

                    title_info = article.descMetadata.content.title_info
                    if title_info and title_info.title:
                        title = title_info.title
                    else:
                        title = None
                    if title is None or title == '':
                        self.output(
                            1, "Skipping %s because OE Title does not exist" %
                            (article.pid))
                        self.counts['skipped'] += 1
                        continue

                    if not article.is_published:
                        self.output(
                            1, "Skipping %s because pid is not published" %
                            article.pid)
                        self.counts['skipped'] += 1
                        continue

                    # try to detect article by PMC
                    if article.pmcid and not options['force']:
                        response = self.session.get(
                            self.pub_query_url,
                            params={
                                'query':
                                'external-identifiers.pmc="PMC%s"' %
                                article.pmcid,
                                'detail':
                                'full'
                            })
                        entries = load_xmlobject_from_string(
                            response.raw.read(), OESympImportArticle).entries
                        self.output(
                            2, "Query for PMC Match: GET %s %s" %
                            (response.url, response.status_code))
                        if response.status_code == 200:
                            if len(entries) >= 1:
                                self.output(
                                    1,
                                    "Skipping %s because PMC PMC%s already exists"
                                    % (article.pid, article.pmcid))
                                self.counts['skipped'] += 1

                                if options['rel']:
                                    symp_pub, relations = article.as_symp(
                                        source=entries[0].source,
                                        source_id=entries[0].source_id)
                                    self.process_relations(
                                        entries[0].source_id, relations,
                                        options)
                                    sleep(1)
                                continue
                            # no PMC hit: fall through to the title check
                        else:
                            # a failed request may come back without entries;
                            # only quote the first title when one exists
                            if len(entries) >= 1:
                                detail = entries[0].title
                            else:
                                detail = ''
                            self.output(
                                1,
                                "Skipping %s because trouble with request %s %s"
                                % (article.pid, response.status_code, detail))
                            self.counts['skipped'] += 1
                            continue

                    # try to detect article by Title if it does not have PMC
                    if not options['force']:
                        response = self.session.get(self.pub_query_url,
                                                    params={
                                                        'query':
                                                        'title~"%s"' % title,
                                                        'detail': 'full'
                                                    })
                        entries = load_xmlobject_from_string(
                            response.raw.read(), OESympImportArticle).entries
                        self.output(
                            2, "Query for Title Match: GET %s %s" %
                            (response.url, response.status_code))
                        if response.status_code == 200:
                            # Account for multiple results: log the score of
                            # every candidate and keep the first one that
                            # matches, so relations go to the record that
                            # matched and not simply to the first result.
                            matched = None
                            for entry in entries:
                                # 90 is handed to percent_match -- presumably
                                # the minimum percentage that counts as a match
                                success, percent = percent_match(
                                    title, entry.title, 90)
                                self.output(
                                    1, "Percent Title Match '%s' '%s' %s " %
                                    (title, entry.title, percent))
                                if success and matched is None:
                                    matched = entry
                            if matched is not None:
                                self.output(
                                    1,
                                    "Skipping %s because Title \"%s\" already exists"
                                    % (article.pid, title))
                                self.counts['skipped'] += 1

                                # update relations if rel is set
                                if options['rel']:
                                    symp_pub, relations = article.as_symp(
                                        source=matched.source,
                                        source_id=matched.source_id)
                                    self.process_relations(
                                        matched.source_id, relations, options)
                                    sleep(1)
                                continue
                        else:
                            if len(entries) >= 1:
                                detail = entries[0].title
                            else:
                                detail = ''
                            self.output(
                                1,
                                "Skipping %s because trouble with request %s %s"
                                % (article.pid, response.status_code, detail))
                            self.counts['skipped'] += 1
                            continue

                    # Process article and relations
                    symp_pub, relations = article.as_symp()
                    self.process_article(article.pid, symp_pub, options)
                    self.process_relations(article.pid, relations, options)
                    sleep(1)

                except Exception as e:
                    # a failing article is reported with its traceback and
                    # counted; the run continues with the next article
                    self.output(
                        0, "Error processing pid: %s : %s " %
                        (article.pid, e.message))
                    import traceback
                    traceback.print_exc()
                    self.counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
        self.stdout.write("Articles Processed: %s\n" %
                          self.counts['articles_processed'])
        self.stdout.write("Relations Processed: %s\n" %
                          self.counts['relations_processed'])
Example #15
0
    def handle(self, *args, **options):
        '''Rebuild the DC identifiers of Articles and clear their DC relations.

        For every selected Article that exists, the DC identifier list is
        replaced with the PMC id and PMC access url (only when the
        contentMetadata datastream exists), the ARK uri from the MODS (when
        set) and the pid; the DC relation list is emptied.  The object is
        saved unless ``options['noact']`` is set.  Counters are summarized on
        stdout at the end.

        :param args: optional pids; with no pids every object that has the
            Article content model is selected
        :param options: command options; reads ``verbosity``, ``username``,
            ``password`` (prompted for when empty) and ``noact``
        :raises CommandError: if no username is given, the pids cannot be
            retrieved or the selected objects cannot be paginated
        '''
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #counters
        counts = defaultdict(int)

        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            # missing or empty password: ask for it interactively
            options['password'] = getpass()

        #connection to repository
        repo = Repository(username=options['username'],
                          password=options['password'])

        try:
            #if pids specified, use that list
            if args:
                pid_set = [repo.get_object(pid=p, type=Article) for p in args]
            else:
                #search for Articles
                pid_set = repo.get_objects_with_cmodel(
                    Article.ARTICLE_CONTENT_MODEL, Article)
        except Exception as e:
            raise CommandError('Error gettings pids (%s)' % e.message)

        try:
            articles = Paginator(pid_set, 20)
            counts['total'] = articles.count
        except Exception as e:
            # the loop below needs a working paginator; abort with a command
            # error instead of failing on the first page lookup
            raise CommandError('Error paginating items (%s)' % e.message)

        #process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(
                            1, "Skipping %s because pid does not exist" %
                            article.pid)
                        counts['skipped'] += 1
                        continue

                    self.output(0, "Processing %s" % article.pid)

                    mods = article.descMetadata.content
                    if article.contentMetadata.exists:
                        nlm = article.contentMetadata.content
                    else:
                        nlm = None
                    identifiers = []

                    #PMC info
                    if nlm:
                        pmc = nlm.docid
                        identifiers.extend(
                            ['PMC%s' % pmc, pmc_access_url(pmc)])

                    if mods.ark_uri:
                        identifiers.append(mods.ark_uri)

                    identifiers.append(article.pid)

                    article.dc.content.identifier_list = identifiers

                    # REMOVE dc.relation
                    article.dc.content.relation_list = []

                    # save article
                    if not options['noact']:
                        article.save()
                        self.output(1, "SAVED")
                except Exception as e:
                    # report the failing article and continue with the next
                    self.output(
                        0, "Error processing pid: %s : %s " %
                        (article.pid, e.message))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])
def main(argv):
    """Replace missing-figure placeholders in philologic documents with links.

    For every object with the ``niu-objects:cmodel`` content model the OBJ
    datastream is parsed, existing ``ARTFL-figure`` links are rewritten from
    ``/fedora/repository/`` to ``/islandora/object/``, and each
    ``ARTFL-figure-missing`` span is replaced by a link to the object whose
    ``dc:title`` equals the span's ``sysid`` (without file extension).  The
    datastream is saved and a GSearch index update is requested.

    Two log files are written: ``phil_doc_dev.csv`` (per document: pid, number
    of images not found, their ``;``-joined titles) and ``phil_image_dev.csv``
    (per image title: how many times it could not be found).

    :param argv: command-line arguments (not used)
    """
    # Connect to repository
    repo = Repository(root='%s/fedora/' % HOST, username='%s' % fedoraUser, password='%s' % fedoraPass)
    # Get philologic pids using content model
    philologic_pids = repo.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel')

    old_prefix = '/fedora/repository/'

    # Logging: how many times each image title could not be located
    d = defaultdict(int)

    # the context manager closes the per-document log even when a document fails
    with open('phil_doc_dev.csv', 'w') as phil_doc:

        for pid in philologic_pids:

            # Logging: titles that could not be located for this document
            images = []

            # Get the OBJ's content as string
            philologic = pid.getDatastreamObject('OBJ').content
            # Take the opportunity to replace deprecated HTML entity reference
            philologic = re.sub("˙", ".", philologic)
            # Load OBJ content into soup. Must specify html5lib parser, b/c lxml causes fatal exception (memory leak)
            soup = BeautifulSoup(philologic, "html5lib")
            # Find all ARTFL spans and <a>'s
            spans = soup.find_all("span", "ARTFL-figure-missing")
            links = soup.find_all("a", "ARTFL-figure")

            # Replace /fedora/repository with /islandora/object in existing links
            for a in links:
                # anchors without an href are left untouched
                href = a.get('href')
                if href and href.startswith(old_prefix):
                    a['href'] = '/islandora/object/%s' % href[len(old_prefix):]

            for span in spans:

                # Retrieve the sysid and strip the file format.
                title = span['sysid'].split('.')[0]
                # Use sysid as title to send RI query for pid
                results = repo.risearch.sparql_query('select ?pid where {?pid <dc:title> "%s"}' % title)
                try:
                    # sparql_query returns CSV object; next will retrieve first row.
                    # If no results, throw exception and log that image
                    p = next(results)['pid'].replace('info:fedora/', '')
                    # Create <a> tag with @href pointing to object
                    new_tag = soup.new_tag("a", href="/islandora/object/%s/datastream/OBJ/view" % p)
                    # B/c it's a reserved word, we have to add @class separately
                    new_tag['class'] = "ARTFL-figure"
                    # Grab and add the <span> string
                    new_tag.string = span.string
                    # Replace <span> with <a>
                    span.replace_with(new_tag)
                    print("Successfully changed %s in %s" % (title, pid))
                except Exception:
                    # best effort: any failure for one image is logged and the
                    # span is left as it was; Ctrl-C still interrupts the run
                    print("Failed to locate %s in %s" % (title, pid))
                    images.append(title)
                    d[title] += 1

            # Retrieve entire OBJ datastream
            obj = pid.getDatastreamObject('OBJ')
            # Replace OBJ content with soup. Encoding as html to maintain entity references.
            obj.content = soup.encode(formatter="html")
            # Save and we're done.
            obj.save()

            # Because GSearch isn't listening, we have to index the update
            url = '%s/fedoragsearch/rest?operation=updateIndex&action=fromPid&value=%s' % (HOST, pid)
            gsearchOpener.open(url)

            # Rest is all logging not founds and errors
            phil_doc.write('%s,%s,%s\n' % (pid, len(images), ';'.join(images)))

    with open('phil_image_dev.csv', 'w') as outfile:

        phil_image = csv.writer(outfile)

        for key, value in d.items():
            phil_image.writerow([key, value])
Example #17
0
    def handle(self, *pids, **options):
        '''Point the ARK targets of Volume objects at their volume and PDF urls.

        Objects that do not exist, or whose ARK cannot be retrieved from the
        pid manager, are counted as skipped.  For every other object whose DC
        identifier is an ARK, the unqualified target is set to
        ``self.volume_url(obj)`` and the ``PDF`` qualified target to
        ``self.pdf_url(obj)``; with ``dry_run`` set nothing is sent to the pid
        manager but the object is still counted as updated.  A one-line
        summary is written when verbosity is at least normal.

        :param pids: optional pids; with no pids all objects that have the
            Volume content model are processed
        :param options: command options; reads ``dry_run`` and ``verbosity``
        :raises CommandError: if the pid manager client cannot be created
        '''
        dry_run = options.get('dry_run', False)
        verbosity = int(options.get('verbosity', self.v_normal))
        verbose = verbosity >= self.v_normal

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        if pids:
            # pids given on the command line: only process those objects
            objs = [repo.get_object(pid, type=Volume) for pid in pids]
        else:
            # otherwise, look for all volume objects in fedora
            objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                type=Volume)

        stats = defaultdict(int)
        for obj in objs:
            if not obj.exists:
                if verbose:
                    self.stdout.write(
                        '%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            identifier = obj.dc.content.identifier
            if not is_ark(identifier):
                # counted as an object, but there is no ARK to update
                continue

            noid = parse_ark(identifier)['noid']
            try:
                # only used to confirm the pid manager knows this ARK
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbose:
                    if '404: NOT FOUND' in str(err):
                        template = 'Error retriving ARK information for %s: Not Found'
                    else:
                        template = 'Error retriving ARK information for %s'
                    self.stdout.write(template % obj.pid)
                stats['skipped'] += 1
                continue

            if dry_run:
                # count as updated in dry run mode (would be updated)
                stats['updated'] += 1
                continue

            # update unqualified ark to resolve to readux volume landing page
            pidman.update_ark_target(noid,
                                     target_uri=self.volume_url(obj),
                                     active=True)
            stats['updated'] += 1
            # we expected a qualified ARK target for the PDF; update whether
            # it currently exists or not
            # FIXME: catch possible exceptions here?
            pidman.update_ark_target(noid,
                                     'PDF',
                                     target_uri=self.pdf_url(obj),
                                     active=True)

        # output summary
        if verbose:
            # first pass fills in the counters and leaves two %s slots
            # (plural suffix, dry-run wording) for the second pass
            summary = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            plural = 's' if stats['objs'] != 1 else ''
            mood = ' would have' if dry_run else ''
            self.stdout.write(summary % (plural, mood))
class Command(BaseCommand):
    r'''Fetches `~openemory.publication.models.Article` objects from Fedora and fixes the DC and MODS checksums:
     1. Replaces '\r' with '' in abstract field.
     2. Save object. Note: this will make a new version of the mods and copy some fields to the DC
     If PIDs are provided in the arguments, that list of pids will be used instead of searching Fedora.
    '''
    # NOTE: the docstring is a raw string because it doubles as the help
    # text below; a non-raw '\r' would put a real carriage return in it.
    args = "[pid pid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Reports the pid and total number of Articles that would be processed but does not really do anything.'),
        make_option('--username',
                    action='store',
                    help='Username of fedora user to connect as'),
        make_option('--password',
                    action='store',
                    help='Password for fedora user,  password=  will prompt for password'),
        make_option('--host',
                    action='store',
                    default=settings.FEDORA_ROOT,
                    help='Hostname of fedora instance.  Defaults to localsettings:FEDORA_ROOT.'),
        )

    def handle(self, *args, **options):
        '''Strip carriage returns from article abstracts and re-save.

        An article is fixed only when it has a non-empty abstract
        containing a carriage return AND its DC datastream fails checksum
        validation; every other article is counted as skipped.  With
        ``--noact`` nothing is saved, but articles that would be fixed are
        still counted as fixed.

        :param args: optional pids; when given, only those objects are
            considered (see :meth:`get_pids`)
        :param options: parsed command-line options
        :raises CommandError: if no username was supplied
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        counts = defaultdict(int)

        # check required options; prompt for a password if none was given
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        # connection to repository
        self.repo = Repository(options['host'],
                               username=options['username'],
                               password=options['password'])

        if args:
            # pids specified on the command line: use that list
            article_set = self.get_pids(args)
        else:
            # search for Articles in Fedora
            article_set = self.repo.get_objects_with_cmodel(
                Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        # process all Articles
        for a in article_set:
            try:
                self.output(1, "Processing %s" % a.pid)

                mods = a.descMetadata.content
                needs_fix = (mods.abstract is not None
                             and mods.abstract.text
                             and '\r' in mods.abstract.text
                             and not a.dc.validate_checksum())
                if not needs_fix:
                    self.output(1, "Skipping %s" % a.pid)
                    counts['skip'] += 1
                    continue

                mods.abstract.text = mods.abstract.text.replace('\r', '')
                # save article
                try:
                    if not options['noact']:
                        a.save("Removing backslash-r to fix checksums")
                except Exception as e:
                    # format the exception itself: e.message is deprecated
                    # and empty for exceptions built with several args
                    self.output(0, "Error processing pid: %s : %s " % (a.pid, e))
                    counts['errors'] += 1
                else:
                    # only count as fixed when the save did not fail
                    counts['fixed'] += 1
            except Exception as e:
                self.output(1, "Error on %s: %s" % (a.pid, e))
                counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Fixed: %s\n" % counts['fixed'])
        self.stdout.write("Skipped: %s\n" % counts['skip'])
        self.stdout.write("Errors: %s\n" % counts['errors'])

    def get_pids(self, pids):
        '''Yield a repository object for each pid that is an Article.

        Only the first content model reported by the object is compared
        against the article content model; objects reporting no models
        at all are silently left out.

        :param pids: iterable of pid strings
        '''
        for p in pids:
            obj = self.repo.get_object(pid=p, type=Publication)
            models = obj.get_models()
            # guard against an empty model list instead of raising IndexError
            if models and str(models[0]) == Publication.ARTICLE_CONTENT_MODEL:
                yield obj

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity

        :param v: minimum verbosity level at which ``msg`` is written
        :param msg: text to write to stdout (a newline is appended)
        '''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
Example #19
0
    def handle(self, *args, **options):
        '''Run the requested per-article reports and write them as CSV.

        For every existing article object the ``division``, ``author``
        and/or ``lead`` methods are invoked according to the options.
        Afterwards one CSV file per selected report is written in the
        current directory (``division_report.csv``, ``author_report.csv``,
        ``lead_report.csv``) and a short summary goes to stdout.

        :param options: parsed command-line options
        :raises CommandError: if none of div/author/lead is selected, if
            no username is supplied, or if the article set cannot be
            paginated
        '''
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # check required options
        if not (options['div'] or options['author'] or options['lead']):
            raise CommandError(
                'At least one of the options div, author or lead is required')
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        # connection to repository
        repo = Repository(username=options['username'],
                          password=options['password'])
        pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL,
                                               Article)

        try:
            articles = Paginator(pid_set, 100)
            self.counts['total'] = articles.count
        except Exception as e:
            # nothing can be processed without a paginator; abort cleanly
            # instead of failing later on an unusable `articles`
            raise CommandError("Error paginating items: %s" % e)

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e))
                self.counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(
                            0, "Skipping %s because pid does not exist" %
                            article.pid)
                        self.counts['skipped'] += 1
                        continue

                    self.output(2, "Processing %s" % article.pid)
                    if options['div']:
                        self.division(article)
                    if options['author']:
                        self.author(article)
                    if options['lead']:
                        self.lead(article)
                except Exception as e:
                    self.output(
                        0, "Error processing pid: %s : %s " %
                        (article.pid, e))
                    self.counts['errors'] += 1

        # write files; `with` guarantees each report is flushed and closed
        if options['div']:
            with open("division_report.csv", 'w') as report:
                writer = csv.writer(report)
                writer.writerow(['Division', 'Count'])
                for key, count in self.div_counts.items():
                    writer.writerow([key, count])

        if options['author']:
            with open("author_report.csv", 'w') as report:
                writer = csv.writer(report)
                writer.writerow(['Author', 'Division', 'Department', 'Count'])
                for netid, count in self.author_counts.items():
                    try:
                        person = User.objects.get(
                            username=netid).get_profile().esd_data()
                        writer.writerow([
                            person.directory_name, person.division_name,
                            person.department_shortname, count
                        ])
                    except (User.DoesNotExist, UserProfile.DoesNotExist,
                            EsdPerson.DoesNotExist):
                        self.output(
                            0,
                            "At least one part (User, Profile, ESD) for netid  %s could not be found"
                            % netid)

        if options['lead']:
            with open("lead_report.csv", 'w') as report:
                writer = csv.writer(report)
                writer.writerow(['Division', 'Count'])
                for key, count in self.lead_counts.items():
                    writer.writerow([key, count])

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
Example #20
0
    def handle(self, *args, **options):
        '''Regenerate the DC datastream of audio objects and save them.

        Objects are taken from the pids given on the command line, or
        from a content-model search when no pids are given.  Each
        existing object has ``_update_dc()`` called on it and is saved
        unless ``--noact`` is set.  A summary of the counters is written
        to stdout at the end.

        :param args: optional pids to restrict processing to
        :param options: parsed command-line options
        :raises CommandError: if no username is supplied, or if the
            objects cannot be retrieved or paginated
        '''
        # counters
        counts = defaultdict(int)

        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        # connection to repository
        repo = Repository(username=options['username'],
                          password=options['password'])

        try:
            if args:
                # pids specified: use that list
                pid_set = [
                    repo.get_object(pid=p, type=AudioObject) for p in args
                ]
            else:
                # search for audio objects by content model
                pid_set = repo.get_objects_with_cmodel(
                    AudioObject.AUDIO_CONTENT_MODEL, AudioObject)
        except Exception as e:
            raise CommandError('Error gettings pids (%s)' % e)

        try:
            objects = Paginator(pid_set, 20)
            counts['total'] = objects.count
        except Exception as e:
            # nothing can be processed without a paginator; abort cleanly
            # instead of failing later on an unusable `objects`
            raise CommandError("Error paginating items: %s" % e)

        # process all Objects
        for p in objects.page_range:
            try:
                objs = objects.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output("Error getting page: %s : %s " % (p, e))
                counts['errors'] += 1
                continue
            for a in objs:
                try:
                    if not a.exists:
                        self.output("Skipping %s because pid does not exist" %
                                    a.pid)
                        counts['skipped'] += 1
                        continue

                    self.output("Processing %s" % a.pid)

                    a._update_dc()

                    # save object
                    if not options['noact']:
                        a.save("cleanup DC")
                        self.output("SAVED %s" % a.pid)
                        counts['saved'] += 1
                    counts['processed'] += 1
                except Exception as e:
                    # format the exception itself: e.message is deprecated
                    # and empty for exceptions built with several args
                    self.output("Error processing pid: %s : %s " %
                                (a.pid, e))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Total number processed: %s\n" % counts['processed'])
        self.stdout.write("Total number saved: %s\n" % counts['saved'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])
Example #21
0
    def handle(self, *args, **options):
        '''Rebuild the DC identifier list of articles and clear dc:relation.

        For each existing article the DC identifiers are replaced with,
        in order: the PMC id and PMC access URL (only when the
        contentMetadata datastream exists and has content), the MODS ARK
        URI (when set), and the object pid.  The DC relation list is
        emptied.  Objects are saved unless ``--noact`` is set.

        :param args: optional pids to restrict processing to
        :param options: parsed command-line options
        :raises CommandError: if no username is supplied, or if the
            objects cannot be retrieved or paginated
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        counts = defaultdict(int)

        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        # connection to repository
        repo = Repository(username=options['username'], password=options['password'])

        try:
            if args:
                # pids specified: use that list
                pid_set = [repo.get_object(pid=p, type=Publication) for p in args]
            else:
                # search for Articles
                pid_set = repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, Article)
        except Exception as e:
            raise CommandError('Error gettings pids (%s)' % e)

        try:
            articles = Paginator(pid_set, 20)
            counts['total'] = articles.count
        except Exception as e:
            # nothing can be processed without a paginator; abort cleanly
            # instead of failing later on an unusable `articles`
            raise CommandError("Error paginating items: %s" % e)

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e))
                counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(1, "Skipping %s because pid does not exist" % article.pid)
                        counts['skipped'] += 1
                        continue

                    self.output(0, "Processing %s" % article.pid)

                    mods = article.descMetadata.content
                    nlm = article.contentMetadata.content if article.contentMetadata.exists else None
                    identifiers = []

                    # PMC info
                    if nlm:
                        pmc = nlm.docid
                        identifiers.extend(['PMC%s' % pmc, pmc_access_url(pmc)])

                    if mods.ark_uri:
                        identifiers.append(mods.ark_uri)

                    identifiers.append(article.pid)

                    article.dc.content.identifier_list = identifiers

                    # remove every dc:relation entry
                    article.dc.content.relation_list = []

                    # save article
                    if not options['noact']:
                        article.save()
                        self.output(1, "SAVED")
                except Exception as e:
                    # format the exception itself: e.message is deprecated
                    # and empty for exceptions built with several args
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])
Example #22
0
    def handle(self, *pids, **options):
        '''Point volume ARKs at their landing page and PDF URLs.

        For every existing volume whose DC identifier is an ARK that the
        pid manager knows about, the unqualified ARK target is set to
        ``self.volume_url(obj)`` and the ``PDF`` qualified target to
        ``self.pdf_url(obj)``.  In dry-run mode no targets are changed,
        but objects that would be updated are still counted.

        :param pids: optional pids; when given only these are processed,
            otherwise all objects with the volume content model
        :param options: parsed command-line options (``dry_run``,
            ``verbosity``)
        :raises CommandError: if the pid manager client cannot be created
        '''
        dry_run = options.get('dry_run', False)
        verbosity = int(options.get('verbosity', self.v_normal))
        verbose = verbosity >= self.v_normal

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        if pids:
            # pids specified on command line: only process those objects
            objs = [repo.get_object(pid, type=Volume) for pid in pids]
        else:
            # otherwise, look for all volume objects in fedora
            objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                type=Volume)

        stats = defaultdict(int)
        for obj in objs:
            if not obj.exists:
                if verbose:
                    self.stdout.write('%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            # objects without an ARK identifier are counted but otherwise
            # left alone (neither skipped nor updated)
            if not is_ark(obj.dc.content.identifier):
                continue

            noid = parse_ark(obj.dc.content.identifier)['noid']
            try:
                # lookup is only used to confirm the ARK exists
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbose:
                    if '404: NOT FOUND' in str(err):
                        self.stdout.write('Error retriving ARK information for %s: Not Found' % obj.pid)
                    else:
                        self.stdout.write('Error retriving ARK information for %s' % obj.pid)
                stats['skipped'] += 1
                continue

            if not dry_run:
                # update unqualified ark to resolve to readux volume landing page
                pidman.update_ark_target(noid,
                                         target_uri=self.volume_url(obj),
                                         active=True)
                # we expected a qualified ARK target for the PDF; update
                # whether it currently exists or not
                # FIXME: catch possible exceptions here?
                pidman.update_ark_target(noid, 'PDF',
                                         target_uri=self.pdf_url(obj),
                                         active=True)
            # count as updated in dry run mode too (would be updated)
            stats['updated'] += 1

        # output summary
        if verbose:
            # two-stage formatting: counters first, then plural suffix and
            # the dry-run wording through the escaped %%s placeholders
            msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            msg = msg % ('s' if stats['objs'] != 1 else '', ' would have' if dry_run else '')
            self.stdout.write(msg)
Example #23
0
class Command(BaseCommand):
    r'''Fetches `~openemory.publication.models.Article` objects from Fedora and fixes the DC and MODS checksums:
     1. Replaces '\r' with '' in abstract field.
     2. Save object. Note: this will make a new version of the mods and copy some fields to the DC
     If PIDs are provided in the arguments, that list of pids will be used instead of searching Fedora.
    '''
    # NOTE: the docstring is a raw string because it doubles as the help
    # text below; a non-raw '\r' would put a real carriage return in it.
    args = "[pid pid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option(
            '--noact',
            '-n',
            action='store_true',
            default=False,
            help=
            'Reports the pid and total number of Articles that would be processed but does not really do anything.'
        ),
        make_option('--username',
                    action='store',
                    help='Username of fedora user to connect as'),
        make_option(
            '--password',
            action='store',
            help=
            'Password for fedora user,  password=  will prompt for password'),
        make_option(
            '--host',
            action='store',
            default=settings.FEDORA_ROOT,
            help=
            'Hostname of fedora instance.  Defaults to localsettings:FEDORA_ROOT.'
        ),
    )

    def handle(self, *args, **options):
        '''Strip carriage returns from article abstracts and re-save.

        An article is fixed only when it has a non-empty abstract
        containing a carriage return AND its DC datastream fails checksum
        validation; every other article is counted as skipped.  With
        ``--noact`` nothing is saved, but articles that would be fixed are
        still counted as fixed.

        :param args: optional pids; when given, only those objects are
            considered (see :meth:`get_pids`)
        :param options: parsed command-line options
        :raises CommandError: if no username was supplied
        '''
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        counts = defaultdict(int)

        # check required options; prompt for a password if none was given
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        # connection to repository
        self.repo = Repository(options['host'],
                               username=options['username'],
                               password=options['password'])

        if args:
            # pids specified on the command line: use that list
            article_set = self.get_pids(args)
        else:
            # search for Articles in Fedora
            article_set = self.repo.get_objects_with_cmodel(
                Article.ARTICLE_CONTENT_MODEL, type=Article)

        # process all Articles
        for a in article_set:
            try:
                self.output(1, "Processing %s" % a.pid)

                mods = a.descMetadata.content
                needs_fix = (mods.abstract is not None
                             and mods.abstract.text
                             and '\r' in mods.abstract.text
                             and not a.dc.validate_checksum())
                if not needs_fix:
                    self.output(1, "Skipping %s" % a.pid)
                    counts['skip'] += 1
                    continue

                mods.abstract.text = mods.abstract.text.replace('\r', '')
                # save article
                try:
                    if not options['noact']:
                        a.save("Removing backslash-r to fix checksums")
                except Exception as e:
                    # format the exception itself: e.message is deprecated
                    # and empty for exceptions built with several args
                    self.output(
                        0, "Error processing pid: %s : %s " % (a.pid, e))
                    counts['errors'] += 1
                else:
                    # only count as fixed when the save did not fail
                    counts['fixed'] += 1
            except Exception as e:
                self.output(1, "Error on %s: %s" % (a.pid, e))
                counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Fixed: %s\n" % counts['fixed'])
        self.stdout.write("Skipped: %s\n" % counts['skip'])
        self.stdout.write("Errors: %s\n" % counts['errors'])

    def get_pids(self, pids):
        '''Yield a repository object for each pid that is an Article.

        Only the first content model reported by the object is compared
        against the article content model; objects reporting no models
        at all are silently left out.

        :param pids: iterable of pid strings
        '''
        for p in pids:
            obj = self.repo.get_object(pid=p, type=Article)
            models = obj.get_models()
            # guard against an empty model list instead of raising IndexError
            if models and str(models[0]) == Article.ARTICLE_CONTENT_MODEL:
                yield obj

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity

        :param v: minimum verbosity level at which ``msg`` is written
        :param msg: text to write to stdout (a newline is appended)
        '''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)