Ejemplo n.º 1
0
class TestMigrateRushdie(TestCase):
    # XML used by setUp as the content of the legacy "MARBL-MACTECH" datastream;
    # test__convert_ds checks that these values show up in the converted object.
    MM_FIXTURE ='''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0">
  <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5>
  <macfs:file>
    <macfs:computer>Performa 5400</macfs:computer>
    <macfs:path>/Hard Disk/MIDNIGHT&apos;S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path>
    <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath>
    <macfs:attributes>avbstclInmedz</macfs:attributes>
    <macfs:created>1997-01-19T19:29:32</macfs:created>
    <macfs:modified>1997-01-19T19:29:32</macfs:modified>
    <macfs:type>TEXT</macfs:type>
    <macfs:creator>ttxt</macfs:creator>
  </macfs:file>
</macfs:document>'''

    # XML used by setUp as the content of the legacy "MARBL-ANALYSIS" datastream
    # (series, subseries and verdict for the test object).
    MA_FIXTURE ='''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0">
  <marbl:series>Writings by Rushdie</marbl:series>
  <marbl:subseries>Fiction</marbl:subseries>
  <marbl:verdict>As is</marbl:verdict>
</marbl:analysis>'''

    # Series lookup handed to Command._convert_ds by the tests: keyed by series
    # title, with 'series_info' and a 'subseries_info' dict keyed by subseries title.
    SERIES_FIXTURE = {'Writings by Rushdie':
              { 'series_info':
                   {'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                        'id': 'rushdie1000_series2',
                        'short_id': 'series2',
                        'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2'},
              'subseries_info': {   'Fiction': {   'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                                            'id': 'rushdie1000_subseries2.1',
                                            'short_id': 'subseries2.1',
                                            'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1'}}}}

    def setUp(self):
        """Create the repository fixtures and a hand-configured migrate command.

        The pid of every saved object is recorded in ``self.pids`` so that
        ``tearDown`` can purge it again.
        """
        repo = Repository()
        self.repo = repo
        self.pids = []

        def ingest(obj_type, label):
            # create, label and save a new object, remembering its pid for cleanup
            new_obj = repo.get_object(type=obj_type)
            new_obj.label = label
            new_obj.save()
            self.pids.append(new_obj.pid)
            return new_obj

        # simple collection, master collection and the object to be migrated
        self.sc = ingest(SimpleCollection, "SimpleCollection For Test")
        self.mc = ingest(CollectionObject, "MasterCollection For Test")
        self.digObj = ingest(RushdieArrangementFile, "Object For Test")

        # attach the two legacy datastreams to the saved object
        legacy = self.digObj
        legacy_streams = (
            ("MARBL-MACTECH", self.MM_FIXTURE),
            ("MARBL-ANALYSIS", self.MA_FIXTURE),
        )
        for ds_id, ds_xml in legacy_streams:
            legacy.api.addDatastream(legacy.pid, ds_id, ds_id,
                                     mimeType="application/xml", content=ds_xml)

        # Remove Arrangement model so it can be added later
        legacy.rels_ext.content.remove(
            (legacy.uriref, modelns.hasModel,
             "info:fedora/emory-control:Arrangement-1.0"))
        legacy.save()

        # configure the command by hand instead of running handle()
        command = migrate_rushdie.Command()
        command.verbosity = 1
        command.v_normal = 1
        command.v_none = 0
        command.simple_collection = self.sc
        command.stdout = sys.stdout
        command.CONTENT_MODELS = CONTENT_MODELS
        command.repo = repo
        self.cmd = command

    def tearDown(self):
        """Purge every object recorded in ``self.pids`` from the repository."""
        for created_pid in self.pids:
            self.repo.purge_object(created_pid)


    def test__add_to_simple_collection(self):
        """``_add_to_simple_collection`` should record a hasMember relation
        from the simple collection to the object in the collection's RELS-EXT."""
        self.cmd._add_to_simple_collection(self.digObj)
        membership = (self.sc.uriref, relsextns.hasMember, self.digObj.uriref)
        # assertIn reports the missing triple on failure, unlike assertTrue(x in y)
        self.assertIn(membership, self.sc.rels_ext.content,
                      "%s should be a member of the SimpleCollection" % self.digObj.pid)


    def test__get_unique_objects(self):
        """A pid that is listed twice should only be returned once."""
        repeated_pids = [self.digObj.pid] * 2
        unique = self.cmd._get_unique_objects(repeated_pids)
        self.assertEqual(len(unique), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        """``_convert_ds`` should map the legacy datastream values onto the
        converted object and drop the legacy datastreams."""
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE, False)

        # filetech: values come from the MARBL-MACTECH fixture
        expected_filetech = (
            ("md5", "ffcf48e5df673fc7de985e1b859eeeec"),
            ("computer", "Performa 5400"),
            ("path", "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles"),
            ("rawpath", "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM="),
            ("attributes", "avbstclInmedz"),
            ("created", "1997-01-19T19:29:32"),
            ("modified", "1997-01-19T19:29:32"),
            ("type", "TEXT"),
            ("creator", "ttxt"),
        )
        for field, value in expected_filetech:
            self.assertEqual(getattr(obj.filetech.content.file[0], field), value)

        # MODS: subseries first, then the series it is nested in
        lookup = self.SERIES_FIXTURE["Writings by Rushdie"]
        # (MODS attribute, key in the series lookup)
        mods_to_lookup = (
            ("uri", "uri"),
            ("base_ark", "base_ark"),
            ("full_id", "id"),
            ("short_id", "short_id"),
        )

        subseries_info = lookup["subseries_info"]["Fiction"]
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        for mods_attr, key in mods_to_lookup:
            self.assertEqual(getattr(obj.mods.content.series, mods_attr),
                             subseries_info[key])

        series_info = lookup["series_info"]
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
        for mods_attr, key in mods_to_lookup:
            self.assertEqual(getattr(obj.mods.content.series.series, mods_attr),
                             series_info[key])

        # Rights
        self.assertEqual(obj.rights.content.access_status.code, "2")

        # RELS-EXT
        member_of_master = (obj.uriref, relsextns.isMemberOf, self.mc.uriref)
        self.assertTrue(member_of_master in obj.rels_ext.content,
                        "Object should have isMember relation to master collection")
        allowed_model = (obj.uriref, modelns.hasModel,
                         URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0"))
        self.assertTrue(allowed_model in obj.rels_ext.content,
                        "Object should have Allowed Content Model")

        # label, owner and DC title
        self.assertEqual(obj.label, "x - the roles", "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project", "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles", "DC title should be set to last part of path")

        # datastreams: reload the object from the repository to see the update
        reloaded = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertFalse("MARBL-MACTECH" in reloaded.ds_list, "MARBL-MACTECH should have been removed")
        self.assertFalse("MARBL-ANALYSIS" in reloaded.ds_list, "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        """Conversion should still set series and subseries titles when the
        lookup has no subseries info for the series."""
        from copy import deepcopy

        # Deep copy: a shallow dict.copy() shares the nested dicts, so the del
        # below would strip 'subseries_info' from the class-level fixture and
        # leak into every test that runs afterwards.
        series = deepcopy(self.SERIES_FIXTURE)
        del series["Writings by Rushdie"]["subseries_info"]
        # pass the modified lookup (not the pristine class fixture)
        obj = self.cmd._convert_ds(self.digObj, self.mc, series, False)

        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
Ejemplo n.º 2
0
class EmailMessageTest(KeepTestCase):

    def setUp(self):
        """Create an unsaved EmailMessage with sender, recipient and subject set."""
        self.repo = Repository()
        self.pids = []

        # test EmailMessage (never saved, so nothing is added to self.pids)
        message = self.repo.get_object(type=EmailMessage)
        cerp_fields = (
            ('from_list', ['*****@*****.**']),
            ('to_list', ['*****@*****.**']),
            ('subject_list', ['Interesting Subject']),
        )
        for field, values in cerp_fields:
            setattr(message.cerp.content, field, values)
        self.email = message

    def tearDown(self):
        """Purge any objects whose pids were recorded during the test."""
        for saved_pid in self.pids:
            self.repo.purge_object(saved_pid)

    def test_headers(self):
        """CERP headers should be retrievable by name through ``email.headers``."""
        expected = (
            ("HEADER 1", "value for header 1"),
            ("HEADER 2", "value for header 2"),
        )
        for header_name, header_value in expected:
            header = cerp.Header()
            header.name = header_name
            header.value = header_value
            self.email.cerp.content.headers.append(header)

        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')


    def test_email_label(self):
        """``email_label`` should build a label from sender, first recipient,
        subject and date, and return an existing object label unchanged."""
        # Expected labels are derived from the fixture addresses set in setUp,
        # so the assertions cannot drift from the fixture values.
        sender = self.email.cerp.content.from_list[0]
        first_recipient = self.email.cerp.content.to_list[0]
        prefix = 'Email from %s to %s' % (sender, first_recipient)

        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual('%s Interesting Subject' % prefix,
                         label,
                         'Should construct label when it does not exist')

        # more than one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual('%s et al. Interesting Subject' % prefix,
                         label,
                         'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual('%s et al.' % prefix,
                         self.email.email_label(),
                         'Display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual('%s et al. on Friday 13 200 13:00' % prefix,
                         label,
                         'Date header value should be included in the label when present')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label, 'label should be preserved when it exists')

    def test_index_data(self):
        """Index data should include a label, and ``content_md5`` only when
        the mime data datastream exists."""
        # NOTE: logic for creating the label is in the label test

        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data)
        # mime_data does not exist, so no checksum should be indexed
        self.assertNotIn('content_md5', data,
                         'content_md5 should not be set when mime data does not exist')

        # patch mime data to test exists / checksum
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'

            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum, data['content_md5'])

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        """``EmailMessage.by_checksum`` should raise when zero or several
        matches come back from Solr and return an EmailMessage for exactly one."""
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum,
                          42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(content_md5=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum,
                          42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        # assertIsInstance replaces the deprecated assert_(isinstance(...)) form
        self.assertIsInstance(em, EmailMessage)

        # custom repo object: the lookup should go through the repo passed in
        mockrepo = Mock()
        EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        """``EmailMessage.by_message_id`` should raise ObjectDoesNotExist when
        nothing matches, after querying Solr on ``arrangement_id``."""
        message_id = '<*****@*****.**>'

        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          message_id)

        # the lookup filters on the id plus the arrangement content model
        # and only asks Solr for the pid field
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id=message_id,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
Ejemplo n.º 3
0
class Command(BaseCommand):
    '''Read CSV file and creates (or adds to) a Simple Collection and associated ArrangementObjects
    with the SimpleCollection and the Master collection'''
    def get_password_option(option, opt, value, parser):
        """Option callback: store the result of ``getpass()`` under the
        option's destination on the parser values."""
        parsed_values = parser.values
        setattr(parsed_values, option.dest, getpass())

    # Command-specific options, appended to the stock BaseCommand options.
    option_list = BaseCommand.option_list + (
        # dry run: stored under the key 'no-act' in the options dict
        make_option('--noact', '-n',
                    dest='no-act',
                    action='store_true',
                    default=False,
                    help='Does not create PIDs or ingest anything into Fedora. Only parses file and outputs results'),
        # pid of an existing SimpleCollection to add to
        make_option('--add', '-a',
                    dest='add',
                    action='store',
                    help='adds to the SimpleCollection specified by pid, does not create a new SimpleCollection'),
        # repository credentials
        make_option('--username', '-u',
                    dest='username',
                    action='store',
                    help='Username to connect to fedora'),
        # the callback fills in the password value via getpass()
        make_option('--password',
                    dest='password',
                    action='callback',
                    callback=get_password_option,
                    help='Prompt for password required when username used'),
    )

    # positional arguments shown in the usage line
    args = '<CSV file> <master collection pid> <new simple collection name>'
    # inside the class body __doc__ is the class docstring, reused as help text
    help = __doc__

    def _create_series_lookup(self):
        """Populate ``self.series`` with series and subseries details.

        The result is keyed by series title. Each entry holds a
        ``'series_info'`` dict (``id``, ``short_id``, ``base_ark``, ``uri``)
        and, when the series has subseries, a ``'subseries_info'`` dict keyed
        by subseries title with the same four fields.
        """
        #series / subseries info
        self.series = {}

        # query the 'rushdie1000' document, also returning its eadid
        matches = Series.objects.also('eadid').filter(eadid='rushdie1000')
        for s in matches:
            entry = {
                'series_info': {
                    'id': s.id,
                    'short_id': s.short_id,
                    'base_ark': s.eadid.url,
                    'uri': "https://findingaids.library.emory.edu/documents/%s/%s" %
                           (s.eadid.value, s.short_id),
                },
            }

            if s.subseries:
                subseries_lookup = {}
                for sub in s.subseries:
                    subseries_lookup[sub.title] = {
                        'id': sub.id,
                        'short_id': sub.short_id,
                        # subseries reuse the url of the parent document's eadid
                        'base_ark': s.eadid.url,
                        'uri': "https://findingaids.library.emory.edu/documents/%s/%s/%s" %
                               (s.eadid.value, s.short_id, sub.short_id),
                    }
                entry['subseries_info'] = subseries_lookup

            self.series[s.title] = entry

    def _create_arrangement(self, row):
        """Build an unsaved ArrangementObject from one CSV row.

        ``row`` is a dict with the keys ``id``, ``checksum``, ``filename``,
        ``rec_type``, ``creator``, ``attrib``, ``created``, ``modified`` and
        ``computer``. The object gets its filetech, DC, rights and (when the
        record type is a known series) MODS series metadata, an isMemberOf
        relation to ``self.master_obj`` and state "I". It is returned without
        being saved.
        """
        # The raw path is the base64 of the undecoded filename bytes;
        # path and creator are decoded from UTF-8 for the other mappings.
        filename_bytes = row["filename"]
        rawpath = base64.encodestring(filename_bytes)
        path = unicode(filename_bytes, 'utf8')
        creator = unicode(row["creator"], 'utf8')
        # last path segment, used for both the label and the DC title
        basename = path.rpartition('/')[2]

        # set values in filetech DS
        obj = self.repo.get_object(type=ArrangementObject)
        obj.label = basename
        obj.filetech.content.file.append(FileMasterTech_Base())
        filetech_values = (
            ('local_id', row['id']),
            ('md5', row['checksum']),
            ('computer', row['computer']),
            ('path', path),
            ('rawpath', rawpath),
            ('attributes', row['attrib']),
            ('created', row['created']),
            ('modified', row['modified']),
            ('creator', creator),
        )
        for field, value in filetech_values:
            setattr(obj.filetech.content.file[0], field, value)

        #map DC title
        obj.dc.content.title = basename

        #map default verdict of 10 "Undetermined" in rights DS
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = "10"

        #map series in MODS; the stripped record type is the series lookup key
        series_title = row["rec_type"].strip()
        if series_title in self.series:
            obj.mods.content.create_series()
            obj.mods.content.series.title = series_title
            series_info = self.series[series_title]["series_info"]
            obj.mods.content.series.uri = series_info["uri"]
            obj.mods.content.series.base_ark = series_info["base_ark"]
            obj.mods.content.series.full_id = series_info["id"]
            obj.mods.content.series.short_id = series_info["short_id"]
        elif self.verbosity > self.v_none:
            self.stdout.write("Series %s not found\n" % row["rec_type"])

        # set association to master collection
        obj.rels_ext.content.add(
            (obj.uriref, relsextns.isMemberOf, self.master_obj.uriref))
        if self.verbosity > self.v_normal:
            self.stdout.write(
                "Adding %s isMemberOf %s relation on ArrangementObject\n" %
                (obj.label, self.master_obj.pid))

        #set state to inactive by default
        obj.state = "I"
        return obj

    def handle(self, *args, **options):
        """Create ArrangementObjects from a CSV file and attach them to a
        SimpleCollection and the master collection.

        Positional arguments: the CSV file path, the master collection pid
        and, unless ``--add`` is used, the name of the new SimpleCollection.
        With ``no-act`` set nothing is saved; rows are only parsed and
        reported. If saving the SimpleCollection fails, the arrangement
        objects saved during this run are purged again.

        Raises CommandError when a required argument is missing, the master
        collection does not exist, the SimpleCollection given with ``--add``
        cannot be loaded, or the CSV file cannot be opened.
        """
        #collect arrangement pids here to delete later if SimpleCollection fails to save
        self.arrangement_pids = []
        self._create_series_lookup()

        #0 = none, 1 = normal, 2 = all
        self.v_none = 0
        self.v_normal = 1

        if 'verbosity' in options:
            self.verbosity = int(options['verbosity'])
        else:
            self.verbosity = self.v_normal

        #Create the repo
        repo_args = {}
        if options.get('username') is not None:
            repo_args['username'] = options.get('username')
        if options.get('password') is not None:
            repo_args['password'] = options.get('password')
        self.repo = Repository(**repo_args)

        #Check to make sure all args and options are present
        try:
            csv_path = args[0]
        except IndexError:
            raise CommandError("No CSV file specified")

        try:
            self.master_pid = args[1]
        except IndexError:
            raise CommandError("No master collection pid specified")

        #if -a or --add is used the new SimpleCollection name is ignored
        if options["add"]:
            self.simple_collection_pid = options["add"]
        else:
            try:
                self.simple_collection_name = args[2]
            except IndexError:
                raise CommandError(
                    "An existing SimpleCollection pid must be specified with the -a option or "
                    "a new SimpleCollection name must be specified as an argument")

        #If Master collection does not exist then raise an exception
        self.master_obj = self.repo.get_object(type=CollectionObject,
                                               pid=self.master_pid)

        if not self.master_obj.exists:
            raise CommandError("Master Collection %s does not exist" %
                               (self.master_pid))
        else:
            if self.verbosity > self.v_none:
                self.stdout.write("Using Master Collection: %s(%s)\n" %
                                  (self.master_obj.label, self.master_obj.pid))

        #Get or create SimpleCollection object
        if options["add"]:
            # Only the lookup of an existing pid is guarded: the handler
            # reports simple_collection_pid, which is set only in this branch.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit through.
            try:
                simple_collection = self.repo.get_object(
                    type=SimpleCollection, pid=self.simple_collection_pid)
            except Exception:
                raise CommandError("Pid %s does not exist" %
                                   self.simple_collection_pid)
        else:
            simple_collection = self.repo.get_object(type=SimpleCollection)
            simple_collection.label = self.simple_collection_name
            simple_collection.dc.content.title = self.simple_collection_name
            simple_collection.mods.content.create_restrictions_on_access()
            simple_collection.mods.content.restrictions_on_access.text = "Accessioned"

        #try to open the CSV file
        try:
            csv_file = open(csv_path, 'rb')
        except IOError:
            raise CommandError("Could not read file %s" % csv_path)

        csv_read = 0
        arrangement_saved = 0
        errors = 0

        # the with-block closes the CSV handle even if processing fails
        with csv_file:
            #read file into dicts and assign the field names
            reader = csv.DictReader(csv_file,
                                    fieldnames=[
                                        "id", "checksum", "filename",
                                        "rec_type", "file_type", "creator",
                                        "attrib", "created", "modified",
                                        "computer", "size"
                                    ])
            if self.verbosity > self.v_none:
                self.stdout.write("Reading CSV: %s\n" % (csv_path))

            # skip the header row in CSV file; the default keeps an empty
            # file from raising StopIteration
            next(reader, None)

            #read each row
            for row in reader:
                try:
                    csv_read += 1
                    arrangement_object = self._create_arrangement(row)

                    if not options['no-act']:
                        try:
                            arrangement_object.save()
                            arrangement_saved += 1
                            self.arrangement_pids.append(arrangement_object.pid)
                            if self.verbosity > self.v_none:
                                self.stdout.write(
                                    "Saved ArrangementObject %s(%s)\n" %
                                    (arrangement_object.label,
                                     arrangement_object.pid))
                        except Exception as e:
                            # format the exception itself: e.message is
                            # deprecated and empty for multi-argument exceptions
                            if self.verbosity > self.v_none:
                                self.stdout.write(
                                    "Error saving ArrangementObject %s: %s\n" %
                                    (arrangement_object.label, e))
                            errors += 1
                    else:
                        if self.verbosity > self.v_none:
                            self.stdout.write("TEST ArrangementObject %s\n" %
                                              (arrangement_object.label))

                    if self.verbosity > self.v_normal:
                        self.stdout.write("===RELS-EXT===\n")
                        for entry in arrangement_object.rels_ext.content:
                            self.stdout.write("%s\n" % list(entry))
                        self.stdout.write("===MODS===\n")
                        self.stdout.write(
                            "%s\n" % arrangement_object.mods.content.serialize())

                    #Add each ArrangementObject to the SimpleCollection
                    # NOTE(review): this also runs when the save above failed;
                    # confirm whether such objects should be skipped.
                    relation = (simple_collection.uriref, relsextns.hasMember,
                                arrangement_object.uriref)
                    simple_collection.rels_ext.content.add(relation)
                    if self.verbosity > self.v_normal:
                        self.stdout.write(
                            "Adding hasMember %s relation on SimpleCollection\n" %
                            (arrangement_object.pid))
                except Exception as e:
                    self.stdout.write("Error in record id %s: %s\n" %
                                      (row["id"], e))
                    errors += 1

        if not options['no-act']:
            try:
                simple_collection.save()
                self.stdout.write(
                    "Saved SimpleCollection %s(%s)\n" %
                    (simple_collection.label, simple_collection.pid))
            except Exception as e:
                if self.verbosity > self.v_none:
                    self.stdout.write(
                        "Error saving SimpleCollection %s: %s\n" %
                        (simple_collection.label, e))
                    self.stdout.write(
                        "Deleting Arrangement pids so they will not be Orphans\n"
                    )
                errors += 1
                # roll back: purge the arrangement objects saved in this run
                for pid in self.arrangement_pids:
                    self.repo.purge_object(pid)
                    if self.verbosity > self.v_none:
                        self.stdout.write("Deleting: %s\n" % (pid))
                    arrangement_saved -= 1

        else:
            if self.verbosity > self.v_none:
                self.stdout.write("TEST SimpleCollection %s\n" %
                                  (simple_collection.label))

        if self.verbosity > self.v_normal:
            self.stdout.write("===RELS-EXT===\n")
            for entry in simple_collection.rels_ext.content:
                self.stdout.write("%s\n" % list(entry))
            self.stdout.write("===DC===\n")
            self.stdout.write("%s\n" %
                              simple_collection.dc.content.serialize())
            self.stdout.write("===MODS===\n")
            self.stdout.write("%s\n" %
                              simple_collection.mods.content.serialize())

        #print Summary
        self.stdout.write("\n\nSUMMARY\n=======\n")
        self.stdout.write("SimpleCollection: %s(%s)\n" %
                          (simple_collection.label, simple_collection.pid))
        self.stdout.write("Master Collection Object: %s(%s)\n" %
                          (self.master_obj.label, self.master_obj.pid))
        self.stdout.write("%s Records read from CSV file\n" % (csv_read))
        self.stdout.write("%s Records created\n" % (arrangement_saved))
        self.stdout.write("%s Errors\n" % (errors))
Ejemplo n.º 4
0
class ArrangementObjectTest(KeepTestCase):

    def setUp(self):
        """Save a parent collection and build an unsaved arrangement object
        that belongs to it."""
        self.repo = Repository()
        self.pids = []

        # create test collection (saved, so its pid is recorded for cleanup)
        parent = self.repo.get_object(type=CollectionObject)
        parent.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        parent.mods.content.source_id = '12345'
        parent.save()
        self.pids.append(parent.pid)

        # create test arrangement object (not saved here)
        arrangement = self.repo.get_object(type=ArrangementObject)
        arrangement.pid = 'foo:1'
        arrangement.collection = parent
        self.arr = arrangement

    def tearDown(self):
        """Remove the objects saved by ``setUp`` from the repository."""
        for stale_pid in self.pids:
            self.repo.purge_object(stale_pid)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        """``ArrangementObject.by_arrangement_id`` should raise when zero or
        several matches come back from Solr and return an ArrangementObject
        for exactly one."""
        # no match
        self.assertRaises(ObjectDoesNotExist, ArrangementObject.by_arrangement_id,
                          42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(arrangement_id=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, ArrangementObject.by_arrangement_id,
                          42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        # assertIsInstance replaces the deprecated assert_(isinstance(...)) form
        self.assertIsInstance(ao, ArrangementObject)

        # custom repo object: the lookup should go through the repo passed in
        mockrepo = Mock()
        ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        """Setting ``arrangement_status`` should update the object state, and
        an unknown status should raise ValueError."""
        obj = ArrangementObject(Mock())

        # (status assigned, object state expected afterwards)
        known_statuses = (
            ('processed', 'A'),
            ('accessioned', 'I'),
        )
        for status, expected_state in known_statuses:
            obj.arrangement_status = status
            self.assertEqual(expected_state, obj.state)
            self.assertEqual(status, obj.arrangement_status)

        # an unrecognized status must be rejected
        rejected = None
        try:
            obj.arrangement_status = 'bogus'
        except ValueError:
            rejected = True

        self.assertTrue(rejected,
                        'attempting to assign an unknown status should raise a ValueError')

    def test_update_access_cmodel(self):
        """``_update_access_cmodel`` should leave exactly one of the
        restricted / allowed access content models in RELS-EXT, depending on
        the rights access status."""
        obj = ArrangementObject(Mock())

        def model_triple(cmodel):
            # hasModel triple for the object, built at assertion time
            return (obj.uriref, modelns.hasModel, URIRef(cmodel))

        # no status set - should be set to restricted
        obj._update_access_cmodel()

        self.assertIn(model_triple(ACCESS_RESTRICTED_CMODEL), obj.rels_ext.content)
        self.assertNotIn(model_triple(ACCESS_ALLOWED_CMODEL), obj.rels_ext.content)

        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'

        obj._update_access_cmodel()

        self.assertNotIn(model_triple(ACCESS_RESTRICTED_CMODEL), obj.rels_ext.content)
        self.assertIn(model_triple(ACCESS_ALLOWED_CMODEL), obj.rels_ext.content)

    def test_index_data(self):
        """Index data should carry the object type, pid and owner plus the
        pid and source id of the parent collection."""
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEqual(self.arr.collection.mods.content.source_id,
                         idx_data['collection_source_id'])

    # Test the update_ark_label method as overridden for Arrangement objects.
    # Note that this test is a simplified version of
    # keep.common.fedora:ArkPidDigitalObject.test_update_ark_label; the
    # update_ark_label here is an overridden, more specific method.
    @patch('keep.arrangement.models.pidman')  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):
        """``update_ark_label`` should only call pidman for an existing object
        with a DC title, and only update the ARK name when it differs."""
        ao = ArrangementObject(Mock())
        # Set a pid on the object so that it could internally generate a noid etc.
        ao.pid = "test:1234"

        def object_exists():
            # 'exists' is a property, so it has to be patched on the class
            # rather than on the instance
            return patch.object(ArrangementObject, 'exists',
                                new=Mock(return_value=True))

        # Object that does not exist (has not been saved): pidman should not
        # be consulted and no error should be raised.
        ao.update_ark_label()
        self.assertFalse(mockpidman.get_ark.called)

        # Object exists but no DC title has been set yet: still no lookup.
        with object_exists():
            ao.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # Set the title outside the patched block, then exercise the lookups.
        ao.dc.content.title = "testpid"
        with object_exists():
            # pidman already has the same name: looked up, not updated
            mockpidman.get_ark.return_value = {"name": ao.dc.content.title}
            ao.update_ark_label()
            mockpidman.get_ark.assert_called_with(ao.noid)
            self.assertFalse(mockpidman.update_ark.called)

            # pidman has a different name: updated with the DC title
            mockpidman.get_ark.return_value = {"name": "another pid"}
            ao.update_ark_label()
            mockpidman.get_ark.assert_called_with(ao.noid)
            mockpidman.update_ark.assert_called_with(noid=ao.noid,
                                                     name=ao.dc.content.title)

    def test_set_premis_object(self):
        # set_premis_object() should fill in the PREMIS object section:
        # identifier taken from the MODS ARK, an MD5 and a SHA-1 checksum,
        # and the format name taken from the datastream mimetype.
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        # assertTrue is used instead of the deprecated assert_ alias
        self.assertTrue(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        # first checksum: the MD5 reported by the (mocked) datastream object
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789',
                         premis.object.checksums[0].digest)
        # second checksum: SHA-1 of the (empty) content, i.e. sha1 for an empty file
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1,
                         premis.object.checksums[1].digest)
        # object format should be original mimetype
        self.assertEqual('text/plain', premis.object.format.name)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        # identifier_change_event() should append exactly one PREMIS
        # 'identifier assignment' event that records the old and new pid.
        mockapi = Mock()
        # NOTE(review): this value looks redacted, while the agent_id
        # assertion below expects 'fedoraAdmin' -- confirm the intended
        # username before relying on this test.
        mockapi.username = '******'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set, we don't care what it is exactly
        # (assertTrue is used instead of the deprecated assert_ alias)
        self.assertTrue(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__,
                         event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())
Ejemplo n.º 5
0
class EmailMessageTest(KeepTestCase):
    # Tests for EmailMessage: header access, label generation, index data
    # and the by_checksum / by_message_id lookups.

    def setUp(self):
        self.repo = Repository()
        # pids registered here are purged in tearDown
        self.pids = []

        # test EmailMessage
        # NOTE(review): the addresses below ('*****@*****.**') and the
        # addresses in the labels expected by test_email_label
        # ('[email protected]') do not match and look like redaction
        # artifacts -- confirm against the original fixture values.
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = ['*****@*****.**']
        self.email.cerp.content.to_list = ['*****@*****.**']
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        # headers appended to the CERP content should be retrievable by
        # name through the 'headers' mapping on the email object
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] Interesting Subject',
            label, 'Should construct label when it does not exist')

        # more than one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. Interesting Subject',
            label,
            'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual(
            'Email from [email protected] to [email protected] et al.',
            self.email.email_label(),
            'Display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        # The failure message previously duplicated the "et al." message
        # from the multiple-recipient case; it now describes this check.
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. on Friday 13 200 13:00',
            label,
            'label should include the date when a Date header is present')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label,
                         'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is in the label test

        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data)
        # mime_data does not exist, so no checksum should be indexed
        self.assertNotIn(
            'content_md5', data,
            'content_md5 should not be set when mime data does not exist')

        # patch mime data to test exists / checksum
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'

            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum,
                             data['content_md5'])

    @patch('keep.arrangement.models.solr_interface',
           spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            content_md5=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{
            'pid': 'pid:1'
        }, {
            'pid': 'pid:2'
        }]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum,
                          42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assertIsInstance(em, EmailMessage)

        # custom repo object: the lookup should go through the repo passed in
        mockrepo = Mock()
        EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface',
           spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id='<*****@*****.**>',
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
Ejemplo n.º 6
0
class ArrangementObjectTest(KeepTestCase):
    # Tests for ArrangementObject: solr lookup by arrangement id, status
    # handling, access content models, index data, ARK label updates and
    # PREMIS object / event generation.

    def setUp(self):
        self.repo = Repository()
        # pids registered here are purged in tearDown
        self.pids = []

        # create test collection
        coll = self.repo.get_object(type=CollectionObject)
        coll.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        coll.mods.content.source_id = '12345'
        coll.save()
        self.pids.append(coll.pid)

        # create test arrangement object (not saved to the repository here)
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = coll

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    @patch('keep.arrangement.models.solr_interface',
           spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist,
                          ArrangementObject.by_arrangement_id, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{
            'pid': 'pid:1'
        }, {
            'pid': 'pid:2'
        }]
        self.assertRaises(MultipleObjectsReturned,
                          ArrangementObject.by_arrangement_id, 42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        self.assertIsInstance(ao, ArrangementObject)

        # custom repo object: the lookup should go through the repo passed in
        mockrepo = Mock()
        ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        # setting arrangement_status should be reflected in the object
        # state: 'processed' -> 'A', 'accessioned' -> 'I'
        obj = ArrangementObject(Mock())
        obj.arrangement_status = 'processed'
        self.assertEqual('A', obj.state)
        self.assertEqual('processed', obj.arrangement_status)

        obj.arrangement_status = 'accessioned'
        self.assertEqual('I', obj.state)
        self.assertEqual('accessioned', obj.arrangement_status)

        # attempting to assign an unknown status should raise a ValueError
        with self.assertRaises(ValueError):
            obj.arrangement_status = 'bogus'

    def test_update_access_cmodel(self):
        obj = ArrangementObject(Mock())
        restricted = (obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_RESTRICTED_CMODEL))
        allowed = (obj.uriref, modelns.hasModel,
                   URIRef(ACCESS_ALLOWED_CMODEL))

        # no status set - should be set to restricted
        obj._update_access_cmodel()

        self.assertIn(restricted, obj.rels_ext.content)
        self.assertNotIn(allowed, obj.rels_ext.content)

        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'

        obj._update_access_cmodel()

        self.assertNotIn(restricted, obj.rels_ext.content)
        self.assertIn(allowed, obj.rels_ext.content)

    def test_index_data(self):
        # index_data() should report the object type, the pid, the owner
        # and the pid / source id of the collection the object belongs to
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        self.assertEqual(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEqual(self.arr.collection.mods.content.source_id,
                         idx_data['collection_source_id'])

    # Test the update_ark_label method in keep.common.fedora.
    # Note that this test is a simplified version of keep.common.fedora:ArkPidDigitalObject.test_update_ark_label
    # The update_ark_label here is an overridden method that is more specific, and is used on Arrangement objects
    @patch('keep.arrangement.models.pidman'
           )  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):

        # Create an ArrangementObject
        arrangement_object = ArrangementObject(Mock())

        # Set a pid on the object so that it could internally generate a noid etc.
        arrangement_object.pid = "test:1234"

        # Simulate when the object doesn't exist (or hasn't been saved)
        # By default it appears as if it doesn't exist
        arrangement_object.update_ark_label()

        # What we should expect is that the update_ark_label is not called on pidman
        # Also there shouldn't be any errors
        # Use the mock assertFalse to check if a method is called or not
        self.assertFalse(mockpidman.get_ark.called)

        # Mock when the object exists (returns True)
        # Note: Need to set the Mock on the class and not the object because
        # this (exists) is a property method
        with patch.object(ArrangementObject,
                          'exists',
                          new=Mock(return_value=True)):
            arrangement_object.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # Set the label before the object exists so we don't trigger API calls
        arrangement_object.dc.content.title = "testpid"
        with patch.object(ArrangementObject,
                          'exists',
                          new=Mock(return_value=True)):
            mockpidman.get_ark.return_value = {
                "name": arrangement_object.dc.content.title
            }
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(
                arrangement_object.noid
            )  # assert that it is called with a noid too
            self.assertFalse(mockpidman.update_ark.called)

            # When the label is different from that in Pidman
            mockpidman.get_ark.return_value = {"name": "another pid"}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(
                arrangement_object.noid
            )  # assert that it is called with a noid too
            mockpidman.update_ark.assert_called_with(
                noid=arrangement_object.noid,
                name=arrangement_object.dc.content.title)

    def test_set_premis_object(self):
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object,
                          'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        self.assertTrue(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789', premis.object.checksums[0].digest)
        # sha1 for an empty file
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1, premis.object.checksums[1].digest)
        # object format should be original mimetype
        self.assertEqual('text/plain', premis.object.format.name)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        mockapi = Mock()
        # NOTE(review): this value looks redacted, while the agent_id
        # assertion below expects 'fedoraAdmin' -- confirm the intended
        # username before relying on this test.
        mockapi.username = '******'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object,
                          'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set, we don't care what it is exactly
        self.assertTrue(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__,
                         event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())
Ejemplo n.º 7
0
class TestMigrateRushdie(TestCase):
    # Tests for the migrate_rushdie management command, run against a
    # RushdieArrangementFile object carrying the legacy MARBL-MACTECH and
    # MARBL-ANALYSIS datastreams defined by the fixtures below.

    # legacy MARBL-MACTECH datastream content
    MM_FIXTURE = '''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0">
  <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5>
  <macfs:file>
    <macfs:computer>Performa 5400</macfs:computer>
    <macfs:path>/Hard Disk/MIDNIGHT&apos;S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path>
    <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath>
    <macfs:attributes>avbstclInmedz</macfs:attributes>
    <macfs:created>1997-01-19T19:29:32</macfs:created>
    <macfs:modified>1997-01-19T19:29:32</macfs:modified>
    <macfs:type>TEXT</macfs:type>
    <macfs:creator>ttxt</macfs:creator>
  </macfs:file>
</macfs:document>'''

    # legacy MARBL-ANALYSIS datastream content
    MA_FIXTURE = '''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0">
  <marbl:series>Writings by Rushdie</marbl:series>
  <marbl:subseries>Fiction</marbl:subseries>
  <marbl:verdict>As is</marbl:verdict>
</marbl:analysis>'''

    # Series lookup passed to _convert_ds, keyed by series title.
    # This dict is shared by every test in the class, so tests must not
    # modify it in place.
    SERIES_FIXTURE = {
        'Writings by Rushdie': {
            'series_info': {
                'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                'id': 'rushdie1000_series2',
                'short_id': 'series2',
                'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2',
            },
            'subseries_info': {
                'Fiction': {
                    'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                    'id': 'rushdie1000_subseries2.1',
                    'short_id': 'subseries2.1',
                    'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1',
                },
            },
        },
    }

    def setUp(self):
        self.repo = Repository()
        # pids registered here are purged in tearDown
        self.pids = []

        # Create a simple Collection
        self.sc = self.repo.get_object(type=SimpleCollection)
        self.sc.label = "SimpleCollection For Test"
        self.sc.save()
        self.pids.append(self.sc.pid)

        # Create a Master Collection
        self.mc = self.repo.get_object(type=CollectionObject)
        self.mc.label = "MasterCollection For Test"
        self.mc.save()
        self.pids.append(self.mc.pid)

        # Create a DigitalObject carrying the two legacy datastreams
        self.digObj = self.repo.get_object(type=RushdieArrangementFile)
        self.digObj.label = "Object For Test"
        self.digObj.save()
        self.pids.append(self.digObj.pid)
        self.digObj.api.addDatastream(self.digObj.pid,
                                      "MARBL-MACTECH",
                                      "MARBL-MACTECH",
                                      mimeType="application/xml",
                                      content=self.MM_FIXTURE)
        self.digObj.api.addDatastream(self.digObj.pid,
                                      "MARBL-ANALYSIS",
                                      "MARBL-ANALYSIS",
                                      mimeType="application/xml",
                                      content=self.MA_FIXTURE)
        # Remove Arrangement model so it can be added later
        # NOTE(review): the object of this triple is a plain string, while
        # test__convert_ds wraps content-model URIs in URIRef -- confirm
        # that the removal actually matches the stored triple.
        relation = (self.digObj.uriref, modelns.hasModel,
                    "info:fedora/emory-control:Arrangement-1.0")
        self.digObj.rels_ext.content.remove(relation)
        self.digObj.save()

        # Setup Command
        self.cmd = migrate_rushdie.Command()
        self.cmd.verbosity = 1
        self.cmd.v_normal = 1
        self.cmd.v_none = 0
        self.cmd.simple_collection = self.sc
        self.cmd.stdout = sys.stdout
        self.cmd.CONTENT_MODELS = CONTENT_MODELS
        self.cmd.repo = self.repo

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test__add_to_simple_collection(self):
        self.cmd._add_to_simple_collection(self.digObj)
        self.assertIn(
            (self.sc.uriref, relsextns.hasMember, self.digObj.uriref),
            self.sc.rels_ext.content,
            "%s shold be a member of the Simplecollection" % self.digObj.pid)

    def test__get_unique_objects(self):
        # duplicate pids are processed only once
        objs = self.cmd._get_unique_objects([self.digObj.pid, self.digObj.pid])
        self.assertEqual(len(objs), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE,
                                   False)
        # expected values come straight from the series lookup fixture
        series_info = self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]
        subseries_info = \
            self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]

        # Check all fields are moved over correctly

        # filetech
        file_info = obj.filetech.content.file[0]
        self.assertEqual(file_info.md5, "ffcf48e5df673fc7de985e1b859eeeec")
        self.assertEqual(file_info.computer, "Performa 5400")
        self.assertEqual(
            file_info.path,
            "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles")
        self.assertEqual(
            file_info.rawpath,
            "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM="
        )
        self.assertEqual(file_info.attributes, "avbstclInmedz")
        self.assertEqual(file_info.created, "1997-01-19T19:29:32")
        self.assertEqual(file_info.modified, "1997-01-19T19:29:32")
        self.assertEqual(file_info.type, "TEXT")
        self.assertEqual(file_info.creator, "ttxt")

        # MODS: the subseries is stored as the series, with the parent
        # series nested beneath it
        subseries = obj.mods.content.series
        self.assertEqual(subseries.title, "Fiction")
        self.assertEqual(subseries.uri, subseries_info["uri"])
        self.assertEqual(subseries.base_ark, subseries_info["base_ark"])
        self.assertEqual(subseries.full_id, subseries_info["id"])
        self.assertEqual(subseries.short_id, subseries_info["short_id"])
        parent_series = obj.mods.content.series.series
        self.assertEqual(parent_series.title, "Writings by Rushdie")
        self.assertEqual(parent_series.uri, series_info["uri"])
        self.assertEqual(parent_series.base_ark, series_info["base_ark"])
        self.assertEqual(parent_series.full_id, series_info["id"])
        self.assertEqual(parent_series.short_id, series_info["short_id"])

        # Rights
        self.assertEqual(obj.rights.content.access_status.code, "2")

        # RELS-EXT
        self.assertIn(
            (obj.uriref, relsextns.isMemberOf, self.mc.uriref),
            obj.rels_ext.content,
            "Object should have isMember relation to master collection")
        self.assertIn(
            (obj.uriref, modelns.hasModel,
             URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0")),
            obj.rels_ext.content,
            "Object should have Allowed Content Model")

        # Label and DS
        self.assertEqual(obj.label, "x - the roles",
                         "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project",
                         "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles",
                         "DC title should be set to last part of path")

        # DataStreams
        # have to reload obj from repository to get DS update
        obj = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertNotIn("MARBL-MACTECH", obj.ds_list,
                         "MARBL-MACTECH should have been removed")
        self.assertNotIn("MARBL-ANALYSIS", obj.ds_list,
                         "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        # local import: only this test needs the copy module
        import copy

        # Remove subseries info from the lookup.  A deep copy is required:
        # a shallow dict.copy() shares the nested dicts, so deleting the
        # key would permanently alter the class-level SERIES_FIXTURE used
        # by the other tests.
        series = copy.deepcopy(self.SERIES_FIXTURE)
        del series["Writings by Rushdie"]["subseries_info"]
        # pass the modified copy, not the shared fixture
        obj = self.cmd._convert_ds(self.digObj, self.mc, series, False)

        # titles should still be set even without subseries lookup info
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title,
                         "Writings by Rushdie")
Ejemplo n.º 8
0
class Command(BaseCommand):
    # Usage text for this management command (runtime string; the
    # indentation and whitespace inside it are part of the help output).
    help = '''Remove outdated email message metadata objects from the repository
and replace them with email folder and message objects based on 5300c
Eudora files. (One-time import for 5300c content)

   <batch id> pid for the 5300c processing batch object (used to find
	email records to be removed/replaced)

   <eudora base path>   base path for Eudora folder data and index files

'''
    # positional arguments, as displayed in the usage line
    args = '<batch id> <eudora base path>'
    # command-line options, appended to the options BaseCommand provides
    option_list = BaseCommand.option_list + (
        # stored as 'noact' (optparse derives dest from the long option name)
        make_option('-n', '--noact', action='store_true', default=False,
                    help='''Test run: report what would be done, but do not modify
                    anything in the repository'''),
        # stored as 'max_ingest', parsed as an integer
        make_option('-m', '--max', metavar='MAX_NUM', dest='max_ingest', type='int',
                    help='''Stop after ingesting MAX_NUM items'''),
        make_option('--skip-purge', action='store_true', default=False,
                    help='''Skip purging old metadata email records and only
                    ingest email messages (e.g., if purge has already been completed)'''),

        make_option('--purge-only', action='store_true', default=False,
                    help='''Only purge old metadata email records; do not
                    ingest email messages'''),

        # optional fedora credentials
        # (the user name is stored under dest 'user')
        make_option('--user', metavar='FEDORA_USER', dest='user',
            help='''Connect to Fedora as the specified user'''),
        # password is handled by the get_password_opt callback, which is
        # defined outside this class
        make_option('--password', metavar='FEDORA_PASSWORD', dest='password',
            help='''Connect to Fedora with the specified password (leave blank for prompt)''',
            action="callback", callback=get_password_opt, type='string'),

        )
    # default django verbosity levels: 0 = none, 1 = normal, 2 = all
    v_normal = 1

    # email folder names for 5300c
    # key is fake 'path' in arrangement objects; value is original filename
    email_folders = {
        'In': 'In',
        'Out': 'Out',
        'Old-In': 'OLD "IN"',
        'Old-Out': 'OLD "OUT"',
    }
    'known email folders on the 5300c, for identifying current records'
    # regular expression matching a path that begins with one of the
    # email folder keys above followed by a slash, e.g. "Old-In/..."
    email_path_regex = '^(%s)/' % '|'.join(email_folders.keys())

    # maximum number of items to ingest; None means no limit.
    # Overridden per run in handle() when a max is supplied.
    max_ingest = None

    def handle(self, batch_id=None, folder_path=None, verbosity=1, noact=False,
               max_ingest=None, skip_purge=False, purge_only=False, *args, **options):

        # check batch object
        if batch_id is None:
            raise CommandError('Processing batch id is required')
        self.verbosity = int(verbosity)  # ensure we compare int to int
        if max_ingest is not None:
            self.max_ingest = int(max_ingest)

        # check folder path
        if folder_path is None:
            raise CommandError('Eudora folder base path is required')
        if not os.path.isdir(folder_path):
            raise CommandError('Eudora folder path "%s" is not a directory' % folder_path)
        self.noact = noact

        # check for any specified fedora credentials
        fedora_opts = {}
        if 'username' in options:
            fedora_opts['username'] = options['username']
        if 'password' in options:
            fedora_opts['password'] = options['password']
        self.repo = Repository(**fedora_opts)
        batch = self.repo.get_object(batch_id, type=ProcessingBatch)
        if not batch.exists:
            raise CommandError('Processing batch %s not found' % batch_id)
        print 'Looking for email messages in processing batch "%s"' \
              % batch.label

        try:
            pidman = DjangoPidmanRestClient()
        except:
            raise CommandError('Error initializing PID manager client; ' +
                               'please check settings.')

        self.stats = defaultdict(int)
        # purge old metadata email 'arrangement' objects that belong to this batch
        if not skip_purge:
            self.remove_arrangement_emails(batch)
        # ingest new objects for email mailboxes & messages
        if not purge_only:
            self.ingest_email(folder_path)


    def remove_arrangement_emails(self, batch):
        '''Find and iterate over all items that are part of the specified batch.
        Purge email message objects and update the correspending ARK records
        for re-use on ingest.
        '''
        items = list(batch.rels_ext.content.objects(batch.uriref,
                                                    relsext.hasMember))
        for i in items:
            # for now, init as arrangement objects
            obj = self.repo.get_object(str(i), type=ArrangementObject)
            # NOTE: in dev/test, collection currently references all items
            # but only a handful actually exist in dev/test repo; just skip
            if not obj.exists:
                continue

            # number of objects
            self.stats['count'] += 1

            if not obj.filetech.exists or not obj.filetech.content.file:
                print 'Error: no file tech for %s; skipping' % obj.pid
                continue

            # 5300c email messages should only have one file path.
            # Identify email messages by file path starting with
            # email folder name and  no checksum
            file_info = obj.filetech.content.file[0]
            if not re.match(self.email_path_regex, file_info.path) or \
               file_info.md5:
                # not an email message - skip to next item
                continue

            self.stats['email'] += 1

            # if in no-act mode, nothing else to do
            if self.noact:
                continue

            # not in no-act mode : update pid, purge object
            try:
                # reinit client as a workaround for pidman errors (?)
                pidman = DjangoPidmanRestClient()
                # update ark name/domain
                pidman.update_ark(obj.noid,
                                  name=UNUSED_PID_NAME,
                                  domain=settings.PIDMAN_DOMAIN)
                # mark default target as inactive
                pidman.update_ark_target(obj.noid, active=False,
                                         target_uri=UNUSED_PID_URL)
                self.stats['pids'] +=1
                if self.verbosity > self.v_normal:
                    print 'Updated ARK for %s' % obj.noid

            except Exception as e:
                print 'Error updating ARK for %s: %s' % \
                      (obj.noid, e)

            # purge record
            try:
                self.repo.purge_object(obj.pid,
                                  'removing metadata arrangement 5300c email record')
                self.stats['purged'] += 1
                if self.verbosity > self.v_normal:
                    print 'Purged %s' % obj.pid

            except RequestFailed as e:
                self.stats['purge_error'] += 1
                print 'Error purging %s: %s' % (obj.pid, e)

        # summary
        if self.verbosity >= self.v_normal:
            print '''\nChecked %(count)d records, found %(email)d emails''' % self.stats
            if not self.noact:
                print 'Updated %(pids)d ARK(s); purged %(purged)d objects, error purging %(purge_error)d objects' \
                      % self.stats



    def ingest_email(self, folder_base):
        '''Ingest the Eudora email folders found under ``folder_base``.

        For each known email folder, locate the folder data file and its
        ``.toc`` index file, find (or create) the mailbox object they
        belong to, and ingest every message listed in the index via
        :meth:`ingest_message`.  Counts are accumulated in ``self.stats``
        and a summary is printed at normal verbosity or higher.

        :param folder_base: base directory containing the Eudora folder
            data and index files
        '''

        for folder_name, folder_file in self.email_folders.iteritems():
            # Once the ingest limit is reached, stop processing folders
            # altogether; breaking out of the message loop alone would let
            # every remaining folder ingest more items.
            if self.max_ingest and self.stats['ingested'] >= self.max_ingest:
                break

            self.stats['folder'] += 1

            folder_path = os.path.join(folder_base, folder_file)
            folder_toc = os.path.join(folder_base, folder_file + '.toc')
            # if either data or index file is not present, bail out
            if not os.path.isfile(folder_path) or \
                   not os.path.isfile(folder_toc):
                print 'Error: folder files %s not found at base path "%s"' % \
                      (folder_file, folder_base)
                continue

            # find the index/data file objects for this folder in fedora
            # by checksums from the originals;
            # check if they are associated with an existing mailbox object
            mailbox = None

            mbox_obj = self.find_file_object(folder_path)
            if mbox_obj is None:
                # these records should be found in production
                print 'Warning: record not found for folder data file "%s"' % folder_file
            elif mbox_obj.mailbox:
                mailbox = mbox_obj.mailbox

            toc_obj = self.find_file_object(folder_toc)
            if toc_obj is None:
                print 'Warning: record not found for folder index file "%s.toc"' % folder_file
            elif toc_obj.mailbox:
                mailbox = toc_obj.mailbox

            # mailbox not found via folder file objects, so create it
            if mailbox is None:
                if self.verbosity > self.v_normal:
                    print 'Mailbox object for %s not found; creating one' % folder_name

                mailbox = self.repo.get_object(type=MailboxPidReuse)
                desc = 'Rushdie\'s email from his PowerBook 5300c: "%s" folder' % \
                       folder_name
                mailbox.label = desc
                mailbox.dc.content.title = desc
                # mailbox should belong to same collection mailbox files do.
                # Either file object may be missing (see warnings above),
                # so check the data file first and fall back to the index
                # file instead of dereferencing None.
                for file_obj in (mbox_obj, toc_obj):
                    if file_obj is None:
                        continue
                    collection = file_obj.collection or \
                        file_obj._deprecated_collection
                    if collection:
                        mailbox.collection = collection
                        break

                # save to get a pid, add mailbox rel to file objects
                if not self.noact:
                    try:
                        mailbox.save('email folder object for %s' % folder_name)
                        self.stats['ingested'] += 1
                        if self.verbosity >= self.v_normal:
                            print 'Created new mailbox object for %s as %s' % \
                                  (folder_name, mailbox.pid)
                    except RequestFailed as rf:
                        self.stats['ingest_error'] += 1
                        print 'Failed to create folder object for %s in Fedora: %s' % \
                              (folder_name, rf)
                        # Without a saved mailbox there is nothing valid to
                        # associate the file objects or messages with;
                        # move on to the next folder.
                        continue

                    # TODO: fedora error handling for these saves
                    if mbox_obj:
                        mbox_obj.mailbox = mailbox
                        mbox_obj.save('associating with mailbox object')
                        self.stats['updated'] += 1
                    if toc_obj:
                        toc_obj.mailbox = mailbox
                        toc_obj.save('associating with mailbox object')
                        self.stats['updated'] += 1

                # NOTE: should be able to get rushdie collection
                # object from toc/mbox objects, but they seem to have
                # isMemberOf rel instead of isMemberOfCollection (?)
            else:
                # FIXME: boda rel is giving us boda mailbox instead of local
                # arrangement mailbox; re-init as local mailbox
                # for access to parent collection
                mailbox = self.repo.get_object(mailbox.pid, type=MailboxPidReuse)

            # Open both files in binary mode: the index is loaded as a
            # binary file and messages are read from the data file by
            # byte offset and size.
            with open(folder_toc, 'rb') as tocdata:
                with open(folder_path, 'rb') as mbox:
                    toc = eudora.Toc(tocdata)   # load as eudora toc binfile

                    # eudora Toc returns messages in folder order;
                    # pass order in to store in CERP for sorting/display
                    for folder_order, msg in enumerate(toc.messages):
                        self.stats['message'] += 1

                        # get data from mbox file based on msg offset/size
                        mbox.seek(msg.offset)
                        # read message content from mailbox data file
                        msg_data = mbox.read(msg.size)

                        self.ingest_message(msg_data, mailbox, folder_order)
                        # max to ingest for testing
                        if self.max_ingest and self.stats['ingested'] >= self.max_ingest:
                            break

        # summary

        if self.verbosity >= self.v_normal:
            print '''\nProcessed %(folder)d mail folders and %(message)d messages; %(previously_ingested)d messages previously ingested''' % self.stats
            if not self.noact:
                print '''\nCreated %(ingested)d records, updated %(updated)d''' % self.stats
                if self.stats['ingest_error']:
                    print '''Error ingesting %(ingest_error)d records''' % self.stats

    def find_file_object(self, file_path):
        '''Look up the repository object for a file on disk.

        The MD5 checksum of the file at ``file_path`` is calculated and
        used to search Solr for an object with a matching content
        checksum; the first match, if any, is loaded from the repository.

        :param file_path: path of the file to look up
        :returns:  :class:`keep.arrangement.models.RushdieArrangementFile`
            for the first match, or None when nothing matches
        '''
        checksum = md5sum(file_path)
        matches = solr_interface().query(content_md5=checksum) \
                                  .field_limit('pid')
        if not len(matches):
            return None
        return self.repo.get_object(matches[0]['pid'],
                                    type=RushdieArrangementFile)

    def ingest_message(self, msg_data, mailbox, folder_order):

        # read content and redact IP addresses / email addresses
        msg_data = redact_email(msg_data)

        # generate email object from data
        email_msg = email.message_from_string(msg_data,
                                              _class=MacEncodedMessage)

        # check and warn if email has attachments
        attachments = self.email_attachments(email_msg)
        if attachments:
            print 'Warning! Email has attachments (not yet handled): %s' % \
                  ','.join(attachments)

        # get current content type to preserve the original value,
        # and also to determine how to decode
        content_type = email_msg.get('Content-Type', '')
        orig_content_type = email_msg.get_content_type()
        orig_content_charset = email_msg.get_content_charset()

        # at least one email in this set has a charset of 'unknown-8bit',
        # but the \xa0 in the content indicates it is probably latin 1
        if 'charset=unknown-8bit' in content_type:
            latin1_charset = email.charset.Charset('latin_1')
            email_msg.set_charset(latin1_charset)

        # otherwise, if charset is not set, assume mac roman
        elif not email_msg.get_charset():
            # tell email that charset should be mac roman,
            # so it can decode special characters
            mac_charset = email.charset.Charset('mac_roman')
            email_msg.set_charset(mac_charset)
            # decode headers from mac roman charset
            # (some messages contain improperly formatted
            # accented characters in a from/to header)
            email_msg.decode_headers()

        # create a new object to populate with data
        msg_obj = self.repo.get_object(type=EmailMessagePidReuse)

        # generate cerp from mime message
        # - store folder order as message local id
        msg_obj.cerp.content = cerp.Message.from_email_message(email_msg,
                                                               local_id=folder_order)

        # The generated CERP may have modified mac roman charset headers
        # which were needed to convert instead of the original;
        # update thex ml to store the original value,  NOT the encoding
        # that was used to decode the content.
        if content_type:
            if msg_obj.cerp.content.single_body:
                msg_obj.cerp.content.single_body.content_type_list[0] = orig_content_type
                msg_obj.cerp.content.single_body.charset_list[0] = orig_content_charset

        else:
            if msg_obj.cerp.content.single_body:
                del msg_obj.cerp.content.single_body.content_type_list[0]
                del msg_obj.cerp.content.single_body.charset_list[0]
        # loop through headers to set/remove content type
        for h in msg_obj.cerp.content.headers:
            if h.name == 'Content-Type':
                if content_type:
                    h.value = content_type
                else:
                    h.value = None
                    h.name = None
                break

        # construct an object label based on from/to/date/subject
        msg_from = email_msg['From']
        # NOTE: it would be nice to suppress redundant redaction email text here;
        # at least simplify label for rushdie, since that is what we'll see most
        if 'REDACTED: Salman Rushdie\'s email' in msg_from:
            msg_from = 'Salman Rushdie'
        label = u'Email from %s' %  msg_from
        if email_msg.get('To', None):
            # FIXME: could have multiple recipients
            # we *should* be able to get split-out version from email.Message ...
            to = email_msg['To']
            label += u' to %s' % email_msg['To']
        # date/subject not always present, but add if they are
        if email_msg.get('Date', None):
            label += u' on %s' % email_msg['Date']
        if email_msg.get('Subject', None):
            label += u' %s' % email_msg['Subject']

        # set as object label and dc:title
        msg_obj.label = label
        msg_obj.dc.content.title = label

        # in verbose noact mode, print label so user can see what is being done
        if self.verbosity > self.v_normal and self.noact:
            print label

        # generate a pristine email Message for saving fedora
        # (don't save modified charset, content type, etc.)
        msg_obj.mime_data.content = email.message_from_string(msg_data,
                                              _class=MacEncodedMessage)
        # calculate an MD5 of the email content *as it will be serialized*
        md5 = hashlib.md5()
        md5.update(str(msg_obj.mime_data.content))
        email_md5 = md5.hexdigest()
        msg_obj.mime_data.checksum = email_md5


        # check if this email has already been ingested via checksum;
        # don't re-ingest if it is already in the repository
        solr = solr_interface()
        q = solr.query(content_md5=msg_obj.mime_data.checksum).field_limit('pid')
        if len(q):
            if self.verbosity >= self.v_normal:
                print 'Email message has already been ingested as %s; skipping' \
                      % q[0]['pid']
            self.stats['previously_ingested'] += 1
            return


        # associate with current mailbox object
        msg_obj.mailbox = mailbox
        # belongs to same collection as its mailbox
        if mailbox.collection:
            msg_obj.collection = mailbox.collection
        # ingest items as accessioned/unprocessed
        msg_obj.arrangement_status = 'accessioned'
        # ingest with a default rights code of 10 "Undetermined" in rights DS
        msg_obj.rights.content.create_access_status()
        msg_obj.rights.content.access_status.code = "10"
        msg_obj.rights.content.access_status.text = rights_access_terms_dict["10"].text

        if not self.noact:
            try:
                msg_obj.save('ingesting email message from rushdie 5300c')
                if self.verbosity >= self.v_normal:
                    print 'Ingested message %s : %s' % \
                          (msg_obj.pid, msg_obj.label)
                    self.stats['ingested'] += 1
            except RequestFailed as rf:
                self.stats['ingest_error'] += 1
                print 'Error ingesting email message %s: %s' % \
                      (msg_obj.label, rf)

    def email_attachments(self, msg):
        '''List the attachments found in the top-level parts of a message.

        A part counts as an attachment when its Content-Disposition
        header mentions ``attachment`` or when it has a filename.

        :param msg: an email message object
        :returns: list of attachment filenames, one per attachment; a
            placeholder string is used for an attachment without a
            filename, so every entry is a string.  Empty list when the
            message is not multipart or has no attachments.
        '''
        attachments = []
        if msg.is_multipart():
            # NOTE: sub parts could themselves be multipart...
            for part in msg.get_payload():
                filename = part.get_filename()
                if filename or \
                        'attachment' in part.get('Content-Disposition', ''):
                    # get_filename() returns None for an attachment with
                    # no filename; never hand None back to callers that
                    # join the names into a string
                    attachments.append(filename or '(no filename)')

        return attachments
Ejemplo n.º 9
0
class Command(BaseCommand):
    '''Read CSV file and creates (or adds to) a Simple Collection and associated ArrangementObjects
    with the SimpleCollection and the Master collection'''
    # NOTE: the class docstring above doubles as the command help text
    # (see ``help = __doc__`` below), so its wording is user-facing.

    # optparse callback for --password: ignores any value given on the
    # command line and stores the result of an interactive getpass()
    # prompt under the option's dest.
    def get_password_option(option, opt, value, parser):
        setattr(parser.values, option.dest, getpass())

    #Set up additional options
    option_list = BaseCommand.option_list + (
        # test mode flag; stored under the key 'no-act'
        make_option('--noact', '-n',
            action='store_true',
            dest='no-act',
            default=False,
            help='Does not create PIDs or ingest anything into Fedora. Only parses file and outputs results'),
        # pid of an existing SimpleCollection to add to; stored under 'add'
        make_option('--add', '-a',
            action='store',
            dest='add',
            help='adds to the SimpleCollection specified by pid, does not create a new SimpleCollection'),
        make_option('--username', '-u',
            dest='username',
            action='store',
            help='''Username to connect to fedora'''),
        # no type is given, so the option takes no value; the callback
        # prompts for the password instead
        make_option('--password',
            dest='password',
            action='callback', callback=get_password_option,
            help='''Prompt for password required when username used'''),
    )

    # positional arguments, as displayed in the usage line
    args = '<CSV file> <master collection pid> <new simple collection name>'
    # inside the class body __doc__ is the class docstring
    help = __doc__

    def _create_series_lookup(self):
        '''Build ``self.series``, a lookup of series information keyed on
        series title, from the series found for eadid ``rushdie1000``.

        Each entry has a ``series_info`` dictionary (id, short_id,
        base_ark, uri) and, when the series has subseries, a
        ``subseries_info`` dictionary keyed on subseries title holding
        the same four fields for each subseries.
        '''
        #series / subseries info
        self.series = {}

        #exist query: return eadid along with each matching series
        matches = Series.objects.also('eadid').filter(eadid='rushdie1000')
        for ser in matches:
            entry = {
                'series_info': {
                    'id': ser.id,
                    'short_id': ser.short_id,
                    'base_ark': ser.eadid.url,
                    'uri': "https://findingaids.library.emory.edu/documents/%s/%s" % \
                        (ser.eadid.value, ser.short_id),
                },
            }
            #subseries info is only present when there are subseries
            if ser.subseries:
                entry['subseries_info'] = dict(
                    (sub.title, {
                        'id': sub.id,
                        'short_id': sub.short_id,
                        # subseries share the ark of the parent document
                        'base_ark': ser.eadid.url,
                        'uri': "https://findingaids.library.emory.edu/documents/%s/%s/%s" % \
                            (ser.eadid.value, ser.short_id, sub.short_id),
                    })
                    for sub in ser.subseries)
            self.series[ser.title] = entry


    def _create_arrangement(self, row):
        '''Build a new, unsaved ArrangementObject from one CSV row.

        File technical metadata, DC title, a default rights code, series
        information (when the row's record type matches a known series)
        and the isMemberOf relation to the master collection are set;
        the object state is set to inactive.

        :param row: dictionary of CSV values for a single record
        :returns: the populated ArrangementObject (not yet saved)
        '''
        #Account for unicode characters:
        #the raw path keeps the original bytes (base64 encoded), while
        #the other mappings use the value decoded from utf8
        original_name = row["filename"]
        encoded_path = base64.encodestring(original_name)
        decoded_path = unicode(original_name, 'utf8')
        decoded_creator = unicode(row["creator"], 'utf8')
        # last segment of the path; used for both label and DC title
        basename = decoded_path.rpartition('/')[2]

        # set values in filetech DS
        arr = self.repo.get_object(type=ArrangementObject)
        arr.label = basename
        arr.filetech.content.file.append(FileMasterTech_Base())
        tech = arr.filetech.content.file[0]
        tech.local_id = row['id']
        tech.md5 = row['checksum']
        tech.computer = row['computer']
        tech.path = decoded_path
        tech.rawpath = encoded_path
        tech.attributes = row['attrib']
        tech.created = row['created']
        tech.modified = row['modified']
        tech.creator = decoded_creator

        #map DC title
        arr.dc.content.title = basename

        #map default verdict of 10 "Undetermined" in rights DS
        arr.rights.content.create_access_status()
        arr.rights.content.access_status.code = "10"

        #map series in MODS
        #RecordType (whitespace stripped) is used to lookup series info
        series_title = row["rec_type"].strip()
        if series_title in self.series:
            mods = arr.mods.content
            mods.create_series()
            mods.series.title = series_title
            info = self.series[series_title]["series_info"]
            mods.series.uri = info["uri"]
            mods.series.base_ark = info["base_ark"]
            mods.series.full_id = info["id"]
            mods.series.short_id = info["short_id"]
        elif self.verbosity > self.v_none:
            self.stdout.write("Series %s not found\n" % row["rec_type"])

        # set association to master collection
        arr.rels_ext.content.add(
            (arr.uriref, relsextns.isMemberOf, self.master_obj.uriref))
        if self.verbosity > self.v_normal:
            self.stdout.write("Adding %s isMemberOf %s relation on ArrangementObject\n" % (arr.label, self.master_obj.pid))

        #set state to inactive by default
        arr.state = "I"
        return arr



    def handle(self, *args, **options):
        '''Read the CSV file, create an ArrangementObject for each record
        and associate the objects with a new or existing SimpleCollection
        and with the master collection.

        Positional arguments: CSV file path, master collection pid and
        (unless the ``add`` option is used) the name for the new
        SimpleCollection.  In no-act mode nothing is saved.  If the
        SimpleCollection fails to save, the ArrangementObjects saved
        during this run are purged.  A summary is written at the end.

        :raises CommandError: if a required argument is missing, the
            master collection does not exist, the SimpleCollection pid
            cannot be loaded, or the CSV file cannot be opened
        '''
        #collect arrangement pids here to delete later if SimpleCollection fails to save
        self.arrangement_pids = []
        self._create_series_lookup()

        #0 = none, 1 = normal, 2 = all
        self.v_none = 0
        self.v_normal = 1

        if 'verbosity' in options:
            self.verbosity = int(options['verbosity'])
        else:
            self.verbosity = self.v_normal

        # read option values once; tolerate missing keys when the command
        # is called without going through the option parser
        no_act = options.get('no-act', False)
        add_pid = options.get('add')

        #Create the repo
        repo_args = {}
        if options.get('username') is not None:
            repo_args['username'] = options.get('username')
        if options.get('password') is not None:
            repo_args['password'] = options.get('password')
        self.repo = Repository(**repo_args)

        #Check to make sure all args and options are present
        try:
            csv_path = args[0]
        except IndexError:
            raise CommandError("No CSV file specified")

        try:
            self.master_pid = args[1]
        except IndexError:
            raise CommandError("No master collection pid specified")

        #if -a or --add is used the new SimpleCollection name is ignored
        if add_pid:
            self.simple_collection_pid = add_pid
        else:
            try:
                self.simple_collection_name = args[2]
            except IndexError:
                raise CommandError("An existing SimpleCollection pid must be specified with the -a option or \
            a new SimpleCollection name must be specified as an argument")

        #If Master collection does not exist then raise an exception
        self.master_obj = self.repo.get_object(type=CollectionObject, pid=self.master_pid)

        if not self.master_obj.exists:
            raise CommandError("Master Collection %s does not exist" % (self.master_pid))
        if self.verbosity > self.v_none:
            self.stdout.write("Using Master Collection: %s(%s)\n" % (self.master_obj.label, self.master_obj.pid))

        #Get or create SimpleCollection object
        if add_pid:
            # Only the lookup of an existing pid is reported as "does not
            # exist"; the handler uses the local pid value, which is
            # always set on this path.
            try:
                simple_collection = self.repo.get_object(type=SimpleCollection, pid=add_pid)
            except Exception:
                raise CommandError("Pid %s does not exist" % add_pid)
        else:
            simple_collection = self.repo.get_object(type=SimpleCollection)
            simple_collection.label = self.simple_collection_name
            simple_collection.dc.content.title = self.simple_collection_name
            simple_collection.mods.content.create_restrictions_on_access()
            simple_collection.mods.content.restrictions_on_access.text = "Accessioned"

        #try to open the file; it is closed again by the with block below
        try:
            csv_file = open(csv_path, 'rb')
        except IOError:
            raise CommandError("Could not read file %s" % csv_path)

        csv_read = 0
        arrangement_saved = 0
        errors = 0
        with csv_file:
            #read file into dicts and assign the field names
            reader = csv.DictReader(csv_file,
                                    fieldnames=["id","checksum","filename","rec_type","file_type",
                                                "creator","attrib","created","modified","computer","size"])
            if self.verbosity > self.v_none:
                self.stdout.write("Reading CSV: %s\n" % (csv_path))

            # skip the header row in CSV file
            # (default given so that an empty file does not raise StopIteration)
            next(reader, None)

            #read each record
            for row in reader:
                try:
                    csv_read += 1
                    arrangement_object = self._create_arrangement(row)

                    if not no_act:
                        try:
                            arrangement_object.save()
                            arrangement_saved += 1
                            self.arrangement_pids.append(arrangement_object.pid)
                            if self.verbosity > self.v_none:
                                self.stdout.write("Saved ArrangementObject %s(%s)\n" % (arrangement_object.label, arrangement_object.pid))
                        except Exception as e:
                            if self.verbosity > self.v_none:
                                self.stdout.write("Error saving ArrangementObject %s: %s\n" % (arrangement_object.label, e.message))
                            errors += 1
                            # The object was not ingested, so it must not
                            # be added as a member of the SimpleCollection.
                            continue
                    else:
                        if self.verbosity > self.v_none:
                            self.stdout.write("TEST ArrangementObject %s\n" % (arrangement_object.label))

                    if self.verbosity > self.v_normal:
                        self.stdout.write("===RELS-EXT===\n")
                        for entry in arrangement_object.rels_ext.content:
                            self.stdout.write("%s\n" % list(entry))
                        self.stdout.write("===MODS===\n")
                        self.stdout.write("%s\n" % arrangement_object.mods.content.serialize())

                    #Add each ArrangementObject to the SimpleCollection
                    relation = (simple_collection.uriref, relsextns.hasMember, arrangement_object.uriref)
                    simple_collection.rels_ext.content.add(relation)
                    if self.verbosity > self.v_normal:
                        self.stdout.write("Adding hasMember %s relation on SimpleCollection\n" % (arrangement_object.pid))
                except Exception as e:
                    self.stdout.write("Error in record id %s: %s\n" % (row["id"], e))
                    errors += 1

        if not no_act:
            try:
                simple_collection.save()
                self.stdout.write("Saved SimpleCollection %s(%s)\n" % (simple_collection.label, simple_collection.pid))
            except Exception as e:
                if self.verbosity > self.v_none:
                    self.stdout.write("Error saving SimpleCollection %s: %s\n" % (simple_collection.label, e.message))
                    self.stdout.write("Deleting Arrangement pids so they will not be Orphans\n")
                errors += 1
                for pid in self.arrangement_pids:
                    self.repo.purge_object(pid)
                    if self.verbosity > self.v_none:
                        self.stdout.write("Deleting: %s\n" % (pid))
                    arrangement_saved -= 1

        else:
            if self.verbosity > self.v_none:
                self.stdout.write("TEST SimpleCollection %s\n" % (simple_collection.label))

        if self.verbosity > self.v_normal:
            self.stdout.write("===RELS-EXT===\n")
            for entry in simple_collection.rels_ext.content:
                self.stdout.write("%s\n" % list(entry))
            self.stdout.write("===DC===\n")
            self.stdout.write("%s\n" % simple_collection.dc.content.serialize())
            self.stdout.write("===MODS===\n")
            self.stdout.write("%s\n" % simple_collection.mods.content.serialize())

        #print Summary
        self.stdout.write("\n\nSUMMARY\n=======\n")
        self.stdout.write("SimpleCollection: %s(%s)\n" % (simple_collection.label, simple_collection.pid))
        self.stdout.write("Master Collection Object: %s(%s)\n" % (self.master_obj.label, self.master_obj.pid))
        self.stdout.write("%s Records read from CSV file\n" % (csv_read))
        self.stdout.write("%s Records created\n" % (arrangement_saved))
        self.stdout.write("%s Errors\n" % (errors))