Example #1
0
class UpdateTest(unittest.TestCase):

    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))
        self.invalid_bag = BagIt(os.path.join(os.getcwd(), 'test', 'invalid_bag'))

    def tearDown(self):
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'invalid_bag')):
            shutil.rmtree(os.path.join(os.getcwd(), 'test', 'invalid_bag'))

    def test_full_update(self):
        self.bag.update(full=True)
        self.assertEquals(len(self.bag.bag_errors), 0)

    def test_partial_update(self):
        self.bag.update(full=False)
        self.assertEquals(len(self.bag.bag_errors), 0)

    def test_is_valid(self):
        self.bag.update()
        self.assertEquals(self.bag.is_valid(), True)

    def test_not_valid(self):
        os.remove(self.invalid_bag.manifest_file)
        self.invalid_bag.validate()
        self.assertEquals(self.invalid_bag.is_valid(), False)
Example #2
0
class UpdateTest(unittest.TestCase):
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), "test", "testbag"))
        self.invalid_bag = BagIt(
            os.path.join(os.getcwd(), "test", "invalid_bag"))

    def tearDown(self):
        if os.path.exists(os.path.join(os.getcwd(), "test", "invalid_bag")):
            shutil.rmtree(os.path.join(os.getcwd(), "test", "invalid_bag"))

    def test_full_update(self):
        self.bag.update(full=True)
        self.assertEqual(len(self.bag.bag_errors), 0)

    def test_partial_update(self):
        self.bag.update(full=False)
        self.assertEqual(len(self.bag.bag_errors), 0)

    def test_is_valid(self):
        self.bag.update()
        self.assertEqual(self.bag.is_valid(), True)

    def test_not_valid(self):
        os.remove(self.invalid_bag.manifest_file)
        self.invalid_bag.validate()
        self.assertEqual(self.invalid_bag.is_valid(), False)
Example #3
0
 def test_uncompress_tgz(self):
     # Round trip: package a fresh bag with the default method, delete the
     # bag directory, then open the archive and expect a bag directory.
     def in_test_dir(*parts):
         return os.path.join(os.getcwd(), 'test', *parts)

     # Step 1: a BagIt on a new path gives an empty bag to package.
     fresh_bag = BagIt(in_test_dir('newtgzbag'))
     fresh_bag.package(in_test_dir())
     # Step 2: drop the directory so that only newtgzbag.tgz is left.
     shutil.rmtree(in_test_dir('newtgzbag'))
     # Step 3: opening the archive must produce an existing bag directory.
     reopened = BagIt(in_test_dir('newtgzbag.tgz'))
     directory_present = os.path.exists(reopened.bag_directory)
     self.assertTrue(directory_present)
Example #4
0
 def test_uncompress_zip(self):
     # Round trip: package a fresh bag as zip, delete the bag directory,
     # then open the archive and expect a bag directory.
     def in_test_dir(*parts):
         return os.path.join(os.getcwd(), "test", *parts)

     # Step 1: a BagIt on a new path gives an empty bag to package.
     fresh_bag = BagIt(in_test_dir("newzipbag"))
     fresh_bag.package(in_test_dir(), method="zip")
     # Step 2: drop the directory so that only newzipbag.zip is left.
     shutil.rmtree(in_test_dir("newzipbag"))
     # Step 3: opening the archive must produce an existing bag directory.
     reopened = BagIt(in_test_dir("newzipbag.zip"))
     directory_present = os.path.exists(reopened.bag_directory)
     self.assertTrue(directory_present)
Example #5
0
 def test_uncompress_zip(self):
     # Zip round trip: build and package an empty bag, remove its directory,
     # and verify that a BagIt opened on the archive has a bag directory.
     def test_path(*tail):
         return os.path.join(os.getcwd(), 'test', *tail)

     empty_bag = BagIt(test_path('newzipbag'))
     empty_bag.package(test_path(), method='zip')

     # After this only newzipbag.zip should remain.
     shutil.rmtree(test_path('newzipbag'))

     from_archive = BagIt(test_path('newzipbag.zip'))
     self.assertTrue(os.path.exists(from_archive.bag_directory))
Example #6
0
 def setUp(self):
     """Create the bag under test and the fetch entry list used by the tests."""
     bag_location = os.path.join(os.getcwd(), "test", "testbag")
     self.bag = BagIt(bag_location)

     # Single entry: file name, length ("-") and source URL.
     spec_entry = dict(
         filename="data/bagitspec.pdf",
         length="-",
         url="http://www.digitalpreservation.gov/documents/bagitspec.pdf",
     )
     self.test_fetch_contents = [spec_entry]
Example #7
0
 def setUp(self):
     """Create the bag under test and the fetch entry list used by the tests."""
     bag_location = os.path.join(os.getcwd(), 'test', 'testbag')
     self.bag = BagIt(bag_location)

     # Single entry: file name, length (u'-') and source URL.
     spec_entry = dict(
         filename=u'data/bagitspec.pdf',
         length=u'-',
         url=u'http://www.digitalpreservation.gov/documents/bagitspec.pdf',
     )
     self.test_fetch_contents = [spec_entry]
Example #8
0
class FetchTest(unittest.TestCase):
    ### THESE MAY FAIL if the Websites change their files. In that case,
    ### just comment this test out, or replace the assertion with a new
    ### SHA1 Checksum.
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), "test", "testbag"))
        self.test_fetch_contents = [{
            "filename":
            "data/bagitspec.pdf",
            "length":
            "-",
            "url":
            "http://www.digitalpreservation.gov/documents/bagitspec.pdf",
        }]

    def tearDown(self):
        # if os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'bagitspec.pdf')):
        #     os.remove(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'bagitspec.pdf'))
        if os.path.exists(
                os.path.join(os.getcwd(), "test", "testbag", "data",
                             "stealin_mah_bag.jpg")):
            os.remove(
                os.path.join(os.getcwd(), "test", "testbag", "data",
                             "stealin_mah_bag.jpg"))
        if os.path.exists(
                os.path.join(os.getcwd(), "test", "testbag", "data",
                             "bagitspec.pdf")):
            os.remove(
                os.path.join(os.getcwd(), "test", "testbag", "data",
                             "bagitspec.pdf"))
        self.bag.add_fetch_entries(self.test_fetch_contents, append=False)

    def test_fetch_contents(self):
        self.assertEqual(self.bag.fetch_contents, self.test_fetch_contents)

    def test_can_fetch(self):
        self.bag.fetch()
        self.assertTrue(
            os.path.exists(
                os.path.join(os.getcwd(), "test", "testbag", "data",
                             "bagitspec.pdf")))

    def test_can_fetch_and_validate(self):
        self.bag.fetch(validate_downloads=True)
        self.assertEqual(
            self.bag.manifest_contents["data/bagitspec.pdf"],
            "4649c6540ac4e4dcf271ca236abfe62faa4d7f08",
        )

    def set_fetch_contents(self):
        self.bag.add_fetch_entries([{
            "url":
            "http://icanhascheezburger.files.wordpress.com/2007/06/stealing_my_bag.jpg",
            "filename":
            os.path.join("data", "stealin_mah_bag.jpg"),
        }])
        self.assertTrue(
            os.path.exists(
                os.path.join(os.getcwd(), "test", "testbag", "data",
                             "stealin_mah_bag.jpg")))
Example #9
0
class FetchTest(unittest.TestCase):
    ### THESE MAY FAIL if the Websites change their files. In that case,
    ### just comment this test out, or replace the assertion with a new
    ### SHA1 Checksum.
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))
        self.test_fetch_contents = [{'filename': u'data/bagitspec.pdf',
          'length': u'-',
          'url': u'http://www.digitalpreservation.gov/documents/bagitspec.pdf'}]

    def tearDown(self):
        # if os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'bagitspec.pdf')):
        #     os.remove(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'bagitspec.pdf'))
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'stealin_mah_bag.jpg')):
            os.remove(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'stealin_mah_bag.jpg'))
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'bagitspec.pdf')):
            os.remove(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'bagitspec.pdf'))
        self.bag.add_fetch_entries(self.test_fetch_contents, append=False)

    def test_fetch_contents(self):
        self.assertEquals(self.bag.fetch_contents, self.test_fetch_contents)

    def test_can_fetch(self):
        self.bag.fetch()
        self.assertTrue(os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'bagitspec.pdf')))

    def test_can_fetch_and_validate(self):
        self.bag.fetch(validate_downloads=True)
        self.assertEquals(self.bag.manifest_contents['data/bagitspec.pdf'],
            '4649c6540ac4e4dcf271ca236abfe62faa4d7f08')

    def set_fetch_contents(self):
        self.bag.add_fetch_entries([{'url': 'http://icanhascheezburger.files.wordpress.com/2007/06/stealing_my_bag.jpg',
                'filename': os.path.join('data','stealin_mah_bag.jpg')}])
        self.assertTrue(os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag', 'data', 'stealin_mah_bag.jpg')))
Example #10
0
class CompressTest(unittest.TestCase):
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), "test", "testbag"))

    def tearDown(self):
        if os.path.exists(os.path.join(os.getcwd(), "test", "testbag.tgz")):
            os.remove(os.path.join(os.getcwd(), "test", "testbag.tgz"))
        if os.path.exists(os.path.join(os.getcwd(), "test", "testbag.zip")):
            os.remove(os.path.join(os.getcwd(), "test", "testbag.zip"))
        if os.path.exists(os.path.join(os.getcwd(), "test", "newzipbag")):
            shutil.rmtree(os.path.join(os.getcwd(), "test", "newzipbag"))
        if os.path.exists(os.path.join(os.getcwd(), "test", "newtgzbag")):
            shutil.rmtree(os.path.join(os.getcwd(), "test", "newtgzbag"))
        if os.path.exists(os.path.join(os.getcwd(), "test", "newzipbag.zip")):
            os.remove(os.path.join(os.getcwd(), "test", "newzipbag.zip"))
        if os.path.exists(os.path.join(os.getcwd(), "test", "newtgzbag.tgz")):
            os.remove(os.path.join(os.getcwd(), "test", "newtgzbag.tgz"))

    def test_compress_tgz(self):
        self.bag.package(os.path.join(os.getcwd(), "test"))
        self.assertTrue(
            os.path.exists(os.path.join(os.getcwd(), "test", "testbag.tgz"))
        )

    def test_compress_zip(self):
        self.bag.package(os.path.join(os.getcwd(), "test"), method="zip")
        self.assertTrue(
            os.path.exists(os.path.join(os.getcwd(), "test", "testbag.zip"))
        )

    def test_uncompress_tgz(self):
        # create an empty tgz bag.
        newbag = BagIt(os.path.join(os.getcwd(), "test", "newtgzbag"))
        newbag.package(os.path.join(os.getcwd(), "test"))
        # remove the created bag directory
        shutil.rmtree(os.path.join(os.getcwd(), "test", "newtgzbag"))
        # this should leave us with just newtgzbag.tgz
        tgzbag = BagIt(os.path.join(os.getcwd(), "test", "newtgzbag.tgz"))
        self.assertTrue(os.path.exists(tgzbag.bag_directory))

    def test_uncompress_zip(self):
        # create an empty zip bag.
        newbag = BagIt(os.path.join(os.getcwd(), "test", "newzipbag"))
        newbag.package(os.path.join(os.getcwd(), "test"), method="zip")
        # remove the created bag directory
        shutil.rmtree(os.path.join(os.getcwd(), "test", "newzipbag"))
        # this should leave us with just newtgzbag.tgz
        zipbag = BagIt(os.path.join(os.getcwd(), "test", "newzipbag.zip"))
        self.assertTrue(os.path.exists(zipbag.bag_directory))
Example #11
0
 def test_extended_bag_creation(self):
     # Constructing a BagIt on test/newtestbag must leave the bag directory
     # and each of the listed members on disk.
     def bag_path(*parts):
         return os.path.join(os.getcwd(), "test", "newtestbag", *parts)

     # The object itself is not inspected; only what it creates on disk is.
     created_bag = BagIt(bag_path())

     self.assertTrue(os.path.exists(bag_path()))
     expected_members = (
         "bagit.txt",
         "manifest-sha1.txt",
         "data",
         "bag-info.txt",
         "fetch.txt",
         "tagmanifest-sha1.txt",
     )
     for member in expected_members:
         self.assertTrue(os.path.exists(bag_path(member)))
Example #12
0
 def test_extended_bag_creation(self):
     # A BagIt built on a new path is expected to create the directory
     # together with its tag files, manifests and data folder.
     def inside_bag(*tail):
         return os.path.join(os.getcwd(), 'test', 'newtestbag', *tail)

     # Only the on-disk result is checked, not the object.
     new_bag = BagIt(inside_bag())

     # Checked in this order: the bag directory first, then each member.
     required = [
         (),
         ('bagit.txt',),
         ('manifest-sha1.txt',),
         ('data',),
         ('bag-info.txt',),
         ('fetch.txt',),
         ('tagmanifest-sha1.txt',),
     ]
     for tail in required:
         present = os.path.exists(inside_bag(*tail))
         self.assertTrue(present)
Example #13
0
class CompressTest(unittest.TestCase):

    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))

    def tearDown(self):
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag.tgz')):
            os.remove(os.path.join(os.getcwd(), 'test', 'testbag.tgz'))
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag.zip')):
            os.remove(os.path.join(os.getcwd(), 'test', 'testbag.zip'))
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'newzipbag')):
            shutil.rmtree(os.path.join(os.getcwd(), 'test', 'newzipbag'))
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'newtgzbag')):
            shutil.rmtree(os.path.join(os.getcwd(), 'test', 'newtgzbag'))
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'newzipbag.zip')):
            os.remove(os.path.join(os.getcwd(), 'test', 'newzipbag.zip'))
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'newtgzbag.tgz')):
            os.remove(os.path.join(os.getcwd(), 'test', 'newtgzbag.tgz'))

    def test_compress_tgz(self):
        self.bag.package(os.path.join(os.getcwd(), 'test'))
        self.assertTrue(os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag.tgz')))

    def test_compress_zip(self):
        self.bag.package(os.path.join(os.getcwd(), 'test'), method='zip')
        self.assertTrue(os.path.exists(os.path.join(os.getcwd(), 'test', 'testbag.zip')))

    def test_uncompress_tgz(self):
        # create an empty tgz bag.
        newbag = BagIt(os.path.join(os.getcwd(), 'test', 'newtgzbag'))
        newbag.package(os.path.join(os.getcwd(), 'test'))
        # remove the created bag directory
        shutil.rmtree(os.path.join(os.getcwd(), 'test', 'newtgzbag'))
        # this should leave us with just newtgzbag.tgz
        tgzbag = BagIt(os.path.join(os.getcwd(), 'test', 'newtgzbag.tgz'))
        self.assertTrue(os.path.exists(tgzbag.bag_directory))

    def test_uncompress_zip(self):
        # create an empty zip bag.
        newbag = BagIt(os.path.join(os.getcwd(), 'test', 'newzipbag'))
        newbag.package(os.path.join(os.getcwd(), 'test'), method='zip')
        # remove the created bag directory
        shutil.rmtree(os.path.join(os.getcwd(), 'test', 'newzipbag'))
        # this should leave us with just newtgzbag.tgz
        zipbag = BagIt(os.path.join(os.getcwd(), 'test', 'newzipbag.zip'))
        self.assertTrue(os.path.exists(zipbag.bag_directory))
Example #14
0
class VersionTest(unittest.TestCase):
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))

    def tearDown(self):
        pass

    def test_versions(self):
        self.assertEquals(self.bag.bag_major_version, 0)
        self.assertEquals(self.bag.bag_minor_version, 96)
        binfo = self.bag.get_bag_info()
        self.assertEquals(binfo['version'], '0.96')
        self.assertEquals(binfo['encoding'], 'utf-8')
        self.assertEquals(binfo['hash'], 'sha1')
Example #15
0
class VersionTest(unittest.TestCase):
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), "test", "testbag"))

    def tearDown(self):
        pass

    def test_versions(self):
        self.assertEqual(self.bag.bag_major_version, 0)
        self.assertEqual(self.bag.bag_minor_version, 96)
        binfo = self.bag.get_bag_info()
        self.assertEqual(binfo["version"], "0.96")
        self.assertEqual(binfo["encoding"], "utf-8")
        self.assertEqual(binfo["hash"], "sha1")
Example #16
0
    def run(self, package_id, *args, **kwargs):
        """Build, validate and zip the BagIt bag for one results package.

        Loads the ``ResultsPackage`` with primary key ``package_id``, creates
        one directory per page inside the bag's data directory, adds the
        selected run jobs to the bag through ``_add_result_to_bag``, then
        updates, validates and zips the bag and marks the package
        ``COMPLETE``. Progress is reported through ``_update_progress``.

        Returns ``None``; returns early without packaging when the stored
        status equals ``RunJobStatus.CANCELLED``.

        Raises:
            BagNotValidError: if the bag is not valid after validation. The
                package is saved with status ``FAILED`` before raising.
        """
        resultspackage = ResultsPackage.objects.get(pk=package_id)
        # NOTE(review): this compares against RunJobStatus while every other
        # status in this method uses ResultsPackageStatus -- confirm that the
        # two CANCELLED values are meant to coincide.
        if resultspackage.status == RunJobStatus.CANCELLED:
            return

        resultspackage.status = ResultsPackageStatus.PROCESSING
        resultspackage.save()

        runjobs = resultspackage.workflow_run.run_jobs.select_related(
            'page', 'job').all()

        # No pages selected on the package: use every page that has a run job.
        if not resultspackage.pages.exists():
            pages = set()
            for runjob in runjobs:
                pages.add(runjob.page)
        else:
            pages = resultspackage.pages.all()

        jobs = resultspackage.jobs.all()
        self.package_path = resultspackage.package_path

        # The chunks are intervals used to update the percent_completed field.
        # The per-page loop shares out 70 percent; 100 is set at the very end.
        if len(pages) > 0:
            page_chunk = 70.00 / len(pages)
        completed = 0.0

        bag = BagIt(resultspackage.bag_path)

        for page in pages:
            page_dir = os.path.join(bag.data_directory, page.name)
            os.makedirs(page_dir)
            page_runjobs = runjobs.filter(page=page)

            if not jobs:
                # If no jobs are provided, we will just make a list of jobs from the available runjobs.
                # NOTE(review): once this list is non-empty, later pages take
                # the else branch below -- confirm that this is intended.
                jobs = []
                if len(page_runjobs) > 0:
                    runjob_chunk = page_chunk / len(page_runjobs)

                for runjob in page_runjobs:
                    _add_result_to_bag(page_dir, runjob, bag)

                    completed += runjob_chunk
                    _ensure_db_state(resultspackage)
                    _update_progress(resultspackage, completed)

                    if runjob.workflow_job.job not in jobs:
                        jobs.append(runjob.workflow_job.job)

            else:
                if len(jobs) > 0:
                    job_chunk = page_chunk / len(jobs)

                for job in jobs:
                    matching_runjobs = page_runjobs.filter(
                        workflow_job__job=job)
                    if len(matching_runjobs) > 0:
                        runjob_chunk = job_chunk / len(matching_runjobs)

                    for runjob in matching_runjobs:
                        _add_result_to_bag(page_dir, runjob, bag)

                        completed += runjob_chunk
                        _ensure_db_state(resultspackage)
                        _update_progress(resultspackage, completed)

        bag.update()
        errors = bag.validate()
        # is_valid is a method: it has to be called. The bare attribute is a
        # bound method object, which is always truthy, so the failure branch
        # could never be taken.
        if not bag.is_valid():
            _ensure_db_state(resultspackage)
            resultspackage.status = ResultsPackageStatus.FAILED
            resultspackage.save()
            raise BagNotValidError("The bag failed validation.\n" +
                                   str(errors))

        bag.package(resultspackage.package_path, method='zip')
        resultspackage.download_url = resultspackage.file_url
        resultspackage.percent_completed = 100
        resultspackage.status = ResultsPackageStatus.COMPLETE

        # If pages and jobs were not provided, we populate these fields now
        # since we have figured them out.
        resultspackage.pages = pages
        resultspackage.jobs = jobs

        _ensure_db_state(resultspackage)
        resultspackage.save()
        # The unpacked bag directory is removed once the package is saved.
        shutil.rmtree(resultspackage.bag_path)
Example #17
0
 def test_unicode_characters_in_bagnam(self):
     # A bag whose directory name contains a non-ASCII character must still
     # end up on disk.
     def unicode_bag_dir():
         return os.path.join(os.getcwd(), 'test', 'tëst')

     # Only the on-disk result is checked, not the object.
     unicode_bag = BagIt(unicode_bag_dir())
     directory_present = os.path.exists(unicode_bag_dir())
     self.assertTrue(directory_present)
Example #18
0
 def setUp(self):
     """Create the bag under test and the fetch entry list used by the tests."""
     fixture_dir = os.path.join(os.getcwd(), 'test', 'testbag')
     self.bag = BagIt(fixture_dir)

     # One entry: file name, length (u'-') and source URL.
     entry = {}
     entry['filename'] = u'data/bagitspec.pdf'
     entry['length'] = u'-'
     entry['url'] = u'http://www.digitalpreservation.gov/documents/bagitspec.pdf'
     self.test_fetch_contents = [entry]
Example #19
0
class ManifestTest(unittest.TestCase):
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))

    def set_hash_md5(self):
        self.bag.set_hash_encoding('md5')
        self.assertEquals(self.hash_encoding, u'md5')

    def set_hash_sha1(self):
        self.bag.set_hash_encoding('sha1')
        self.assertEquals(self.hash_encoding, u'sha1')

    def test_sha1(self):
        self.bag.set_hash_encoding('sha1')
        self.bag.update()
        self.assertEquals(self.bag.manifest_contents[os.path.join('data', 'subdir', 'subsubdir', 'angry.jpg')],
                u'c5913ae67aa40398f1182e52d2fa2c2e4c08f696')

    def test_md5(self):
        self.bag.set_hash_encoding('md5')
        self.bag.update()
        self.assertEquals(self.bag.manifest_contents[os.path.join('data', 'subdir', 'subsubdir', 'angry.jpg')],
                '5f294603675cb6c0f83cef9316bb5be7')

    def test_sha1_manifest(self):
        self.bag.set_hash_encoding('sha1')
        self.bag.update()
        self.assertEquals(os.path.basename(self.bag.manifest_file),
                'manifest-sha1.txt')

    def test_md5_manifest(self):
        self.bag.set_hash_encoding('md5')
        self.bag.update()
        self.assertEquals(os.path.basename(self.bag.manifest_file),
                'manifest-md5.txt')
Example #20
0
 def setUp(self):
     """Announce the set-up on stdout and create the fixture bag."""
     # Call form of print: the statement form is a SyntaxError on Python 3,
     # while this single-argument call prints the same text on Python 2.
     print("Setting up version.")
     self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))
Example #21
0
class ManifestTest(unittest.TestCase):
    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))

    def set_hash_md5(self):
        self.bag.set_hash_encoding('md5')
        self.assertEquals(self.hash_encoding, u'md5')

    def set_hash_sha1(self):
        self.bag.set_hash_encoding('sha1')
        self.assertEquals(self.hash_encoding, u'sha1')

    def test_sha1(self):
        self.bag.set_hash_encoding('sha1')
        self.bag.update()
        self.assertEquals(
            self.bag.manifest_contents[os.path.join('data', 'subdir',
                                                    'subsubdir', 'angry.jpg')],
            u'c5913ae67aa40398f1182e52d2fa2c2e4c08f696')

    def test_md5(self):
        self.bag.set_hash_encoding('md5')
        self.bag.update()
        self.assertEquals(
            self.bag.manifest_contents[os.path.join('data', 'subdir',
                                                    'subsubdir', 'angry.jpg')],
            '5f294603675cb6c0f83cef9316bb5be7')

    def test_sha1_manifest(self):
        self.bag.set_hash_encoding('sha1')
        self.bag.update()
        self.assertEquals(os.path.basename(self.bag.manifest_file),
                          'manifest-sha1.txt')

    def test_md5_manifest(self):
        self.bag.set_hash_encoding('md5')
        self.bag.update()
        self.assertEquals(os.path.basename(self.bag.manifest_file),
                          'manifest-md5.txt')
Example #22
0
File: core.py Project: DDMAL/Rodan
    def run(self, rp_id):
        """Package the outputs of a workflow run into a zipped BagIt bag.

        Marks the ``ResultsPackage`` with uuid ``rp_id`` as PROCESSING, copies
        output files into a temporary bag directory laid out according to
        ``packaging_mode``, then updates, validates and zips the bag into the
        directory of ``get_package_path(rp_id)``. On success the package is
        marked FINISHED and, when it has an expiry time, the
        ``rodan.core.expire_package`` task is scheduled for that time.

        Packaging modes:
            0 -- endpoint outputs only, one directory per
                 "<job name> - <output port type name>".
            1 -- every output, one directory per resource name; failed run
                 jobs get an ERROR.txt with the error summary and details.
            2 -- not implemented (raises ``NotImplementedError``).

        Returns:
            ``True`` when the package was produced; ``False`` when the bag
            failed validation (the package is then left with status FAILED).

        Raises:
            NotImplementedError: for mode 2.
            ValueError: for any other unknown mode.
        """
        rp_query = ResultsPackage.objects.filter(uuid=rp_id)
        rp_query.update(status=task_status.PROCESSING, celery_task_id=self.request.id)
        rp = rp_query.first()
        mode = rp.packaging_mode
        package_path = get_package_path(rp_id)

        # is_endpoint is True for an output whose resource (or resource list)
        # either has no inputs at all or is matched by the negated
        # "inputs belong to a run job of this workflow run" condition.
        output_objs = Output.objects.filter(
            run_job__workflow_run=rp.workflow_run
        ).select_related(
            'resource', 'resource__resource_type', 'resource_list', 'run_job'
        ).prefetch_related(
            'resource_list__resources'
        ).annotate(
            is_endpoint=Case(
                When(
                    condition=(
                        Q(resource__isnull=False)
                        & (
                            Q(resource__inputs__isnull=True)
                            | ~Q(resource__inputs__run_job__workflow_run=rp.workflow_run)
                        )
                    ) | (
                        Q(resource_list__isnull=False)
                        & (
                            Q(resource_list__inputs__isnull=True)
                            | ~Q(resource_list__inputs__run_job__workflow_run=rp.workflow_run)
                        )
                    ),
                    then=Value(True)
                ),
                default=Value(False),
                output_field=BooleanField()
            )
        )

        # The copy loop shares out 70 percent of the progress bar; 100 is set
        # at the very end.
        if len(output_objs) > 0:
            percentage_increment = 70.00 / len(output_objs)
        else:
            percentage_increment = 0
        completed = 0.0

        with TemporaryDirectory() as td:
            tmp_dir = os.path.join(td, rp_id)  # because rp_id will be name of the packaged zip
            bag = BagIt(tmp_dir)

            job_namefinder = self._NameFinder()
            res_namefinder = self._NameFinder()

            for output in output_objs:
                if mode == 0:  # only endpoint resources, subdirectoried by different outputs
                    # continue if not endpoint output
                    if output.is_endpoint is False:
                        continue

                    j_name = job_namefinder.find(output.run_job.workflow_job_id, output.run_job.job_name)
                    opt_name = output.output_port_type_name
                    op_dir = os.path.join(tmp_dir, "{0} - {1}".format(j_name, opt_name))

                    rj_status = output.run_job.status
                    if rj_status == task_status.FINISHED:
                        if output.resource is not None:
                            filepath = output.resource.resource_file.path
                            ext = os.path.splitext(filepath)[1]

                            res_name = res_namefinder.find(output.resource_id, output.resource.name)  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                            result_filename = "{0}{1}".format(res_name, ext)
                            if not os.path.exists(op_dir):
                                os.makedirs(op_dir)
                            shutil.copyfile(filepath, os.path.join(op_dir, result_filename))
                        elif output.resource_list is not None:
                            res_name = res_namefinder.find(output.resource_list_id, output.resource_list.name)  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                            result_foldername = "{0}.list".format(res_name)
                            result_folder = os.path.join(op_dir, result_foldername)
                            if not os.path.exists(result_folder):
                                os.makedirs(result_folder)

                            # List members are named by their zero-padded
                            # index; the pad width is the number of digits
                            # of the member count.
                            cnt = output.resource_list.resources.count()
                            zfills = len(str(cnt))
                            for idx, r in enumerate(output.resource_list.resources.all()):
                                filepath = r.resource_file.path
                                ext = os.path.splitext(filepath)[1]
                                new_filename = "{0}{1}".format(str(idx).zfill(zfills), ext)
                                shutil.copyfile(filepath, os.path.join(result_folder, new_filename))

                elif mode == 1:
                    # NOTE(review): output.resource is dereferenced here before
                    # the "is not None" check below; an output that only has a
                    # resource_list would fail on this line -- confirm.
                    res_name = res_namefinder.find(output.resource_id, output.resource.name)  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                    res_dir = os.path.join(tmp_dir, res_name)

                    j_name = job_namefinder.find(output.run_job.workflow_job_id, output.run_job.job_name)
                    opt_name = output.output_port_type_name

                    rj_status = output.run_job.status
                    if rj_status == task_status.FINISHED:
                        if output.resource is not None:
                            filepath = output.resource.resource_file.path
                            ext = os.path.splitext(filepath)[1]
                            result_filename = "{0} - {1}{2}".format(j_name, opt_name, ext)
                            if not os.path.exists(res_dir):
                                os.makedirs(res_dir)
                            shutil.copyfile(filepath, os.path.join(res_dir, result_filename))
                        elif output.resource_list is not None:
                            result_foldername = "{0} - {1}.list".format(j_name, opt_name)
                            result_folder = os.path.join(res_dir, result_foldername)
                            if not os.path.exists(result_folder):
                                os.makedirs(result_folder)

                            cnt = output.resource_list.resources.count()
                            zfills = len(str(cnt))
                            for idx, r in enumerate(output.resource_list.resources.all()):
                                filepath = r.resource_file.path
                                ext = os.path.splitext(filepath)[1]
                                new_filename = "{0}{1}".format(str(idx).zfill(zfills), ext)
                                shutil.copyfile(filepath, os.path.join(result_folder, new_filename))

                    elif rj_status == task_status.FAILED:
                        # A failed run job contributes a text file describing
                        # the error instead of a result file.
                        result_filename = "{0} - {1} - ERROR.txt".format(j_name, opt_name)
                        if not os.path.exists(res_dir):
                            os.makedirs(res_dir)
                        with open(os.path.join(res_dir, result_filename), 'w') as f:
                            f.write("Error Summary: ")
                            f.write(output.run_job.error_summary)
                            f.write("\n\nError Details:\n")
                            f.write(output.run_job.error_details)
                elif mode == 2:
                    raise NotImplementedError() # [TODO]
                else:
                    raise ValueError("mode {0} is not supported".format(mode))

                completed += percentage_increment
                rp_query.update(percent_completed=int(completed))

            bag.update()
            errors = bag.validate()
            # is_valid is a method: it has to be called. The bare attribute is
            # a bound method object, which is always truthy, so this branch
            # could never be taken.
            if not bag.is_valid():
                rp_query.update(status=task_status.FAILED,
                                error_summary="The bag failed validation.",
                                error_details=str(errors))
                # Stop here: carrying on would zip the invalid bag and
                # overwrite the FAILED status with FINISHED below.
                # NOTE(review): celery_task_id is left as set at the top of
                # this method -- confirm whether it should be cleared.
                return False

            target_dir_name = os.path.dirname(package_path)
            if not os.path.isdir(target_dir_name):
                os.makedirs(target_dir_name)
            bag.package(target_dir_name, method='zip')


        rp_query.update(status=task_status.FINISHED,
                        percent_completed=100)
        expiry_time = rp_query.values_list('expiry_time', flat=True)[0]
        if expiry_time:
            # Schedule the expiry task and remember its id on the package.
            async_task = registry.tasks['rodan.core.expire_package'].apply_async((rp_id, ), eta=expiry_time)
            expire_task_id = async_task.task_id
        else:
            expire_task_id = None

        rp_query.update(celery_task_id=expire_task_id)
        return True
Example #23
0
    def run(self, package_id, *args, **kwargs):
        """Build, validate and zip the BagIt bag for one results package.

        Every selected run job result is added to a per-page directory inside
        the bag's data directory. 70% of ``percent_completed`` is spread evenly
        over the pages (and, within a page, over its jobs/run jobs); the
        remaining progress is set to 100 once the bag has been packaged.

        Args:
            package_id: Primary key of the ``ResultsPackage`` to build.
            *args: Unused.
            **kwargs: Unused.

        Returns:
            None. Returns early, without doing any work, if the package status
            equals ``RunJobStatus.CANCELLED``.

        Raises:
            BagNotValidError: If the bag reports itself invalid after
                ``bag.validate()``; the package is marked FAILED first.
        """
        resultspackage = ResultsPackage.objects.get(pk=package_id)
        # NOTE(review): the status is compared against RunJobStatus rather than
        # ResultsPackageStatus -- confirm both enums share the CANCELLED value.
        if resultspackage.status == RunJobStatus.CANCELLED:
            return

        resultspackage.status = ResultsPackageStatus.PROCESSING
        resultspackage.save()

        runjobs = resultspackage.workflow_run.run_jobs.select_related('page', 'job').all()

        # When the package does not name any pages, use every page that has a
        # run job in the workflow run.
        if not resultspackage.pages.exists():
            pages = set()
            for runjob in runjobs:
                pages.add(runjob.page)
        else:
            pages = resultspackage.pages.all()

        jobs = resultspackage.jobs.all()
        self.package_path = resultspackage.package_path

        # The chunks are intervals used to update the percent_completed field.
        if len(pages) > 0:
            page_chunk = 70.00 / len(pages)
        completed = 0.0

        bag = BagIt(resultspackage.bag_path)

        for page in pages:
            page_dir = os.path.join(bag.data_directory, page.name)
            os.makedirs(page_dir)
            page_runjobs = runjobs.filter(page=page)

            if not jobs:
                # If no jobs are provided, we will just make a list of jobs from the available runjobs.
                jobs = []
                if len(page_runjobs) > 0:
                    runjob_chunk = page_chunk / len(page_runjobs)

                for runjob in page_runjobs:
                    _add_result_to_bag(page_dir, runjob, bag)

                    completed += runjob_chunk
                    _ensure_db_state(resultspackage)
                    _update_progress(resultspackage, completed)

                    if runjob.workflow_job.job not in jobs:
                        jobs.append(runjob.workflow_job.job)

            else:
                if len(jobs) > 0:
                    job_chunk = page_chunk / len(jobs)

                for job in jobs:
                    matching_runjobs = page_runjobs.filter(workflow_job__job=job)
                    if len(matching_runjobs) > 0:
                        runjob_chunk = job_chunk / len(matching_runjobs)

                    for runjob in matching_runjobs:
                        _add_result_to_bag(page_dir, runjob, bag)

                        completed += runjob_chunk
                        _ensure_db_state(resultspackage)
                        _update_progress(resultspackage, completed)

        bag.update()
        errors = bag.validate()
        # is_valid is a method: without the call the bound-method object is
        # always truthy and the failure branch below could never run.
        if not bag.is_valid():
            _ensure_db_state(resultspackage)
            resultspackage.status = ResultsPackageStatus.FAILED
            resultspackage.save()
            raise BagNotValidError("The bag failed validation.\n" + str(errors))

        bag.package(resultspackage.package_path, method='zip')
        resultspackage.download_url = resultspackage.file_url
        resultspackage.percent_completed = 100
        resultspackage.status = ResultsPackageStatus.COMPLETE

        # If pages and jobs were not provided, we populate these fields now
        # since we have figured them out.
        resultspackage.pages = pages
        resultspackage.jobs = jobs

        _ensure_db_state(resultspackage)
        resultspackage.save()
        # The unpacked bag directory is no longer needed once it has been zipped.
        shutil.rmtree(resultspackage.bag_path)
Example #24
0
 def setUp(self):
     """Announce the fixture setup and wrap the two test directories in BagIt objects.

     ``self.bag`` points at ``test/testbag`` and ``self.invalid_bag`` at
     ``test/invalid_bag``, both relative to the current working directory.
     """
     # Function-call form: a SyntaxError-free equivalent of the Python 2
     # print statement that emits the same text on Python 2 and Python 3.
     print("Setting up update.")
     self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))
     self.invalid_bag = BagIt(os.path.join(os.getcwd(), 'test', 'invalid_bag'))
Example #25
0
    def run(self, rp_id):
        """Collect the outputs of a workflow run into a zipped BagIt package.

        The results package identified by ``rp_id`` is marked PROCESSING, its
        outputs are copied into a temporary bag directory laid out according
        to the package's ``packaging_mode``, and the bag is updated, validated
        and packaged as a zip next to ``get_package_path(rp_id)``.

        Packaging modes:
            * ``0`` -- only outputs annotated ``is_endpoint`` are copied, into
              one directory per "<job name> - <output port type name>".
            * ``1`` -- one directory per resource name, with files named
              "<job name> - <output port type name>"; a failed run job gets
              an ``... - ERROR.txt`` file holding its error summary/details.
            * ``2`` -- not implemented.

        70% of ``percent_completed`` is spread evenly over the outputs; it is
        set to 100 once the package has been written.

        Args:
            rp_id: UUID of the ``ResultsPackage``; also used as the name of the
                temporary bag directory.

        Returns:
            True when the package was built. False when the bag failed
            validation, in which case the package is left in FAILED state with
            the validation errors recorded and no zip is produced.

        Raises:
            NotImplementedError: If the packaging mode is 2.
            ValueError: If the packaging mode is not 0, 1 or 2.
        """
        rp_query = ResultsPackage.objects.filter(uuid=rp_id)
        rp_query.update(status=task_status.PROCESSING,
                        celery_task_id=self.request.id)
        rp = rp_query.first()
        mode = rp.packaging_mode
        package_path = get_package_path(rp_id)

        # Annotate each output with is_endpoint: True when its resource (or
        # resource list) exists and either has no inputs at all or matches the
        # negated "input of a run job in this workflow run" filter.
        output_objs = (
            Output.objects.filter(
                run_job__workflow_run=rp.workflow_run).select_related(
                    "resource", "resource__resource_type", "resource_list",
                    "run_job").prefetch_related("resource_list__resources").
            annotate(is_endpoint=Case(
                When(
                    condition=(
                        Q(resource__isnull=False)
                        & (Q(resource__inputs__isnull=True)
                           | ~Q(resource__inputs__run_job__workflow_run=rp.
                                workflow_run)))
                    | (Q(resource_list__isnull=False)
                       & (Q(resource_list__inputs__isnull=True)
                          | ~Q(resource_list__inputs__run_job__workflow_run=rp.
                               workflow_run))),
                    then=Value(True),
                ),
                default=Value(False),
                output_field=BooleanField(),
            )))

        if len(output_objs) > 0:
            percentage_increment = 70.00 / len(output_objs)
        else:
            percentage_increment = 0
        completed = 0.0

        with TemporaryDirectory() as td:
            tmp_dir = os.path.join(
                td, rp_id)  # because rp_id will be name of the packaged zip
            bag = BagIt(tmp_dir)

            job_namefinder = self._NameFinder()
            res_namefinder = self._NameFinder()

            for output in output_objs:
                if mode == 0:
                    # only endpoint resources, subdirectoried by different outputs
                    # continue if not endpoint output
                    # (skipped outputs do not advance percent_completed)
                    if output.is_endpoint is False:
                        continue

                    j_name = job_namefinder.find(
                        output.run_job.workflow_job_id,
                        output.run_job.job_name)
                    opt_name = output.output_port_type_name
                    op_dir = os.path.join(tmp_dir,
                                          "{0} - {1}".format(j_name, opt_name))

                    rj_status = output.run_job.status
                    if rj_status == task_status.FINISHED:
                        if output.resource is not None:
                            filepath = output.resource.resource_file.path
                            ext = os.path.splitext(filepath)[1]

                            res_name = res_namefinder.find(
                                output.resource_id, output.resource.name
                            )  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                            result_filename = "{0}{1}".format(res_name, ext)
                            if not os.path.exists(op_dir):
                                os.makedirs(op_dir)
                            shutil.copyfile(
                                filepath, os.path.join(op_dir,
                                                       result_filename))
                        elif output.resource_list is not None:
                            res_name = res_namefinder.find(
                                output.resource_list_id,
                                output.resource_list.name
                            )  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                            result_foldername = "{0}.list".format(res_name)
                            result_folder = os.path.join(
                                op_dir, result_foldername)
                            if not os.path.exists(result_folder):
                                os.makedirs(result_folder)

                            # Zero-pad the index so list members sort in order.
                            cnt = output.resource_list.resources.count()
                            zfills = len(str(cnt))
                            for idx, r in enumerate(
                                    output.resource_list.resources.all()):
                                filepath = r.resource_file.path
                                ext = os.path.splitext(filepath)[1]
                                new_filename = "{0}{1}".format(
                                    str(idx).zfill(zfills), ext)
                                shutil.copyfile(
                                    filepath,
                                    os.path.join(result_folder, new_filename))

                elif mode == 1:
                    # NOTE(review): output.resource is dereferenced here before
                    # the None check below, so an output that only carries a
                    # resource_list would raise AttributeError -- confirm the
                    # intended directory name for resource lists.
                    res_name = res_namefinder.find(
                        output.resource_id, output.resource.name
                    )  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                    res_dir = os.path.join(tmp_dir, res_name)

                    j_name = job_namefinder.find(
                        output.run_job.workflow_job_id,
                        output.run_job.job_name)
                    opt_name = output.output_port_type_name

                    rj_status = output.run_job.status
                    if rj_status == task_status.FINISHED:
                        if output.resource is not None:
                            filepath = output.resource.resource_file.path
                            ext = os.path.splitext(filepath)[1]
                            result_filename = "{0} - {1}{2}".format(
                                j_name, opt_name, ext)
                            if not os.path.exists(res_dir):
                                os.makedirs(res_dir)
                            shutil.copyfile(
                                filepath, os.path.join(res_dir,
                                                       result_filename))
                        elif output.resource_list is not None:
                            result_foldername = "{0} - {1}.list".format(
                                j_name, opt_name)
                            result_folder = os.path.join(
                                res_dir, result_foldername)
                            if not os.path.exists(result_folder):
                                os.makedirs(result_folder)

                            # Zero-pad the index so list members sort in order.
                            cnt = output.resource_list.resources.count()
                            zfills = len(str(cnt))
                            for idx, r in enumerate(
                                    output.resource_list.resources.all()):
                                filepath = r.resource_file.path
                                ext = os.path.splitext(filepath)[1]
                                new_filename = "{0}{1}".format(
                                    str(idx).zfill(zfills), ext)
                                shutil.copyfile(
                                    filepath,
                                    os.path.join(result_folder, new_filename))

                    elif rj_status == task_status.FAILED:
                        # Record the failure in place of the missing result.
                        result_filename = "{0} - {1} - ERROR.txt".format(
                            j_name, opt_name)
                        if not os.path.exists(res_dir):
                            os.makedirs(res_dir)
                        with open(os.path.join(res_dir, result_filename),
                                  "w") as f:
                            f.write("Error Summary: ")
                            f.write(output.run_job.error_summary)
                            f.write("\n\nError Details:\n")
                            f.write(output.run_job.error_details)
                elif mode == 2:
                    raise NotImplementedError()  # [TODO]
                else:
                    raise ValueError("mode {0} is not supported".format(mode))

                completed += percentage_increment
                rp_query.update(percent_completed=int(completed))

            # print([os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(tmp_dir)) for f in fn])   # DEBUG
            bag.update()
            errors = bag.validate()
            # is_valid is a method: without the call the bound-method object is
            # always truthy and a validation failure was never detected.
            if not bag.is_valid():
                rp_query.update(
                    status=task_status.FAILED,
                    error_summary="The bag failed validation.",
                    error_details=str(errors),
                )
                # Stop here so the FAILED status is not overwritten by the
                # FINISHED update below and no invalid package is published.
                return False

            target_dir_name = os.path.dirname(package_path)
            if not os.path.isdir(target_dir_name):
                os.makedirs(target_dir_name)
            bag.package(target_dir_name, method="zip")

        rp_query.update(status=task_status.FINISHED, percent_completed=100)
        expiry_time = rp_query.values_list("expiry_time", flat=True)[0]
        if expiry_time:
            # Schedule the package's expiry task to run at its expiry time.
            async_task = registry.tasks[
                "rodan.core.expire_package"].apply_async((rp_id, ),
                                                         eta=expiry_time,
                                                         queue="celery")
            expire_task_id = async_task.task_id
        else:
            expire_task_id = None

        rp_query.update(celery_task_id=expire_task_id)
        return True
Example #26
0
 def setUp(self):
     """Create ``self.bag`` for the ``test/testbag`` directory under the cwd."""
     bag_path = os.path.join(os.getcwd(), 'test', 'testbag')
     self.bag = BagIt(bag_path)
Example #27
0
 def setUp(self):
     """Point ``self.bag`` at ``<cwd>/test/testbag`` before each test."""
     working_dir = os.getcwd()
     testbag_dir = os.path.join(working_dir, 'test', 'testbag')
     self.bag = BagIt(testbag_dir)
Example #28
0
class ManifestTest(unittest.TestCase):
    """Checks on the manifest produced for each supported hash encoding.

    Every test works on the ``test/testbag`` directory under the current
    working directory.
    """

    def setUp(self):
        """Create the BagIt object for the ``test/testbag`` fixture."""
        self.bag = BagIt(os.path.join(os.getcwd(), "test", "testbag"))

    # NOTE(review): the two ``set_hash_*`` methods lack the ``test`` prefix, so
    # unittest does not collect them automatically -- confirm whether they are
    # meant to run as tests.
    def set_hash_md5(self):
        """Selecting md5 is reflected in the bag's hash encoding."""
        self.bag.set_hash_encoding("md5")
        # The encoding lives on the bag, not on the TestCase instance.
        self.assertEqual(self.bag.hash_encoding, "md5")

    def set_hash_sha1(self):
        """Selecting sha1 is reflected in the bag's hash encoding."""
        self.bag.set_hash_encoding("sha1")
        # The encoding lives on the bag, not on the TestCase instance.
        self.assertEqual(self.bag.hash_encoding, "sha1")

    def test_sha1(self):
        """The sha1 manifest entry for angry.jpg matches the known digest."""
        self.bag.set_hash_encoding("sha1")
        self.bag.update()
        self.assertEqual(
            self.bag.manifest_contents[os.path.join("data", "subdir",
                                                    "subsubdir", "angry.jpg")],
            "c5913ae67aa40398f1182e52d2fa2c2e4c08f696",
        )

    def test_md5(self):
        """The md5 manifest entry for angry.jpg matches the known digest."""
        self.bag.set_hash_encoding("md5")
        self.bag.update()
        self.assertEqual(
            self.bag.manifest_contents[os.path.join("data", "subdir",
                                                    "subsubdir", "angry.jpg")],
            "5f294603675cb6c0f83cef9316bb5be7",
        )

    def test_sha1_manifest(self):
        """With sha1 selected the manifest file is named manifest-sha1.txt."""
        self.bag.set_hash_encoding("sha1")
        self.bag.update()
        self.assertEqual(os.path.basename(self.bag.manifest_file),
                         "manifest-sha1.txt")

    def test_md5_manifest(self):
        """With md5 selected the manifest file is named manifest-md5.txt."""
        self.bag.set_hash_encoding("md5")
        self.bag.update()
        self.assertEqual(os.path.basename(self.bag.manifest_file),
                         "manifest-md5.txt")
Example #29
0
 def setUp(self):
     """Build the BagIt object for the ``test/testbag`` directory under the cwd."""
     path_parts = (os.getcwd(), "test", "testbag")
     self.bag = BagIt(os.path.join(*path_parts))
Example #30
0
 def setUp(self):
     """Announce the fixture setup and create ``self.bag`` for ``test/testbag``."""
     # Function-call form: a SyntaxError-free equivalent of the Python 2
     # print statement that emits the same text on Python 2 and Python 3.
     print("setting up manifest.")
     self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))