Ejemplo n.º 1
0
 def test_uncompress_zip(self):
     """A bag re-opened from its zip package reports an existing bag_directory."""

     def under_test(*parts):
         # Resolved against the working directory at call time.
         return os.path.join(os.getcwd(), "test", *parts)

     # Build a new bag and package it as a zip inside the test directory.
     fresh_bag = BagIt(under_test("newzipbag"))
     fresh_bag.package(under_test(), method="zip")
     # Delete the bag directory; only newzipbag.zip should be left behind.
     shutil.rmtree(under_test("newzipbag"))
     reopened = BagIt(under_test("newzipbag.zip"))
     self.assertTrue(os.path.exists(reopened.bag_directory))
Ejemplo n.º 2
0
 def test_uncompress_tgz(self):
     """A bag re-opened from its .tgz package reports an existing bag_directory."""

     def under_test(*parts):
         # Resolved against the working directory at call time.
         return os.path.join(os.getcwd(), 'test', *parts)

     # Build a new bag and package it with package()'s default method
     # (the test expects that to produce newtgzbag.tgz).
     fresh_bag = BagIt(under_test('newtgzbag'))
     fresh_bag.package(under_test())
     # Delete the bag directory; only newtgzbag.tgz should be left behind.
     shutil.rmtree(under_test('newtgzbag'))
     reopened = BagIt(under_test('newtgzbag.tgz'))
     self.assertTrue(os.path.exists(reopened.bag_directory))
Ejemplo n.º 3
0
 def test_extended_bag_creation(self):
     """A newly created bag has its directory and the expected members on disk."""
     created = BagIt(os.path.join(os.getcwd(), 'test', 'newtestbag'))
     # The bag directory itself is checked first.
     self.assertTrue(
         os.path.exists(os.path.join(os.getcwd(), 'test', 'newtestbag')))
     # Then every expected member of the bag, checked in this order.
     expected_members = (
         'bagit.txt',
         'manifest-sha1.txt',
         'data',
         'bag-info.txt',
         'fetch.txt',
         'tagmanifest-sha1.txt',
     )
     for member in expected_members:
         member_path = os.path.join(os.getcwd(), 'test', 'newtestbag',
                                    member)
         self.assertTrue(os.path.exists(member_path))
Ejemplo n.º 4
0
 def test_extended_bag_creation(self):
     """Creating a bag writes the bag directory and each expected member."""

     def in_bag(*parts):
         # Resolved against the working directory at call time.
         return os.path.join(os.getcwd(), "test", "newtestbag", *parts)

     created = BagIt(in_bag())
     # The bag directory comes first, then its members in a fixed order.
     self.assertTrue(os.path.exists(in_bag()))
     for member in (
         "bagit.txt",
         "manifest-sha1.txt",
         "data",
         "bag-info.txt",
         "fetch.txt",
         "tagmanifest-sha1.txt",
     ):
         self.assertTrue(os.path.exists(in_bag(member)))
Ejemplo n.º 5
0
 def setUp(self):
     """Open the ``test/testbag`` fixture and store a reference fetch entry."""
     fixture_path = os.path.join(os.getcwd(), 'test', 'testbag')
     self.bag = BagIt(fixture_path)
     # Single fetch record kept on the test case as test_fetch_contents.
     fetch_entry = {
         'filename': u'data/bagitspec.pdf',
         'length': u'-',
         'url': u'http://www.digitalpreservation.gov/documents/bagitspec.pdf',
     }
     self.test_fetch_contents = [fetch_entry]
Ejemplo n.º 6
0
 def setUp(self):
     """Open the ``test/testbag`` fixture and store a reference fetch entry."""
     fixture_path = os.path.join(os.getcwd(), "test", "testbag")
     self.bag = BagIt(fixture_path)
     # Single fetch record kept on the test case as test_fetch_contents.
     fetch_entry = {
         "filename": "data/bagitspec.pdf",
         "length": "-",
         "url": "http://www.digitalpreservation.gov/documents/bagitspec.pdf",
     }
     self.test_fetch_contents = [fetch_entry]
Ejemplo n.º 7
0
    def run(self, rp_id):
        """Build the zipped BagIt package for one results package.

        The results package is marked PROCESSING, the outputs of its
        workflow run are copied into a temporary bag according to the
        package's ``packaging_mode``, and the bag is validated and zipped
        into the directory of ``get_package_path(rp_id)``.

        Packaging modes:
            0: only endpoint outputs of FINISHED run jobs; one directory
               per "<job> - <output port>", files named after the resource.
            1: one directory per resource (or resource list); files named
               "<job> - <output port>". FAILED run jobs get an
               "... - ERROR.txt" file with the error summary and details.
            2: not implemented.

        Args:
            rp_id: UUID of the ResultsPackage; also used as the bag
                directory name inside the temporary directory.

        Returns:
            True when the package was built and marked FINISHED, False
            when the bag failed validation (the package is then marked
            FAILED and nothing is packaged).

        Raises:
            NotImplementedError: for packaging mode 2.
            ValueError: for any other unsupported packaging mode.
        """
        rp_query = ResultsPackage.objects.filter(uuid=rp_id)
        rp_query.update(status=task_status.PROCESSING,
                        celery_task_id=self.request.id)
        rp = rp_query.first()
        mode = rp.packaging_mode
        package_path = get_package_path(rp_id)

        # Every output of the workflow run, annotated with ``is_endpoint``.
        # The flag is set when the output's resource (or resource list) has
        # no inputs at all, or matches the negated "input of a run job in
        # this workflow run" filter.
        output_objs = (
            Output.objects.filter(
                run_job__workflow_run=rp.workflow_run).select_related(
                    "resource", "resource__resource_type", "resource_list",
                    "run_job").prefetch_related("resource_list__resources").
            annotate(is_endpoint=Case(
                When(
                    condition=(
                        Q(resource__isnull=False)
                        & (Q(resource__inputs__isnull=True)
                           | ~Q(resource__inputs__run_job__workflow_run=rp.
                                workflow_run)))
                    | (Q(resource_list__isnull=False)
                       & (Q(resource_list__inputs__isnull=True)
                          | ~Q(resource_list__inputs__run_job__workflow_run=rp.
                               workflow_run))),
                    then=Value(True),
                ),
                default=Value(False),
                output_field=BooleanField(),
            )))

        # Copying the outputs accounts for the first 70% of the progress.
        if len(output_objs) > 0:
            percentage_increment = 70.00 / len(output_objs)
        else:
            percentage_increment = 0
        completed = 0.0

        with TemporaryDirectory() as td:
            tmp_dir = os.path.join(
                td, rp_id)  # because rp_id will be name of the packaged zip
            bag = BagIt(tmp_dir)
            # NOTE(review): outputs are copied directly under the bag root
            # (tmp_dir), not under a data/ payload directory -- confirm this
            # is the intended layout.

            # presumably these hand out one stable display name per id;
            # _NameFinder is defined elsewhere -- verify.
            job_namefinder = self._NameFinder()
            res_namefinder = self._NameFinder()

            for output in output_objs:
                if mode == 0:
                    # only endpoint resources, subdirectoried by different outputs
                    # continue if not endpoint output
                    if output.is_endpoint is False:
                        continue

                    j_name = job_namefinder.find(
                        output.run_job.workflow_job_id,
                        output.run_job.job_name)
                    opt_name = output.output_port_type_name
                    op_dir = os.path.join(tmp_dir,
                                          "{0} - {1}".format(j_name, opt_name))

                    rj_status = output.run_job.status
                    if rj_status == task_status.FINISHED:
                        if output.resource is not None:
                            filepath = output.resource.resource_file.path
                            ext = os.path.splitext(filepath)[1]

                            res_name = res_namefinder.find(
                                output.resource_id, output.resource.name
                            )  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                            result_filename = "{0}{1}".format(res_name, ext)
                            if not os.path.exists(op_dir):
                                os.makedirs(op_dir)
                            shutil.copyfile(
                                filepath, os.path.join(op_dir,
                                                       result_filename))
                        elif output.resource_list is not None:
                            res_name = res_namefinder.find(
                                output.resource_list_id,
                                output.resource_list.name
                            )  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                            result_foldername = "{0}.list".format(res_name)
                            result_folder = os.path.join(
                                op_dir, result_foldername)
                            if not os.path.exists(result_folder):
                                os.makedirs(result_folder)

                            # Members are named by zero-padded position so
                            # they sort in list order.
                            cnt = output.resource_list.resources.count()
                            zfills = len(str(cnt))
                            for idx, r in enumerate(
                                    output.resource_list.resources.all()):
                                filepath = r.resource_file.path
                                ext = os.path.splitext(filepath)[1]
                                new_filename = "{0}{1}".format(
                                    str(idx).zfill(zfills), ext)
                                shutil.copyfile(
                                    filepath,
                                    os.path.join(result_folder, new_filename))

                elif mode == 1:
                    # Name the directory after the output's resource, falling
                    # back to its resource list. Dereferencing
                    # output.resource unconditionally raised AttributeError
                    # for list-only outputs, so the resource_list branch
                    # below could never run. An output with neither still
                    # raises AttributeError here.
                    if output.resource is not None:
                        named_id = output.resource_id
                        named_obj = output.resource
                    else:
                        named_id = output.resource_list_id
                        named_obj = output.resource_list
                    res_name = res_namefinder.find(
                        named_id, named_obj.name
                    )  # [TODO]: or... find the modified resource name if the resource_uuid still exists?
                    res_dir = os.path.join(tmp_dir, res_name)

                    j_name = job_namefinder.find(
                        output.run_job.workflow_job_id,
                        output.run_job.job_name)
                    opt_name = output.output_port_type_name

                    rj_status = output.run_job.status
                    if rj_status == task_status.FINISHED:
                        if output.resource is not None:
                            filepath = output.resource.resource_file.path
                            ext = os.path.splitext(filepath)[1]
                            result_filename = "{0} - {1}{2}".format(
                                j_name, opt_name, ext)
                            if not os.path.exists(res_dir):
                                os.makedirs(res_dir)
                            shutil.copyfile(
                                filepath, os.path.join(res_dir,
                                                       result_filename))
                        elif output.resource_list is not None:
                            result_foldername = "{0} - {1}.list".format(
                                j_name, opt_name)
                            result_folder = os.path.join(
                                res_dir, result_foldername)
                            if not os.path.exists(result_folder):
                                os.makedirs(result_folder)

                            # Members are named by zero-padded position so
                            # they sort in list order.
                            cnt = output.resource_list.resources.count()
                            zfills = len(str(cnt))
                            for idx, r in enumerate(
                                    output.resource_list.resources.all()):
                                filepath = r.resource_file.path
                                ext = os.path.splitext(filepath)[1]
                                new_filename = "{0}{1}".format(
                                    str(idx).zfill(zfills), ext)
                                shutil.copyfile(
                                    filepath,
                                    os.path.join(result_folder, new_filename))

                    elif rj_status == task_status.FAILED:
                        # Failed jobs contribute an error report, not a result.
                        result_filename = "{0} - {1} - ERROR.txt".format(
                            j_name, opt_name)
                        if not os.path.exists(res_dir):
                            os.makedirs(res_dir)
                        with open(os.path.join(res_dir, result_filename),
                                  "w") as f:
                            f.write("Error Summary: ")
                            f.write(output.run_job.error_summary)
                            f.write("\n\nError Details:\n")
                            f.write(output.run_job.error_details)
                elif mode == 2:
                    raise NotImplementedError()  # [TODO]
                else:
                    raise ValueError("mode {0} is not supported".format(mode))

                completed += percentage_increment
                rp_query.update(percent_completed=int(completed))

            bag.update()
            errors = bag.validate()
            if not bag.is_valid:
                rp_query.update(
                    status=task_status.FAILED,
                    error_summary="The bag failed validation.",
                    error_details=str(errors),
                )
                # Stop here: continuing would package the invalid bag and
                # overwrite the FAILED status with FINISHED below.
                return False

            target_dir_name = os.path.dirname(package_path)
            if not os.path.isdir(target_dir_name):
                os.makedirs(target_dir_name)
            bag.package(target_dir_name, method="zip")

        rp_query.update(status=task_status.FINISHED, percent_completed=100)
        expiry_time = rp_query.values_list("expiry_time", flat=True)[0]
        if expiry_time:
            # Schedule the expiry task and remember its id on the package.
            async_task = registry.tasks[
                "rodan.core.expire_package"].apply_async((rp_id, ),
                                                         eta=expiry_time,
                                                         queue="celery")
            expire_task_id = async_task.task_id
        else:
            expire_task_id = None

        rp_query.update(celery_task_id=expire_task_id)
        return True
Ejemplo n.º 8
0
    def run(self, package_id, *args, **kwargs):
        """Assemble, validate and zip the BagIt bag for a results package.

        For every page of the package, a directory is created under the
        bag's data directory and each relevant run job is handed to
        ``_add_result_to_bag``. Progress is reported through
        ``_update_progress`` while results are added. The bag is then
        updated, validated and packaged as a zip, and the working bag
        directory is removed.

        Args:
            package_id: primary key of the ResultsPackage to build.
            *args, **kwargs: accepted but not used by this method.

        Returns:
            None.

        Raises:
            BagNotValidError: if the bag is not valid after ``bag.update()``;
                the package is saved with FAILED status first.
        """
        resultspackage = ResultsPackage.objects.get(pk=package_id)
        # NOTE(review): the package status is compared against a RunJobStatus
        # constant rather than a ResultsPackageStatus one -- confirm intended.
        if resultspackage.status == RunJobStatus.CANCELLED:
            return

        resultspackage.status = ResultsPackageStatus.PROCESSING
        resultspackage.save()

        runjobs = resultspackage.workflow_run.run_jobs.select_related(
            'page', 'job').all()

        # With no pages on the package, use every page seen in the run jobs.
        if not resultspackage.pages.exists():
            pages = set()
            for runjob in runjobs:
                pages.add(runjob.page)
        else:
            pages = resultspackage.pages.all()

        jobs = resultspackage.jobs.all()
        self.package_path = resultspackage.package_path

        # The chunks are intervals used to update the percent_completed field.
        # page_chunk is only bound when there are pages; it is only read
        # inside the page loop, which does not run otherwise.
        if len(pages) > 0:
            page_chunk = 70.00 / len(pages)
        completed = 0.0

        bag = BagIt(resultspackage.bag_path)

        for page in pages:
            page_dir = os.path.join(bag.data_directory, page.name)
            os.makedirs(page_dir)
            page_runjobs = runjobs.filter(page=page)

            if not jobs:
                # If no jobs are provided, we will just make a list of jobs from the available runjobs.
                # NOTE(review): `jobs` is rebound here, so once one page has
                # filled it, later pages take the `else` branch and only see
                # the jobs collected so far -- confirm intended.
                jobs = []
                if len(page_runjobs) > 0:
                    runjob_chunk = page_chunk / len(page_runjobs)

                for runjob in page_runjobs:
                    _add_result_to_bag(page_dir, runjob, bag)

                    completed += runjob_chunk
                    _ensure_db_state(resultspackage)
                    _update_progress(resultspackage, completed)

                    if runjob.workflow_job.job not in jobs:
                        jobs.append(runjob.workflow_job.job)

            else:
                # Split the page's share of progress evenly across the jobs,
                # then across each job's run jobs on this page.
                if len(jobs) > 0:
                    job_chunk = page_chunk / len(jobs)

                for job in jobs:
                    matcthing_runjobs = page_runjobs.filter(
                        workflow_job__job=job)
                    if len(matcthing_runjobs) > 0:
                        runjob_chunk = job_chunk / len(matcthing_runjobs)

                    for runjob in matcthing_runjobs:
                        _add_result_to_bag(page_dir, runjob, bag)

                        completed += runjob_chunk
                        _ensure_db_state(resultspackage)
                        _update_progress(resultspackage, completed)

        bag.update()
        errors = bag.validate()
        if not bag.is_valid:
            # Record the failure on the package before raising.
            _ensure_db_state(resultspackage)
            resultspackage.status = ResultsPackageStatus.FAILED
            resultspackage.save()
            raise BagNotValidError("The bag failed validation.\n" +
                                   str(errors))

        bag.package(resultspackage.package_path, method='zip')
        resultspackage.download_url = resultspackage.file_url
        resultspackage.percent_completed = 100
        resultspackage.status = ResultsPackageStatus.COMPLETE

        # If pages and jobs were not provided, we populate these fields now
        # since we have figured them out.
        resultspackage.pages = pages
        resultspackage.jobs = jobs

        _ensure_db_state(resultspackage)
        resultspackage.save()
        # The unpacked bag directory is removed once the zip has been written.
        shutil.rmtree(resultspackage.bag_path)
Ejemplo n.º 9
0
 def test_unicode_characters_in_bagnam(self):
     """Creating a bag at a path with a non-ASCII name leaves that path on disk."""
     created = BagIt(os.path.join(os.getcwd(), 'test', 'tëst'))
     expected_path = os.path.join(os.getcwd(), 'test', 'tëst')
     bag_exists = os.path.exists(expected_path)
     self.assertTrue(bag_exists)
Ejemplo n.º 10
0
 def setUp(self):
     """Open the ``test/testbag`` fixture under the working directory as ``self.bag``."""
     fixture_path = os.path.join(os.getcwd(), 'test', 'testbag')
     self.bag = BagIt(fixture_path)
Ejemplo n.º 11
0
 def setUp(self):
     """Open the ``test/testbag`` fixture under the working directory as ``self.bag``."""
     path_parts = (os.getcwd(), "test", "testbag")
     self.bag = BagIt(os.path.join(*path_parts))