Example #1
0
    def run(self, path, format_map, delete_original=True):
        """Convert the file at ``path``, or every file below it if it is
        a directory.

        Each file whose extension has an entry in ``format_map`` is
        converted to the mapped format; files with unmapped extensions
        are skipped silently. ``self.files_count`` is reset and counts
        the files actually converted.

        :param path: file or directory path (resolved via ``parse_params``)
        :param format_map: dict mapping extension (without dot) -> target format
        :param delete_original: remove the source file after conversion
        """
        self.files_count = 0
        path, = self.parse_params(path)

        def _convert(filepath):
            # Shared per-file logic for both the single-file and the
            # directory-walk branches.
            try:
                new_format = format_map[os.path.splitext(filepath)[1][1:]]
            except KeyError:
                # Extension not in the map: skip this file.
                return
            convert_file(filepath, new_format)
            self.files_count += 1
            if delete_original:
                os.remove(filepath)

        if os.path.isfile(path):
            _convert(path)
            return

        for root, _dirs, filenames in walk(path):
            for fname in filenames:
                _convert(os.path.join(root, fname))
Example #2
0
def ConvertFile(self, path, format_map, delete_original=True):
    """Convert the file at ``path``, or every file below it if it is a
    directory, then emit a success event (directory case only, matching
    the original behavior).

    Each file whose extension has an entry in ``format_map`` is
    converted to the mapped format; unmapped extensions are skipped.
    ``self.files_count`` counts the conversions performed.

    :param path: file or directory path (resolved via ``parse_params``)
    :param format_map: dict mapping extension (without dot) -> target format
    :param delete_original: remove the source file after conversion
    """
    self.files_count = 0
    path, = self.parse_params(path)

    def _convert(filepath):
        # Shared per-file logic for both branches below.
        try:
            new_format = format_map[os.path.splitext(filepath)[1][1:]]
        except KeyError:
            # Extension not in the map: skip this file.
            return
        convert_file(filepath, new_format)
        self.files_count += 1
        if delete_original:
            os.remove(filepath)

    if os.path.isfile(path):
        _convert(path)
        # Single-file mode never reported a success event originally;
        # keep that behavior.
        return

    for root, _dirs, filenames in walk(path):
        for fname in filenames:
            _convert(os.path.join(root, fname))

    msg = "Converted %s file(s) at %s" % (self.files_count, path,)
    self.create_success_event(msg)
Example #3
0
 def run(self, filepath, new_format, delete_original=True):
     """Convert ``filepath`` to ``new_format`` and optionally delete
     the original.

     Any exception from ``convert_file`` propagates to the caller, in
     which case the original file is left in place.

     :param filepath: path of the file to convert
     :param new_format: target format passed to ``convert_file``
     :param delete_original: remove the source file after a successful
         conversion
     """
     # The original ``try/except: raise`` wrapper was a no-op (a bare
     # except that only re-raised), so the straight-line form below is
     # behaviorally identical.
     convert_file(filepath, new_format)
     if delete_original:
         os.remove(filepath)
Example #4
0
    def test_non_zero_returncode(self, mock_popen):
        """A non-zero unoconv exit code must raise ValueError."""
        proc = mock.Mock()
        proc.configure_mock(**{
            'communicate.return_value': ('output', 'error'),
            'returncode': 1,
        })
        mock_popen.return_value = proc

        with self.assertRaises(ValueError):
            convert_file("test.docx", "pdf")

        # Popen must have been invoked exactly once with the shell command.
        cmd = 'unoconv -f %s -eSelectPdfVersion=1 "%s"' % ('pdf', 'test.docx')
        mock_popen.assert_called_once_with(
            cmd, shell=True, stderr=PIPE, stdout=PIPE)
Example #5
0
    def test_non_zero_returncode(self, mock_popen):
        """A non-zero unoconv exit code must raise ValueError."""
        proc = mock.Mock()
        proc.configure_mock(**{
            'communicate.return_value': ('output', 'error'),
            'returncode': 1,
        })
        mock_popen.return_value = proc

        with self.assertRaises(ValueError):
            convert_file("test.docx", "pdf")

        # Expected argv differs per platform (script path on Windows,
        # plain binary elsewhere).
        if sys.platform == "win32":
            expected = ['python.exe', 'C:/ESSArch/pd/python/scripts/unoconv.py']
        else:
            expected = ['unoconv']
        expected += ['-f', 'pdf', '-eSelectPdfVersion=1', 'test.docx']
        mock_popen.assert_called_once_with(expected, stderr=PIPE, stdout=PIPE)
Example #6
0
    def test_zero_returncode_with_no_file_created(self, mock_popen,
                                                  mock_isfile):
        """Exit code 0 without an output file must still raise ValueError."""
        proc = mock.Mock()
        proc.configure_mock(**{
            'communicate.return_value': ('output', 'error'),
            'returncode': 0,
        })
        mock_popen.return_value = proc

        with self.assertRaises(ValueError):
            convert_file("test.docx", "pdf")

        # Expected argv differs per platform (script path on Windows,
        # plain binary elsewhere).
        if sys.platform == "win32":
            expected = [
                'python.exe',
                os.path.join(get_script_directory(), 'unoconv.py'),
            ]
        else:
            expected = ['unoconv']
        expected += ['-f', 'pdf', '-eSelectPdfVersion=1', 'test.docx']
        mock_popen.assert_called_once_with(expected, stderr=PIPE, stdout=PIPE)
Example #7
0
    def test_zero_returncode_with_file_created(self, mock_popen, mock_isfile):
        """A successful conversion returns the path of the new file."""
        proc = mock.Mock()
        proc.configure_mock(**{
            'communicate.return_value': ('output', 'error'),
            'returncode': 0,
        })
        mock_popen.return_value = proc

        self.assertEqual(convert_file("test.docx", "pdf"), 'test.pdf')

        # Popen must have been invoked exactly once with the shell command.
        cmd = 'unoconv -f %s -eSelectPdfVersion=1 "%s"' % ('pdf', 'test.docx')
        mock_popen.assert_called_once_with(
            cmd, shell=True, stderr=PIPE, stdout=PIPE)
Example #8
0
    def _run(self):
        """Run this conversion job.

        For every active information package (IP) matched by the job's
        rule and not yet handled by this job: create a new IP
        generation, copy the cached files into it, convert the files
        matched by the rule's specification (recording one
        ``ConversionJobEntry`` per converted file), and preserve the
        new generation.
        """
        def get_information_packages():
            # IPs selected by the rule that this job has not processed yet.
            return self.rule.information_packages.filter(
                active=True, ).exclude(conversion_job_entries__job=self, )

        ips = get_information_packages()

        for ip in ips.order_by(
                '-cached').iterator():  # convert cached IPs first
            # Make sure the IP's files are available in the cache.
            run_cached_ip(ip)

            policy = ip.policy
            # Source: cached files of the current generation.
            srcdir = os.path.join(policy.cache_storage.value,
                                  ip.object_identifier_value)

            new_ip = ip.create_new_generation(ip.state, ip.responsible, None)

            # Destination: cache directory of the new generation.
            dstdir = os.path.join(policy.cache_storage.value,
                                  new_ip.object_identifier_value)

            new_ip.object_path = dstdir
            new_ip.save()

            aip_profile = new_ip.get_profile_rel('aip').profile
            aip_profile_data = new_ip.get_profile_data('aip')

            # Locate the METS file within the AIP structure (path is
            # relative to the source directory).
            mets_dir, mets_name = find_destination("mets_file",
                                                   aip_profile.structure)
            mets_path = os.path.join(srcdir, mets_dir, mets_name)

            # copy files to new generation
            shutil.copytree(srcdir, dstdir)

            # convert files specified in rule
            for pattern, spec in self.rule.specification.items():
                target = spec['target']  # target format, e.g. 'pdf'
                tool = spec['tool']      # recorded on each job entry

                for path in iglob(dstdir + '/' + pattern):
                    if os.path.isdir(path):
                        # A matched directory: convert every file below it.
                        for root, dirs, files in walk(path):
                            rel = os.path.relpath(root, dstdir)

                            for f in files:
                                fpath = os.path.join(root, f)
                                # Entry is created before conversion so
                                # start_date brackets the work.
                                job_entry = ConversionJobEntry.objects.create(
                                    job=self,
                                    start_date=timezone.now(),
                                    ip=ip,
                                    old_document=os.path.join(rel, f))
                                convert_file(fpath, target)

                                # The converted copy replaces the original.
                                os.remove(fpath)

                                job_entry.new_document = os.path.splitext(
                                    job_entry.old_document)[0] + '.' + target
                                job_entry.end_date = timezone.now()
                                job_entry.tool = tool
                                job_entry.save()

                    elif os.path.isfile(path):
                        # A matched single file.
                        rel = os.path.relpath(path, dstdir)

                        job_entry = ConversionJobEntry.objects.create(
                            job=self,
                            start_date=timezone.now(),
                            ip=ip,
                            old_document=rel,
                        )
                        convert_file(path, target)

                        os.remove(path)

                        job_entry.new_document = os.path.splitext(
                            job_entry.old_document)[0] + '.' + target
                        job_entry.end_date = timezone.now()
                        job_entry.tool = tool
                        job_entry.save()

            # preserve new generation
            preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip,
                                    mets_path, new_ip, policy)
Example #9
0
    def _run(self):
        """Run this conversion job.

        For every active information package (IP) matched by the job's
        rule and not yet handled by this job: wait until the IP is
        cached, create a new IP generation, copy the cached files into
        it, convert the files matched by the rule's specification
        (recording one ``ConversionJobEntry`` per converted file),
        regenerate the METS/PREMIS metadata, tar the new generation,
        compute its checksum, and queue it for storage.
        """
        def get_information_packages(job):
            # IPs selected by the rule that this job has not processed yet.
            # NOTE(review): the ``job`` parameter is unused; ``self`` is
            # closed over instead.
            return self.rule.information_packages.filter(
                active=True, ).exclude(conversion_job_entries__job=self, )

        ips = get_information_packages(self)

        for ip in ips.order_by(
                '-cached').iterator():  # convert cached IPs first
            # Poll until the IP is cached, (re)running a CacheAIP task
            # while waiting. NOTE(review): t.run() fires only when the
            # task already existed (``not created``) — confirm intended.
            while not ip.cached:
                with allow_join_result():
                    t, created = ProcessTask.objects.get_or_create(
                        name='workflow.tasks.CacheAIP',
                        information_package=ip,
                        defaults={
                            'responsible': ip.responsible,
                            'eager': False
                        })

                    if not created:
                        t.run()

                time.sleep(10)
                ip.refresh_from_db()

            policy = ip.policy
            # Source: cached files of the current generation.
            srcdir = os.path.join(policy.cache_storage.value,
                                  ip.object_identifier_value)

            new_ip = ip.create_new_generation(ip.state, ip.responsible, None)

            # Destination: cache directory of the new generation.
            dstdir = os.path.join(policy.cache_storage.value,
                                  new_ip.object_identifier_value)

            new_ip.object_path = dstdir
            new_ip.save()

            aip_profile = new_ip.get_profile_rel('aip').profile
            aip_profile_data = new_ip.get_profile_data('aip')

            # Locate the METS file within the AIP structure (path is
            # relative to the source directory).
            mets_dir, mets_name = find_destination("mets_file",
                                                   aip_profile.structure)
            mets_path = os.path.join(srcdir, mets_dir, mets_name)

            # NOTE(review): the parsed tree is never used below.
            mets_tree = etree.parse(mets_path)

            # copy files to new generation
            shutil.copytree(srcdir, dstdir)

            # convert files specified in rule
            for pattern, spec in six.iteritems(self.rule.specification):
                target = spec['target']  # target format, e.g. 'pdf'
                tool = spec['tool']      # recorded on each job entry

                for path in iglob(dstdir + '/' + pattern):
                    if os.path.isdir(path):
                        # A matched directory: convert every file below it.
                        for root, dirs, files in walk(path):
                            rel = os.path.relpath(root, dstdir)

                            for f in files:
                                fpath = os.path.join(root, f)
                                # Entry is created before conversion so
                                # start_date brackets the work.
                                job_entry = ConversionJobEntry.objects.create(
                                    job=self,
                                    start_date=timezone.now(),
                                    ip=ip,
                                    old_document=os.path.join(rel, f))
                                convert_file(fpath, target)

                                # The converted copy replaces the original.
                                os.remove(fpath)

                                job_entry.new_document = os.path.splitext(
                                    job_entry.old_document)[0] + '.' + target
                                job_entry.end_date = timezone.now()
                                job_entry.tool = tool
                                job_entry.save()

                    elif os.path.isfile(path):
                        # A matched single file.
                        rel = os.path.relpath(path, dstdir)

                        job_entry = ConversionJobEntry.objects.create(
                            job=self,
                            start_date=timezone.now(),
                            ip=ip,
                            old_document=rel,
                        )
                        convert_file(path, target)

                        os.remove(path)

                        job_entry.new_document = os.path.splitext(
                            job_entry.old_document)[0] + '.' + target
                        job_entry.end_date = timezone.now()
                        job_entry.tool = tool
                        job_entry.save()

            # preserve new generation
            sa = new_ip.submission_agreement

            # The old METS is stale after conversion; remove it (ignore
            # "already gone").
            try:
                os.remove(mets_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

            filesToCreate = OrderedDict()

            # Regenerate the PREMIS file first (if the profile exists) so
            # it is present when the METS below is generated.
            try:
                premis_profile = new_ip.get_profile_rel(
                    'preservation_metadata').profile
                premis_profile_data = ip.get_profile_data(
                    'preservation_metadata')
            except ProfileIP.DoesNotExist:
                pass
            else:
                premis_dir, premis_name = find_destination(
                    "preservation_description_file", aip_profile.structure)
                premis_path = os.path.join(dstdir, premis_dir, premis_name)

                # Remove the stale PREMIS file (ignore "already gone").
                try:
                    os.remove(premis_path)
                except OSError as e:
                    if e.errno != errno.ENOENT:
                        raise

                filesToCreate[premis_path] = {
                    'spec':
                    premis_profile.specification,
                    'data':
                    fill_specification_data(premis_profile_data,
                                            ip=new_ip,
                                            sa=sa),
                }

            filesToCreate[mets_path] = {
                'spec':
                aip_profile.specification,
                'data':
                fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
            }

            # Generate PREMIS/METS XML for the new generation (blocking).
            t = ProcessTask.objects.create(
                name='ESSArch_Core.tasks.GenerateXML',
                params={
                    'filesToCreate': filesToCreate,
                    'folderToParse': dstdir,
                },
                responsible=new_ip.responsible,
                information_package=new_ip,
            )
            t.run().get()

            dsttar = dstdir + '.tar'
            dstxml = dstdir + '.xml'

            objid = new_ip.object_identifier_value

            # Tar the new generation, indexing each path as it is added.
            with tarfile.open(dsttar, 'w') as tar:
                for root, dirs, files in walk(dstdir):
                    rel = os.path.relpath(root, dstdir)
                    for d in dirs:
                        src = os.path.join(root, d)
                        arc = os.path.join(objid, rel, d)
                        arc = os.path.normpath(arc)
                        index_path(new_ip, src)
                        # recursive=False: contained files are added
                        # individually below.
                        tar.add(src, arc, recursive=False)

                    for f in files:
                        src = os.path.join(root, f)
                        index_path(new_ip, src)
                        tar.add(src,
                                os.path.normpath(os.path.join(objid, rel, f)))

            algorithm = policy.get_checksum_algorithm_display()
            checksum = calculate_checksum(dsttar, algorithm=algorithm)

            # Build the AIP-description XML for the tarball.
            info = fill_specification_data(
                new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
            info["_IP_CREATEDATE"] = timestamp_to_datetime(
                creation_date(dsttar)).isoformat()

            aip_desc_profile = new_ip.get_profile('aip_description')
            filesToCreate = {
                dstxml: {
                    'spec': aip_desc_profile.specification,
                    'data': info
                }
            }

            ProcessTask.objects.create(
                name="ESSArch_Core.tasks.GenerateXML",
                params={
                    "filesToCreate": filesToCreate,
                    "folderToParse": dsttar,
                    "extra_paths_to_parse": [mets_path],
                    "algorithm": algorithm,
                },
                information_package=new_ip,
                responsible=new_ip.responsible,
            ).run().get()

            # Record the tarball checksum on the new IP.
            InformationPackage.objects.filter(pk=new_ip.pk).update(
                message_digest=checksum,
                message_digest_algorithm=policy.checksum_algorithm,
            )

            ProcessTask.objects.create(
                name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
                information_package=new_ip,
                responsible=new_ip.responsible,
            ).run().get()

            # Queue the new generation for preservation storage
            # (fire-and-forget: no .get() here).
            t = ProcessTask.objects.create(
                name='workflow.tasks.StoreAIP',
                information_package=new_ip,
                responsible=new_ip.responsible,
            )

            t.run()