def run(self, path, format_map, delete_original=True):
    """Convert the file at *path* — or, when *path* is a directory, every
    file beneath it — whose extension (without the leading dot) has an
    entry in *format_map* (extension -> target format).

    Increments ``self.files_count`` once per converted file and removes
    each original after conversion when *delete_original* is true.
    Files with unmapped extensions are skipped silently.
    """
    self.files_count = 0
    path, = self.parse_params(path)

    if os.path.isfile(path):
        self._convert_if_mapped(path, format_map, delete_original)
        return

    for root, _dirs, filenames in walk(path):
        for fname in filenames:
            self._convert_if_mapped(
                os.path.join(root, fname), format_map, delete_original)

def _convert_if_mapped(self, filepath, format_map, delete_original):
    """Convert one file if its extension is mapped; otherwise do nothing.

    Shared by the single-file and directory-walk paths of :meth:`run`,
    which previously duplicated this logic verbatim.
    """
    try:
        new_format = format_map[os.path.splitext(filepath)[1][1:]]
    except KeyError:
        # Extension not in the map: leave the file untouched.
        return
    convert_file(filepath, new_format)
    self.files_count += 1
    if delete_original:
        os.remove(filepath)
def ConvertFile(self, path, format_map, delete_original=True):
    """Convert every file at *path* whose extension appears in
    *format_map*, then emit a success event reporting the count.

    *path* may be a single file or a directory tree; originals are
    deleted after conversion when *delete_original* is true.
    """
    self.files_count = 0
    path, = self.parse_params(path)

    if os.path.isfile(path):
        # NOTE(review): this branch returns without emitting the success
        # event created at the end of the method — confirm intended.
        extension = os.path.splitext(path)[1][1:]
        if extension not in format_map:
            return
        convert_file(path, format_map[extension])
        self.files_count += 1
        if delete_original:
            os.remove(path)
        return

    for root, _dirs, filenames in walk(path):
        for filename in filenames:
            filepath = os.path.join(root, filename)
            extension = os.path.splitext(filepath)[1][1:]
            if extension not in format_map:
                continue
            convert_file(filepath, format_map[extension])
            self.files_count += 1
            if delete_original:
                os.remove(filepath)

    msg = "Converted %s file(s) at %s" % (self.files_count, path,)
    self.create_success_event(msg)
def run(self, filepath, new_format, delete_original=True):
    """Convert *filepath* to *new_format*, optionally deleting the original.

    Any exception raised by ``convert_file`` propagates unchanged, and the
    original file is removed only after a successful conversion (and only
    when *delete_original* is true).

    The previous ``try: ... except: raise else: ...`` wrapper was a no-op:
    a bare re-raise is exactly equivalent to letting the exception
    propagate, so the straight-line form below has identical behavior
    without the bare-``except`` anti-pattern.
    """
    convert_file(filepath, new_format)
    if delete_original:
        os.remove(filepath)
def test_non_zero_returncode(self, mock_popen):
    """convert_file raises ValueError when the subprocess exits non-zero."""
    # Fake a Popen whose communicate() succeeds but whose returncode is 1.
    fake_process = mock.Mock()
    fake_process.configure_mock(**{
        'communicate.return_value': ('output', 'error'),
        'returncode': 1,
    })
    mock_popen.return_value = fake_process

    with self.assertRaises(ValueError):
        convert_file("test.docx", "pdf")

    expected_cmd = 'unoconv -f %s -eSelectPdfVersion=1 "%s"' % ('pdf', 'test.docx')
    mock_popen.assert_called_once_with(
        expected_cmd, shell=True, stderr=PIPE, stdout=PIPE)
def test_non_zero_returncode(self, mock_popen):
    """convert_file raises ValueError on a non-zero unoconv exit status."""
    # Fake a Popen whose communicate() succeeds but whose returncode is 1.
    fake_process = mock.Mock()
    fake_process.configure_mock(**{
        'communicate.return_value': ('output', 'error'),
        'returncode': 1,
    })
    mock_popen.return_value = fake_process

    with self.assertRaises(ValueError):
        convert_file("test.docx", "pdf")

    # On Windows the bundled unoconv script is invoked through python.exe.
    if sys.platform == "win32":
        expected = ['python.exe', 'C:/ESSArch/pd/python/scripts/unoconv.py']
    else:
        expected = ['unoconv']
    expected += ['-f', 'pdf', '-eSelectPdfVersion=1', 'test.docx']
    mock_popen.assert_called_once_with(expected, stderr=PIPE, stdout=PIPE)
def test_zero_returncode_with_no_file_created(self, mock_popen, mock_isfile):
    """A zero exit status without the output file still raises ValueError."""
    # Fake a Popen that reports success even though no file was produced.
    fake_process = mock.Mock()
    fake_process.configure_mock(**{
        'communicate.return_value': ('output', 'error'),
        'returncode': 0,
    })
    mock_popen.return_value = fake_process

    with self.assertRaises(ValueError):
        convert_file("test.docx", "pdf")

    # On Windows the bundled unoconv script is invoked through python.exe.
    if sys.platform == "win32":
        expected = [
            'python.exe',
            os.path.join(get_script_directory(), 'unoconv.py')
        ]
    else:
        expected = ['unoconv']
    expected += ['-f', 'pdf', '-eSelectPdfVersion=1', 'test.docx']
    mock_popen.assert_called_once_with(expected, stderr=PIPE, stdout=PIPE)
def test_zero_returncode_with_file_created(self, mock_popen, mock_isfile):
    """A zero exit status with the target file present returns its path."""
    # Fake a successful Popen run.
    fake_process = mock.Mock()
    fake_process.configure_mock(**{
        'communicate.return_value': ('output', 'error'),
        'returncode': 0,
    })
    mock_popen.return_value = fake_process

    self.assertEqual(convert_file("test.docx", "pdf"), 'test.pdf')

    expected_cmd = 'unoconv -f %s -eSelectPdfVersion=1 "%s"' % ('pdf', 'test.docx')
    mock_popen.assert_called_once_with(
        expected_cmd, shell=True, stderr=PIPE, stdout=PIPE)
def _run(self):
    """Run this conversion job over every active information package on
    the job's rule that has not yet produced an entry for this job.

    For each IP: ensure it is cached, copy the cached content into a new
    generation, convert the files matching the rule's specification
    (recording one ConversionJobEntry per file), and hand the new
    generation off for preservation.
    """
    def get_information_packages():
        # Active IPs on the rule that this job has not already processed.
        return self.rule.information_packages.filter(
            active=True,
        ).exclude(
            conversion_job_entries__job=self,
        )

    ips = get_information_packages()

    for ip in ips.order_by('-cached').iterator():  # convert cached IPs first
        # Block until the IP is available in the cache storage.
        run_cached_ip(ip)
        policy = ip.policy
        srcdir = os.path.join(policy.cache_storage.value, ip.object_identifier_value)
        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
        dstdir = os.path.join(policy.cache_storage.value, new_ip.object_identifier_value)
        new_ip.object_path = dstdir
        new_ip.save()
        aip_profile = new_ip.get_profile_rel('aip').profile
        aip_profile_data = new_ip.get_profile_data('aip')
        mets_dir, mets_name = find_destination("mets_file", aip_profile.structure)
        mets_path = os.path.join(srcdir, mets_dir, mets_name)

        # copy files to new generation
        shutil.copytree(srcdir, dstdir)

        # convert files specified in rule
        # Each spec maps a glob pattern to {'target': format, 'tool': name}.
        for pattern, spec in self.rule.specification.items():
            target = spec['target']
            tool = spec['tool']
            for path in iglob(dstdir + '/' + pattern):
                if os.path.isdir(path):
                    # Pattern matched a directory: convert everything below it.
                    for root, dirs, files in walk(path):
                        rel = os.path.relpath(root, dstdir)
                        for f in files:
                            fpath = os.path.join(root, f)
                            job_entry = ConversionJobEntry.objects.create(
                                job=self,
                                start_date=timezone.now(),
                                ip=ip,
                                old_document=os.path.join(rel, f)
                            )
                            convert_file(fpath, target)
                            # The converted file replaces the original.
                            os.remove(fpath)
                            job_entry.new_document = os.path.splitext(
                                job_entry.old_document)[0] + '.' + target
                            job_entry.end_date = timezone.now()
                            job_entry.tool = tool
                            job_entry.save()
                elif os.path.isfile(path):
                    rel = os.path.relpath(path, dstdir)
                    job_entry = ConversionJobEntry.objects.create(
                        job=self,
                        start_date=timezone.now(),
                        ip=ip,
                        old_document=rel,
                    )
                    convert_file(path, target)
                    os.remove(path)
                    job_entry.new_document = os.path.splitext(
                        job_entry.old_document)[0] + '.' + target
                    job_entry.end_date = timezone.now()
                    job_entry.tool = tool
                    job_entry.save()

        # preserve new generation
        preserve_new_generation(
            aip_profile, aip_profile_data, dstdir, ip, mets_path, new_ip, policy)
def _run(self):
    """Run this conversion job: for each pending information package,
    wait for it to be cached, build a new generation with the converted
    files, regenerate METS/PREMIS metadata, package the result as a tar
    with checksums, and queue it for storage.
    """
    def get_information_packages(job):
        # Active IPs on the rule that this job has not already processed.
        # NOTE(review): the `job` parameter is unused — the closure reads
        # `self` directly; confirm whether the parameter is intentional.
        return self.rule.information_packages.filter(
            active=True,
        ).exclude(
            conversion_job_entries__job=self,
        )

    ips = get_information_packages(self)

    for ip in ips.order_by('-cached').iterator():  # convert cached IPs first
        # Poll until the IP is cached, kicking off (or re-running) the
        # CacheAIP task as needed. allow_join_result permits calling
        # task.run() synchronously from inside a running task.
        while not ip.cached:
            with allow_join_result():
                t, created = ProcessTask.objects.get_or_create(
                    name='workflow.tasks.CacheAIP',
                    information_package=ip,
                    defaults={
                        'responsible': ip.responsible,
                        'eager': False
                    })
                if not created:
                    t.run()
            time.sleep(10)
            ip.refresh_from_db()

        policy = ip.policy
        srcdir = os.path.join(policy.cache_storage.value, ip.object_identifier_value)
        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
        dstdir = os.path.join(policy.cache_storage.value, new_ip.object_identifier_value)
        new_ip.object_path = dstdir
        new_ip.save()
        aip_profile = new_ip.get_profile_rel('aip').profile
        aip_profile_data = new_ip.get_profile_data('aip')
        mets_dir, mets_name = find_destination("mets_file", aip_profile.structure)
        mets_path = os.path.join(srcdir, mets_dir, mets_name)
        # NOTE(review): mets_tree is parsed but never used below — confirm.
        mets_tree = etree.parse(mets_path)

        # copy files to new generation
        shutil.copytree(srcdir, dstdir)

        # convert files specified in rule
        # Each spec maps a glob pattern to {'target': format, 'tool': name}.
        for pattern, spec in six.iteritems(self.rule.specification):
            target = spec['target']
            tool = spec['tool']
            for path in iglob(dstdir + '/' + pattern):
                if os.path.isdir(path):
                    # Pattern matched a directory: convert everything below it.
                    for root, dirs, files in walk(path):
                        rel = os.path.relpath(root, dstdir)
                        for f in files:
                            fpath = os.path.join(root, f)
                            job_entry = ConversionJobEntry.objects.create(
                                job=self,
                                start_date=timezone.now(),
                                ip=ip,
                                old_document=os.path.join(rel, f)
                            )
                            convert_file(fpath, target)
                            # The converted file replaces the original.
                            os.remove(fpath)
                            job_entry.new_document = os.path.splitext(
                                job_entry.old_document)[0] + '.' + target
                            job_entry.end_date = timezone.now()
                            job_entry.tool = tool
                            job_entry.save()
                elif os.path.isfile(path):
                    rel = os.path.relpath(path, dstdir)
                    job_entry = ConversionJobEntry.objects.create(
                        job=self,
                        start_date=timezone.now(),
                        ip=ip,
                        old_document=rel,
                    )
                    convert_file(path, target)
                    os.remove(path)
                    job_entry.new_document = os.path.splitext(
                        job_entry.old_document)[0] + '.' + target
                    job_entry.end_date = timezone.now()
                    job_entry.tool = tool
                    job_entry.save()

        # preserve new generation
        sa = new_ip.submission_agreement

        # Remove the stale METS so it is regenerated; missing file is fine.
        try:
            os.remove(mets_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        filesToCreate = OrderedDict()

        # Regenerate the PREMIS file too, if the IP has that profile.
        try:
            premis_profile = new_ip.get_profile_rel(
                'preservation_metadata').profile
            premis_profile_data = ip.get_profile_data(
                'preservation_metadata')
        except ProfileIP.DoesNotExist:
            pass
        else:
            premis_dir, premis_name = find_destination(
                "preservation_description_file", aip_profile.structure)
            premis_path = os.path.join(dstdir, premis_dir, premis_name)
            try:
                os.remove(premis_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
            filesToCreate[premis_path] = {
                'spec': premis_profile.specification,
                'data': fill_specification_data(premis_profile_data, ip=new_ip, sa=sa),
            }

        filesToCreate[mets_path] = {
            'spec': aip_profile.specification,
            'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
        }

        # Generate the XML metadata files synchronously (.get() blocks).
        t = ProcessTask.objects.create(
            name='ESSArch_Core.tasks.GenerateXML',
            params={
                'filesToCreate': filesToCreate,
                'folderToParse': dstdir,
            },
            responsible=new_ip.responsible,
            information_package=new_ip,
        )
        t.run().get()

        dsttar = dstdir + '.tar'
        dstxml = dstdir + '.xml'
        objid = new_ip.object_identifier_value

        # Pack the new generation into a tar, indexing every path as we go.
        with tarfile.open(dsttar, 'w') as tar:
            for root, dirs, files in walk(dstdir):
                rel = os.path.relpath(root, dstdir)
                for d in dirs:
                    src = os.path.join(root, d)
                    arc = os.path.join(objid, rel, d)
                    arc = os.path.normpath(arc)
                    index_path(new_ip, src)
                    # recursive=False: contained files are added individually.
                    tar.add(src, arc, recursive=False)
                for f in files:
                    src = os.path.join(root, f)
                    index_path(new_ip, src)
                    tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

        algorithm = policy.get_checksum_algorithm_display()
        checksum = calculate_checksum(dsttar, algorithm=algorithm)

        info = fill_specification_data(
            new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
        info["_IP_CREATEDATE"] = timestamp_to_datetime(
            creation_date(dsttar)).isoformat()

        aip_desc_profile = new_ip.get_profile('aip_description')
        filesToCreate = {
            dstxml: {
                'spec': aip_desc_profile.specification,
                'data': info
            }
        }

        # Generate the AIP description XML for the tar (synchronous).
        ProcessTask.objects.create(
            name="ESSArch_Core.tasks.GenerateXML",
            params={
                "filesToCreate": filesToCreate,
                "folderToParse": dsttar,
                "extra_paths_to_parse": [mets_path],
                "algorithm": algorithm,
            },
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()

        # .update() avoids re-triggering model save() side effects.
        InformationPackage.objects.filter(pk=new_ip.pk).update(
            message_digest=checksum,
            message_digest_algorithm=policy.checksum_algorithm,
        )

        ProcessTask.objects.create(
            name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()

        # Queue the new generation for archival storage (asynchronous).
        t = ProcessTask.objects.create(
            name='workflow.tasks.StoreAIP',
            information_package=new_ip,
            responsible=new_ip.responsible,
        )
        t.run()