def index_path(ip, path, parent=None):
    """
    Indexes the file or directory at path to elasticsearch

    A new ``Tag`` and ``TagVersion`` are created for the path. When a parent
    is given, the tag is also placed under it in the parent's structure.

    :param ip: The IP the path belongs to
    :type ip: InformationPackage
    :param path: The path of the file or directory
    :type path: str
    :param parent: The parent of the tag
    :type parent: TagStructure
    :return: The indexed elasticsearch document
    :rtype: File or Directory
    """

    # Anything that is not a regular file is treated as a directory.
    if os.path.isfile(path):
        kind, indexer = 'document', index_document
    else:
        kind, indexer = 'directory', index_directory

    version_id = str(uuid.uuid4())
    tag = Tag.objects.create(information_package=ip)
    tag_version = TagVersion(
        pk=version_id,
        tag=tag,
        name=os.path.basename(path),
    )

    if parent:
        TagStructure.objects.create(tag=tag, parent=parent, structure=parent.structure)

    # The same label is used for both the index name and the tag version type.
    tag_version.elastic_index = kind
    tag_version.type = kind
    tag_version.save()

    return indexer(ip, path, version_id)
def update(self, instance: TagVersion, validated_data):
    """
    Update a tag version and re-save its ``Archive`` document

    The keys ``structures``, ``notes``, ``identifiers`` and ``appraisal_date``
    are removed from ``validated_data`` and handled separately; whatever
    remains is written straight onto the ``TagVersion`` row.

    :param instance: The tag version to update
    :type instance: TagVersion
    :param validated_data: The validated serializer data (mutated in place)
    :type validated_data: dict
    :return: The refreshed tag version
    :rtype: TagVersion
    """

    tag = instance.tag

    structure_templates = validated_data.pop('structures', [])
    notes = validated_data.pop('notes', None)
    identifiers = validated_data.pop('identifiers', None)
    # Falls back to the current value when no new appraisal date is supplied.
    appraisal_date = validated_data.pop('appraisal_date', tag.appraisal_date)

    self.update_identifiers(instance, identifiers)
    self.update_notes(instance, notes)

    with transaction.atomic():
        for template in structure_templates:
            already_instantiated = TagStructure.objects.filter(
                tag=tag, structure__template=template,
            ).exists()
            if already_instantiated:
                continue

            # Instantiate the template for this tag and save a document
            # for every unit of the new structure instance.
            structure_instance, _ = template.create_template_instance(tag)
            for unit in structure_instance.units.all():
                StructureUnitDocument.from_obj(unit).save()

        tag.appraisal_date = appraisal_date
        tag.save()

        # Queryset update followed by a reload so that ``instance`` reflects
        # what is stored in the database.
        TagVersion.objects.filter(pk=instance.pk).update(**validated_data)
        instance.refresh_from_db()

        Archive.from_obj(instance).save()

    return instance
def parse_errands(self, ip, rootdir, archive, errands_root):
    """
    Generate tag data for every errand found under errands_root

    For each errand, the items produced by ``parse_acts`` for its acts (if
    it has any) are yielded first, followed by the errand's own item.

    :param ip: The information package the tags belong to
    :param rootdir: Root directory forwarded to ``parse_acts``
    :param archive: The archive whose active structure the errands are placed in
    :param errands_root: The element passed to ``get_arkiv_objekt_arenden``
    :return: Generator of ``(tag, tag_version, tag_structure, document dict)``
             tuples for the errands (plus whatever ``parse_acts`` yields)
    """

    archive_node = archive.get_active_structure()
    structure = archive_node.structure

    for errand_el in self.get_arkiv_objekt_arenden(errands_root):
        component, structure_unit = self.parse_errand(errand_el, archive, ip, structure)

        # Unsaved model instances; persisting them is left to the consumer.
        tag = Tag(information_package=ip, task=self.task)
        tag_version = TagVersion(
            pk=component.meta.id,
            tag=tag,
            elastic_index=component._index._name,
            name=component.name,
            type=component.type,
            reference_code=component.reference_code,
        )
        # lft/rght/level are placeholder zeros here.
        tag_repr = TagStructure(
            tag=tag,
            structure_unit=structure_unit,
            structure=structure,
            parent=archive_node,
            tree_id=archive_node.tree_id,
            lft=0,
            rght=0,
            level=0,
        )

        acts_root = self.get_acts_root(errand_el)
        if len(acts_root) > 0:
            # Only the first acts root is parsed.
            yield from self.parse_acts(ip, rootdir, component, acts_root[0], tag_repr)

        yield tag, tag_version, tag_repr, component.to_dict(include_meta=True)
def parse_document(self, ip, rootdir, document, act, parent):
    """
    Build the tag objects and index document for a single attached file

    The file referenced by the ``Lank`` attribute is read in full and its
    content is embedded base64-encoded in the returned document, which is
    marked for the ``ingest_attachment`` pipeline. The resolved file path is
    appended to ``self.indexed_files``.

    :param ip: The information package the file belongs to, or None
    :param rootdir: Directory used to resolve the file when ``ip`` is None,
                    and as the base for the relative ``href``
    :param document: Element with ``Namn``, ``Beskrivning`` and ``Lank`` values
    :param act: The parsed act; supplies ``archive`` and ``ip`` for the document
    :param parent: The tag structure of the act the file is attached to
    :type parent: TagStructure
    :return: ``(tag, tag_version, tag_structure, document dict)``
    :rtype: tuple
    """

    doc_id = str(uuid.uuid4())
    name = document.get("Namn")
    desc = document.get("Beskrivning")
    link = document.get('Lank')

    # Resolve the link against the IP when there is one, otherwise against
    # rootdir; with neither, the link is used as-is.
    if ip is not None:
        filepath = os.path.join(ip.object_path, ip.sip_path, link)
    elif rootdir is not None:
        filepath = os.path.join(rootdir, link)
    else:
        filepath = link

    href = os.path.dirname(os.path.relpath(filepath, rootdir))
    if href == '.':
        href = ''

    filename = os.path.basename(filepath)
    _, dotted_ext = os.path.splitext(filepath)
    ext = dotted_ext[1:]  # drop the leading dot

    with open(filepath, 'rb') as fp:
        raw = fp.read()
    encoded_content = base64.b64encode(raw).decode("ascii")

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    file_doc = File(
        _id=doc_id,
        name=name,
        type='Bilaga',
        archive=act.archive,
        desc=desc,
        filename=filename,
        href=href,
        extension=ext,
        data=encoded_content,
        size=size,
        modified=modified,
        current_version=True,
        ip=act.ip,
        task_id=str(self.task.pk),
    )

    # Unsaved model instances; lft/rght/level are placeholder zeros here.
    tag = Tag(information_package=ip, task=self.task)
    tag_version = TagVersion(
        pk=file_doc.meta.id,
        tag=tag,
        elastic_index=file_doc._index._name,
        name=file_doc.name,
        type=file_doc.type,
        reference_code='',
    )
    tag_repr = TagStructure(
        tag=tag,
        parent=parent,
        structure=parent.structure,
        tree_id=parent.tree_id,
        lft=0,
        rght=0,
        level=0,
    )

    self.indexed_files.append(filepath)

    as_dict = file_doc.to_dict(include_meta=True)
    as_dict['pipeline'] = 'ingest_attachment'
    return tag, tag_version, tag_repr, as_dict
def update(self, instance: TagVersion, validated_data):
    """
    Update a tag version, its tag and optionally its place in a structure

    The keys ``structure_unit``, ``parent``, ``structure``, ``notes``,
    ``identifiers``, ``information_package``, ``appraisal_date`` and ``index``
    are removed from ``validated_data``; whatever remains is written straight
    onto the ``TagVersion`` row. Afterwards the matching document
    (``Component`` or ``File``) is rebuilt from the instance and saved.

    :param instance: The tag version to update
    :type instance: TagVersion
    :param validated_data: The validated serializer data (mutated in place)
    :type validated_data: dict
    :return: The refreshed tag version
    :rtype: TagVersion
    """

    structure_unit = validated_data.pop('structure_unit', None)
    parent = validated_data.pop('parent', None)
    structure = validated_data.pop('structure', None)
    notes_data = validated_data.pop('notes', None)
    identifiers_data = validated_data.pop('identifiers', None)
    # Fall back to the current values when no new ones are supplied.
    information_package = validated_data.pop('information_package', instance.tag.information_package)
    appraisal_date = validated_data.pop('appraisal_date', instance.tag.appraisal_date)
    # 'index' is discarded so it is not passed on to the queryset update below.
    validated_data.pop('index', None)

    self.update_identifiers(instance, identifiers_data)
    self.update_notes(instance, notes_data)

    if structure is not None:
        tag = instance.tag

        if structure_unit is not None:
            # A structure unit takes precedence over an explicit parent: the
            # tag is attached below the root of the structure's first tag
            # structure.
            parent = structure.tagstructure_set.first().get_root()
        elif parent is not None:
            # Swap the parent for its representation inside this structure.
            parent = parent.get_structures(structure).get()

        if parent or structure_unit:
            TagStructure.objects.update_or_create(tag=tag, structure=structure, defaults={
                'parent': parent,
                'structure_unit': structure_unit,
            })

    instance.tag.information_package = information_package
    instance.tag.appraisal_date = appraisal_date
    instance.tag.save()

    TagVersion.objects.filter(pk=instance.pk).update(**validated_data)
    instance.refresh_from_db()

    # Only these index names have a document class to rebuild here. Any other
    # value used to fail with UnboundLocalError *after* the database had
    # already been updated; such tag versions are now left un-reindexed.
    doc_classes = {'component': Component, 'document': File}
    doc_class = doc_classes.get(instance.elastic_index)
    if doc_class is not None:
        doc_class.from_obj(instance).save()

    return instance
def index_path(ip, path, parent=None):
    """
    Indexes the file or directory at path to elasticsearch

    A new ``Tag`` and ``TagVersion`` are created for the path. When a parent
    is given, the tag is also placed under it in the parent's structure.

    :param ip: The IP the path belongs to
    :type ip: InformationPackage
    :param path: The path of the file or directory
    :type path: str
    :param parent: The parent of the tag
    :type parent: TagStructure
    :return: The indexed elasticsearch document
    :rtype: File or Directory
    """

    isfile = os.path.isfile(path)
    version_id = str(uuid.uuid4())

    tag = Tag.objects.create(information_package=ip)
    tag_version = TagVersion(pk=version_id, tag=tag, name=os.path.basename(path))
    if parent:
        TagStructure.objects.create(tag=tag, parent=parent, structure=parent.structure)

    logger.debug('indexing {}'.format(path))

    # Anything that is not a regular file is treated as a directory. The same
    # label is used for the index name and the tag version type name.
    if isfile:
        kind, indexer = 'document', index_document
    else:
        kind, indexer = 'directory', index_directory

    tag_version.elastic_index = kind
    # TODO: minimize db queries
    tag_version.type = TagVersionType.objects.get_or_create(name=kind, archive_type=False)[0]

    # The indexer hands back the tag version it worked on; that one is saved.
    doc, tag_version = indexer(tag_version, path)
    tag_version.save()

    # Previously the function fell off the end and returned None even though
    # the documented contract is to return the indexed document.
    return doc
def parse_volym(cls, el, archive_version, parent_tag_structure, structure_unit, agent, task=None, ip=None):
    """
    Build the tag objects and component document for a "volym" element

    None of the created model instances are saved here.

    :param el: The element to parse; must contain ``va:volnr`` and ``va:utseende``
    :param archive_version: Passed as ``archive`` when building the document
    :param parent_tag_structure: The tag structure the volume is placed under
    :type parent_tag_structure: TagStructure
    :param structure_unit: The structure unit the volume belongs to
    :param agent: The agent linked to the volume
    :param task: Optional task to associate with the tag
    :param ip: Optional information package to associate with the tag
    :return: ``(document dict, tag, tag_version, tag_structure, agent_tag_link)``
    :rtype: tuple
    :raises IndexError: if ``va:volnr`` or ``va:utseende`` is missing
    """

    def first_text(expression):
        # Text of the first node matching the expression in the class namespaces.
        return el.xpath(expression, namespaces=cls.NSMAP)[0].text

    logger.debug("Parsing volym...")

    reference_code = first_text("va:volnr")
    volume_name = first_text("va:utseende")
    volume_type = cls.VOLUME_TYPE
    volume_pk = uuid.uuid4()

    tag = Tag(information_package=ip, task=task)

    tag_version = TagVersion(
        pk=volume_pk,
        tag=tag,
        elastic_index='component',
        reference_code=reference_code,
        name=volume_name,
        create_date=cls.parse_volume_create_date(el),
        revise_date=cls.parse_volume_revise_date(el),
        import_date=timezone.now(),
        type=volume_type,
    )

    # lft/rght/level are placeholder zeros here.
    tag_structure = TagStructure(
        tag=tag,
        structure_unit=structure_unit,
        structure=parent_tag_structure.structure,
        parent=parent_tag_structure,
        tree_id=parent_tag_structure.tree_id,
        lft=0,
        rght=0,
        level=0,
    )

    agent_tag_link = AgentTagLink(
        agent=agent,
        tag_id=tag_version.id,
        type=cls.AGENT_TAG_LINK_RELATION_TYPE,
    )

    component_doc = Component.from_obj(tag_version, archive=archive_version)
    component_doc.agents = [str(agent.pk)]

    logger.debug("Parsed volym: {}".format(tag_version.pk))

    return (
        component_doc.to_dict(include_meta=True),
        tag,
        tag_version,
        tag_structure,
        agent_tag_link,
    )
def parse_acts(self, ip, rootdir, errand, acts_root, parent):
    """
    Generate tag data for every act below acts_root and for its attachments

    For each ``ArkivobjektHandling`` child, one item per ``Bilaga`` child
    (as returned by ``parse_document``) is yielded first, followed by the
    item for the act itself.

    :param ip: The information package the tags belong to
    :param rootdir: Root directory forwarded to ``parse_document``
    :param errand: The parsed errand the acts belong to, passed to ``parse_act``
    :param acts_root: The element whose children are searched for acts
    :param parent: The tag structure the acts are placed under
    :type parent: TagStructure
    :return: Generator of ``(tag, tag_version, tag_structure, document dict)`` tuples
    """

    act_elements = acts_root.xpath("*[local-name()='ArkivobjektHandling']")

    for act_element in act_elements:
        act = self.parse_act(act_element, errand)

        # Unsaved model instances; lft/rght/level are placeholder zeros here.
        tag = Tag(information_package=ip, task=self.task)
        tag_version = TagVersion(
            pk=act.meta.id,
            tag=tag,
            elastic_index=act._index._name,
            name=act.name,
            type=act.type,
            reference_code=act.reference_code,
        )
        tag_repr = TagStructure(
            tag=tag,
            parent=parent,
            structure=parent.structure,
            tree_id=parent.tree_id,
            lft=0,
            rght=0,
            level=0,
        )

        # Attachments use the act's tag structure as their parent.
        attachments = act_element.xpath("*[local-name()='Bilaga']")
        for attachment in attachments:
            yield self.parse_document(ip, rootdir, attachment, act, tag_repr)

        yield tag, tag_version, tag_repr, act.to_dict(include_meta=True)