def children(self, quick=False):
    """Returns list of the Collection's Entity objects.

    >>> c = Collection.from_json('/tmp/ddr-testing-123')
    >>> c.children()
    [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]

    TODO use util.find_meta_files()

    @param quick: Boolean List only titles and IDs
    @param dicts: Boolean List only titles and IDs (dicts)
    @returns: list of Entities or ListEntity
    """
    entity_paths = []
    if os.path.exists(self.files_path):
        # TODO use cached list if available
        for eid in os.listdir(self.files_path):
            path = os.path.join(self.files_path, eid)
            entity_paths.append(path)
    entity_paths = util.natural_sort(entity_paths)
    entities = []
    for path in entity_paths:
        if quick:
            # fake Entity with just enough info for lists
            entity_json_path = os.path.join(path, 'entity.json')
            if os.path.exists(entity_json_path):
                e = ListEntity()
                e.identifier = Identifier(path=path)
                e.id = e.identifier.id
                for line in fileio.read_text(entity_json_path).split('\n'):
                    if '"title":' in line:
                        e.title = json.loads('{%s}' % line)['title']
                    elif '"signature_id":' in line:
                        e.signature_id = json.loads('{%s}' % line)['signature_id']
                        e.signature_abs = common.signature_abs(
                            e, self.identifier.basepath)
                    if e.title and e.signature_id:
                        # stop once we have what we need so we don't waste time
                        # reading entity.children, which are separate ghost entities
                        break
                entities.append(e)
        else:
            entity = Entity.from_identifier(Identifier(path=path))
            for lv in entity.labels_values():
                if lv['label'] == 'title':
                    entity.title = lv['value']
            entities.append(entity)
    return entities
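# Why the line-scanning trick in children(quick=True) works: DDR .json
# documents are a list of single-key objects (see _publishable below, which
# iterates 'for field in document'), so a line containing '"title":' carries
# a complete key/value pair with no trailing comma, and wrapping it in braces
# yields parseable JSON.  A self-contained illustration:
import json
line = '"title": "Example entity title"'   # a single line from entity.json
print(json.loads('{%s}' % line)['title'])  # -> 'Example entity title'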
def delete(self, document_id, recursive=False):
    """Delete a document and optionally its children.

    @param document_id:
    @param recursive: True or False
    """
    identifier = Identifier(id=document_id)
    if recursive:
        if identifier.model == 'collection':
            doc_type = 'collection,entity,file'
        elif identifier.model == 'entity':
            doc_type = 'entity,file'
        elif identifier.model == 'file':
            doc_type = 'file'
        query = 'id:"%s"' % identifier.id
        try:
            return self.es.delete_by_query(
                index=self.indexname, doc_type=doc_type, q=query)
        except TransportError:
            pass
    else:
        try:
            return self.es.delete(
                index=self.indexname, doc_type=identifier.model, id=identifier.id)
        except TransportError:
            pass
def file_name(entity, path_abs, role, sha1=None):
    """Generate a new name for the specified file; use only when ingesting a file!

    rename files to standard names on ingest:
    %{entity_id%}-%{role}-%{sha1}.%{ext}
    example: ddr-testing-56-101-master-fb73f9de29.jpg

    SHA1 is optional so it can be passed in by a calling process that has
    already generated it.

    @param entity
    @param path_abs: Absolute path to the file.
    @param role
    @param sha1: SHA1 hash (optional)
    """
    if os.path.exists(path_abs) and os.access(path_abs, os.R_OK):
        ext = os.path.splitext(path_abs)[1]
        if not sha1:
            sha1 = util.file_hash(path_abs, 'sha1')
        if sha1:
            idparts = [a for a in entity.idparts]
            idparts.append(role)
            idparts.append(sha1[:10])
            name = '{}{}'.format(Identifier(parts=idparts).id, ext)
            return name
    return None
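# Hypothetical usage sketch for file_name(): 'entity' stands for an
# already-loaded DDR Entity whose idparts are ['ddr','testing','56','101'];
# the path and hash below are invented.  Passing sha1 lets a caller that has
# already hashed the file skip re-hashing it.
sha1 = 'fb73f9de29f49d509c32dd14f33a440b8132e4be'  # made-up hash
name = file_name(entity, '/tmp/scan0001.jpg', 'master', sha1=sha1)
# -> 'ddr-testing-56-101-master-fb73f9de29.jpg'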
def check_file(json_path, verbose=False):
    fi = Identifier(json_path)
    f = models.File.from_identifier(fi)
    if not os.path.exists(f.path_abs):
        result = ['missing', f.path_abs]
        print(result)
        return result
    mismatches = []
    md5 = util.file_hash(f.path_abs, 'md5')
    if not (md5 == f.md5):
        mismatches.append('md5')
    sha1 = util.file_hash(f.path_abs, 'sha1')
    if not (sha1 == f.sha1):
        mismatches.append('sha1')
    sha256 = util.file_hash(f.path_abs, 'sha256')
    if not (sha256 == f.sha256):
        mismatches.append('sha256')
    # SHA256 hash from the git-annex filename
    annex_sha256 = os.path.basename(
        os.path.realpath(f.path_abs)
    ).split('--')[1]
    if not (sha256 == annex_sha256):
        mismatches.append('annex_sha256')
    if mismatches:
        mismatches.append(json_path)
        print(mismatches)
    return mismatches
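# The annex check above relies on git-annex's key naming: an annexed file is
# a symlink whose target's basename embeds the content hash after a '--'
# separator.  A self-contained illustration with a made-up key (note that for
# SHA256E-backend keys the file extension follows the hash, so the comparison
# above assumes plain SHA256-style keys):
import os
annex_target = 'SHA256-s12345--9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'
annex_sha256 = os.path.basename(annex_target).split('--')[1]
print(annex_sha256)  # -> '9f86d081...f00a08'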
def ddrpublic_template_key(self):
    """Combine factors for ddrpublic template selection into key

    For use in ddrindex publish to Elasticsearch.
    Generates a key which ddr-public will use to choose a template.
    Finds Entity's signature file, or the first mezzanine file,
    or the Entity's first child's first mezzanine file, etc, etc
    Matches Entity format and file mimetype to template

    @returns: signature,key
    """
    entity = self
    try:
        signature = Identifier(entity.signature_id, config.MEDIA_BASE).object()
    except:
        signature = None
    # VH entities may not have a valid signature
    if not signature:

        def first_mezzanine(entity):
            for fg in entity.file_groups:
                if fg['role'] == 'mezzanine':
                    files = sorted(fg['files'], key=lambda file: file['sort'])
                    if files:
                        return files[0]
            return None

        # use child entity if exists and has mezzanine file
        if entity.children_meta:
            for c in entity.children_meta:
                e = Identifier(c['id'], config.MEDIA_BASE).object()
                if first_mezzanine(e):
                    entity = e
                    break
        # get signature image
        mezzanine = first_mezzanine(entity)
        if mezzanine:
            signature = Identifier(mezzanine['id'], config.MEDIA_BASE).object()
    # prepare decision table key
    key = None
    if signature:
        key = ':'.join([
            entity.format,
            signature.mimetype.split('/')[0]
        ])
    return signature,key
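# The decision-table key is '<entity format>:<major mimetype>'.  A minimal
# self-contained illustration (the 'vh' format value is invented):
entity_format = 'vh'    # entity.format
mimetype = 'video/mp4'  # signature file's mimetype
print(':'.join([entity_format, mimetype.split('/')[0]]))  # -> 'vh:video'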
def children(self, quick=False):
    """Returns list of the Collection's Entity objects.

    >>> c = Collection.from_json('/tmp/ddr-testing-123')
    >>> c.children()
    [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]

    TODO use util.find_meta_files()

    @param quick: Boolean List only titles and IDs
    @param dicts: Boolean List only titles and IDs (dicts)
    @returns: list of Entities or ListEntity
    """
    entity_paths = []
    if os.path.exists(self.files_path):
        # TODO use cached list if available
        for eid in os.listdir(self.files_path):
            path = os.path.join(self.files_path, eid)
            entity_paths.append(path)
    entity_paths = util.natural_sort(entity_paths)
    entities = []
    for path in entity_paths:
        if quick:
            # fake Entity with just enough info for lists
            entity_json_path = os.path.join(path, 'entity.json')
            if os.path.exists(entity_json_path):
                with open(entity_json_path, 'r') as f:
                    data = json.loads(f.read())
                e = ListEntity()
                e.identifier = Identifier(path=path)
                e.id = e.identifier.id
                for line in data[1:]:
                    if 'title' in list(line.keys()):
                        e.title = line['title']
                    elif 'signature_id' in list(line.keys()):
                        e.signature_id = line['signature_id']
                        e.signature_abs = common.signature_abs(
                            e, self.identifier.basepath)
                entities.append(e)
        else:
            entity = Entity.from_identifier(Identifier(path=path))
            for lv in entity.labels_values():
                if lv['label'] == 'title':
                    entity.title = lv['value']
            entities.append(entity)
    return entities
def get_role(f):
    if isinstance(f, File):
        return getattr(f, 'role')
    elif isinstance(f, dict) and f.get('role'):
        return f.get('role')
    elif isinstance(f, dict) and f.get('path_rel'):
        fid = os.path.basename(os.path.splitext(f['path_rel'])[0])
        fi = Identifier(id=fid)
        return fi.idparts['role']
    return None
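# The path_rel fallback in get_role() works because DDR file JSON paths end
# in the file ID itself; the role is one of the ID components.  The stdlib
# part can be checked in isolation (the path below is invented):
import os
path_rel = 'files/ddr-testing-123-45-master-a1b2c3d4e5.json'
fid = os.path.basename(os.path.splitext(path_rel)[0])
print(fid)  # -> 'ddr-testing-123-45-master-a1b2c3d4e5'
# Identifier(id=fid) then exposes 'master' as fi.idparts['role'].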
def create(path_abs, identifier=None):
    """Creates a new Collection with initial values from module.FIELDS.

    @param path_abs: str Absolute path; must end in valid DDR id.
    @param identifier: [optional] Identifier
    @returns: Collection object
    """
    if not identifier:
        identifier = Identifier(path=path_abs)
    return common.create_object(identifier)
def _repo_org(self, path, doctype, remove=False):
    """
    seealso DDR.models.common.DDRObject.to_esobject
    """
    # get and validate file
    data = load_json(path)
    if (not (data.get('id') and data.get('repo'))):
        raise Exception('Data file is not well-formed.')
    oi = Identifier(id=data['id'])

    d = OrderedDict()
    d['id'] = oi.id
    d['model'] = oi.model
    d['parent_id'] = oi.parent_id(stubs=1)
    # links
    d['links_html'] = oi.id
    d['links_json'] = oi.id
    d['links_img'] = '%s/logo.png' % oi.id
    d['links_thumb'] = '%s/logo.png' % oi.id
    d['links_parent'] = oi.parent_id(stubs=1)
    d['links_children'] = oi.id
    # title,description
    d['title'] = data['title']
    d['description'] = data['description']
    d['url'] = data['url']
    # ID components (repo, org, cid, ...) as separate fields
    idparts = deepcopy(oi.idparts)
    idparts.pop('model')
    for k in ID_COMPONENTS:
        d[k] = ''  # ensure all fields present
    for k, v in idparts.items():
        d[k] = v
    # add/update
    if remove and self.exists(doctype, oi):
        results = self.es.delete(index=self.indexname, doc_type=doctype, id=oi.id)
    else:
        results = self.es.index(index=self.indexname, doc_type=doctype, id=oi.id, body=d)
    return results
def _children_paths(self):
    """Searches fs for (entity) children's .jsons, returns natsorted paths

    @returns: list
    """
    if os.path.exists(self.files_path):
        return natsorted([
            f for f in util.find_meta_files(self.files_path, recursive=True)
            # only direct children, no descendants
            if Identifier(f).parent_id() == self.id
        ])
    return []
def delete(self, document_id, recursive=False):
    """Delete a document and optionally its children.

    TODO refactor after upgrading Elasticsearch past 2.4.
    delete_by_query was removed sometime during elasticsearch-py 2.*.
    I think it was added back in a later version so the code stays for now.
    For now, instead of deleting based on document_id, we start with
    document_id, find all paths beneath it in the filesystem, and issue a
    DELETE for each individual document in Elasticsearch.

    @param document_id:
    @param recursive: True or False
    """
    logger.debug('delete(%s, %s)' % (document_id, recursive))
    oi = Identifier(document_id, config.MEDIA_BASE)
    if recursive:
        paths = util.find_meta_files(
            oi.path_abs(), recursive=recursive, files_first=1)
    else:
        paths = [oi.path_abs()]
    identifiers = [Identifier(path) for path in paths]
    num = len(identifiers)
    for n, oi in enumerate(identifiers):
        # TODO hard-coded models here!
        if oi.model == 'segment':
            model = 'entity'
        else:
            model = oi.model
        try:
            result = self.es.delete(index=self.index_name(model), id=oi.id)
            print(
                f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> {result["result"]}'
            )
        except docstore.NotFoundError:
            print(
                f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> 404 Not Found'
            )
def new(identifier, git_name, git_mail, agent='cmdln'):
    """Creates new File (metadata only!), writes to filesystem, performs initial commit

    @param identifier: Identifier
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @returns: exit,status int,str
    """
    parent = identifier.parent().object()
    if not parent:
        raise Exception('Parent for %s does not exist.' % identifier)
    file_ = File.create(identifier)
    file_.write_json()
    # NOTE: request, collection, and entity are not defined locally;
    # they appear to come from the calling scope.
    entity_file_edit(request, collection, file_, git_name, git_mail)
    exit,status = commands.entity_create(
        git_name, git_mail,
        collection, entity.identifier,
        [collection.json_path_rel, collection.ead_path_rel],
        [config.TEMPLATE_EJSON, config.TEMPLATE_METS],
        agent=agent
    )
    if exit:
        raise Exception('Could not create new Entity: %s, %s' % (exit, status))
    # load Entity object, inherit values from parent, write back to file
    entity = Identifier(identifier).object()
    entity.inherit(collection)
    entity.write_json()
    updated_files = [entity.json_path]
    exit,status = commands.entity_update(
        git_name, git_mail,
        collection, entity,
        updated_files,
        agent=agent
    )
    return exit,status
def _repo_org(self, path, doctype, remove=False):
    """
    seealso DDR.models.common.DDRObject.to_esobject
    """
    # get and validate file
    data = load_json(path)
    if (not (data.get('id') and data.get('repo'))):
        raise Exception('Data file is not well-formed.')
    oi = Identifier(id=data['id'])

    ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[doctype]
    d = ES_Class(id=oi.id)
    d.meta.id = oi.id
    d.model = oi.model
    d.parent_id = oi.parent_id(stubs=1)
    # links
    d.links_html = oi.id
    d.links_json = oi.id
    d.links_img = '%s/logo.png' % oi.id
    d.links_thumb = '%s/logo.png' % oi.id
    d.links_parent = oi.parent_id(stubs=1)
    d.links_children = oi.id
    # title,description
    d.title = data['title']
    d.description = data['description']
    d.url = data['url']
    # ID components (repo, org, cid, ...) as separate fields
    idparts = deepcopy(oi.idparts)
    idparts.pop('model')
    for key, val in idparts.items():
        setattr(d, key, val)
    # add/update
    if remove and self.exists(doctype, oi):
        results = d.delete(index=self.index_name(doctype), using=self.es)
    else:
        results = d.save(index=self.index_name(doctype), using=self.es)
    return results
def signature_abs(obj, basepath):
    """Absolute path to signature image file, if signature_id present.

    Expects obj.signature_id to be either a valid file ID or a special
    interview signature image (ex. "denshovh-aart-03", "denshovh-hlarry_g-02").

    @returns: str absolute path to signature img, or None
    """
    if isinstance(obj, dict):
        sid = obj.get('signature_id')
    else:
        sid = getattr(obj, 'signature_id', None)
    # ignore interview signature ID
    if sid and INTERVIEW_SIG_REGEX.match(sid):
        return None
    if sid:
        try:
            oi = Identifier(sid, basepath)
        except:
            oi = None
        if oi and oi.model == 'file':
            return oi.path_abs('access')
    return None
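# INTERVIEW_SIG_REGEX is defined elsewhere in the module.  A plausible
# stand-in, inferred only from the docstring examples (the real pattern may
# differ):
import re
INTERVIEW_SIG_REGEX = re.compile(r'^denshovh-[a-z_]+-\d+$')
assert INTERVIEW_SIG_REGEX.match('denshovh-aart-03')
assert INTERVIEW_SIG_REGEX.match('denshovh-hlarry_g-02')
assert not INTERVIEW_SIG_REGEX.match('ddr-densho-1000-1-1')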
def identifiers(self, model=None, force_read=False):
    """Lists Identifiers for all or subset of Collection's descendants.

    TODO how is this different from children?

    >>> c = Collection.from_json('/tmp/ddr-testing-123')
    >>> c.identifiers()
    [<Identifier ddr-testing-123-1>, <Identifier ddr-testing-123-2>, ...]

    @param model: str Restrict list to model.
    @returns: list of Identifiers
    """
    return [
        Identifier(path)
        for path in util.find_meta_files(
            self.path, recursive=1, model=model, force_read=force_read)
    ]
def _publishable(paths, parents, force=False):
    """Determines which paths represent publishable paths and which do not.

    @param paths
    @param parents
    @param force: boolean Just publish the damn collection already.
    @returns list of dicts, e.g. [{'path':'/PATH/TO/OBJECT', 'action':'publish'}]
    """
    path_dicts = []
    for path in paths:
        d = {
            'path': path,
            'identifier': Identifier(path=path),
            'action': 'UNSPECIFIED',
            'note': '',
        }
        if force:
            d['action'] = 'POST'
            path_dicts.append(d)
            continue
        # see if item incomplete or nonpublic
        # see if item's parents are incomplete or nonpublic
        # TODO Bad! Bad! Generalize this...
        UNPUBLISHABLE = []
        for parent_id in _file_parent_ids(d['identifier']):
            parent = parents.get(parent_id, {})
            for x in parent.values():
                if (x not in STATUS_OK) and (x not in PUBLIC_OK):
                    if parent_id not in UNPUBLISHABLE:
                        UNPUBLISHABLE.append(parent_id)
        if UNPUBLISHABLE:
            d['action'] = 'SKIP'
            d['note'] = 'parent unpublishable'
            path_dicts.append(d)
            continue
        if path and d['identifier'].model:
            d['action'] = 'POST'
            path_dicts.append(d)
    return path_dicts
def child_field_values(self, model, fieldname):
    """Get all values of fieldname from specified model in collection.

    @param model str
    @param fieldname str
    """
    rows = []
    paths = util.find_meta_files(self.path_abs, model=model, recursive=True)
    for path in paths:
        o = Identifier(path).object()
        if getattr(o, fieldname):
            rows.append([
                o.id,
                fieldname,
                getattr(o, fieldname),
            ])
    return rows
def ddrpublic_template_key(self):
    """Combine factors for ddrpublic template selection into key

    For use in ddrindex publish to Elasticsearch.
    Generates a key which ddr-public will use to choose a template.
    Finds Entity's signature file, or the first mezzanine file,
    or the Entity's first child's first mezzanine file, etc, etc
    Matches Entity format and file mimetype to template

    @returns: signature,key
    """
    entity = self
    try:
        signature = Identifier(entity.signature_id, config.MEDIA_BASE).object()
    except:
        signature = None
    # VH entities may not have a valid signature
    if not signature:

        def first_mezzanine(entity):
            for f in entity.children(role='mezzanine'):
                return f
            return None

        # use child entity if exists and has mezzanine file
        if entity.children(models=['entity', 'segment']):
            for c in entity.children(models=['entity', 'segment']):
                if first_mezzanine(c):
                    entity = c
                    break
        # get signature image
        signature = first_mezzanine(entity)
    # prepare decision table key
    key = None
    if signature:
        key = ':'.join([
            getattr(entity, 'format', ''),
            signature.mimetype.split('/')[0]
        ])
    return signature, key
def sort_file_paths(json_paths, rank='role-eid-sort'):
    """Sort file JSON paths in human-friendly order.

    TODO this belongs in DDR.identifier

    @param json_paths:
    @param rank: 'role-eid-sort' or 'eid-sort-role'
    """
    paths = {}
    keys = []
    while json_paths:
        path = json_paths.pop()
        identifier = Identifier(path=path)
        eid = identifier.parts.get('eid', None)
        role = identifier.parts.get('role', None)
        sha1 = identifier.parts.get('sha1', None)
        sort = 0
        with open(path, 'r') as f:
            for line in f.readlines():
                if 'sort' in line:
                    sort = line.split(':')[1].replace('"', '').strip()
        eid = str(eid)
        sha1 = str(sha1)
        sort = str(sort)
        if rank == 'eid-sort-role':
            key = '-'.join([str(eid), sort, role, sha1])
        elif rank == 'role-eid-sort':
            key = '-'.join([role, eid, sort, sha1])
        paths[key] = path
        keys.append(key)
    keys_sorted = [key for key in util.natural_sort(keys)]
    paths_sorted = []
    while keys_sorted:
        val = paths.pop(keys_sorted.pop(), None)
        if val:
            paths_sorted.append(val)
    return paths_sorted
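# util.natural_sort is the project's helper; a minimal stand-in shows why
# natural ordering matters for these hyphen-joined keys: digit runs compare
# as numbers, so 'master-2-...' sorts before 'master-10-...'.
import re

def _natural_key(s):
    # split digit runs out of the string and compare them numerically
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', s)]

keys = ['master-10-0-abc', 'master-2-0-def', 'master-1-0-ghi']
print(sorted(keys, key=_natural_key))
# -> ['master-1-0-ghi', 'master-2-0-def', 'master-10-0-abc']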
def children(self, models=None, role=None, quick=None, force_read=False):
    """List Entity's child objects,files; optionally regenerate list

    @param models: list Restrict to specified model(s)
    @param role: str Restrict list to specified File role
    @param quick: bool Not used
    @param force_read: bool Scan entity dir for file jsons
    @returns: list of File objects, sorted
    """
    if force_read or not self._children_objects:
        # read objects from filesystem
        self._children_objects = _sort_children(
            [Identifier(path).object() for path in self._children_paths()])
    if models:
        return [
            o for o in self._children_objects
            if o.identifier.model in models
        ]
    elif role:
        return [
            o for o in self._children_objects
            if hasattr(o, 'role') and (o.role == role)
        ]
    return self._children_objects
def new(identifier, git_name, git_mail, agent='cmdln'):
    """Creates new File (metadata only!), writes to filesystem, performs initial commit

    @param identifier: Identifier
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @returns: exit,status int,str
    """
    parent = identifier.parent().object()
    if not parent:
        raise Exception('Parent for %s does not exist.' % identifier)
    file_ = File.create(identifier.path_abs(), identifier)
    file_.write_json()
    # NOTE: request, collection, and entity are not defined locally;
    # they appear to come from the calling scope.
    entity_file_edit(request, collection, file_, git_name, git_mail)
    exit, status = commands.entity_create(
        git_name, git_mail,
        collection, entity.identifier,
        [collection.json_path_rel, collection.ead_path_rel],
        [config.TEMPLATE_EJSON, config.TEMPLATE_METS],
        agent=agent)
    if exit:
        raise Exception('Could not create new Entity: %s, %s' % (exit, status))
    # load Entity object, inherit values from parent, write back to file
    entity = Identifier(identifier).object()
    entity.inherit(collection)
    entity.write_json()
    updated_files = [entity.json_path]
    exit, status = commands.entity_update(
        git_name, git_mail,
        collection, entity,
        updated_files,
        agent=agent)
    return exit, status
def index(hosts, index, path, recursive=False, public=True):
    """(Re)index with data from the specified directory.

    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that can
    not be published (e.g. object or its parent is unpublished).

    The second pass goes through the files and assigns a signature file to
    each entity or collection ID.  There is some logic that tries to pick the
    first file of the first entity to be the collection signature, and so on.
    Mezzanine files are preferred over master files.

    In the final pass, a list of public/publishable fields is chosen based on
    the model.  Additional fields not in the model (e.g. parent ID, parent
    organization/collection/entity ID, the signature file) are packaged.
    Then everything is sent off to post().

    @param hosts: list of dicts containing host information.
    @param index: Name of the target index.
    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param public: For publication (fields not marked public will be omitted).
    @returns: number successful,list of paths that didn't work out
    """
    logger.debug('index(%s, %s, %s)' % (hosts, index, path))

    publicfields = public_fields()

    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        paths = util.find_meta_files(path, recursive, files_first=1)

    # Store value of public,status for each collection,entity.
    # Values will be used by entities and files to inherit these values
    # from their parent.
    parents = _parents_status(paths)

    # Determine if paths are publishable or not
    successful_paths,bad_paths = _publishable_or_not(paths, parents)

    # iterate through paths, storing signature_url for each collection, entity
    # paths listed files first, then entities, then collections
    signature_files = _choose_signatures(successful_paths)
    print('Signature files')
    for key in sorted(signature_files.keys()):
        print(key, signature_files[key])

    successful = 0
    for path in successful_paths:
        identifier = Identifier(path=path)
        parent_id = identifier.parent_id()

        document_pub_fields = []
        if public and identifier.model:
            document_pub_fields = publicfields[identifier.model]

        additional_fields = {'parent_id': parent_id}
        if identifier.model == 'collection':
            additional_fields['organization_id'] = parent_id
        if identifier.model == 'entity':
            additional_fields['collection_id'] = parent_id
        if identifier.model == 'file':
            additional_fields['entity_id'] = parent_id
        if identifier.model in ['collection', 'entity']:
            additional_fields['signature_file'] = signature_files.get(identifier.id, '')

        # HERE WE GO!
        document = load_document_json(path, identifier.model, identifier.id)
        try:
            existing = get(hosts, index, identifier.model, identifier.id, fields=[])
        except:
            existing = None
        result = post(hosts, index, document, document_pub_fields, additional_fields)
        # success: created, or version number incremented
        if result.get('_id', None):
            if existing:
                existing_version = existing.get('version', None)
                if not existing_version:
                    existing_version = existing.get('_version', None)
            else:
                existing_version = None
            result_version = result.get('version', None)
            if not result_version:
                result_version = result.get('_version', None)
            if result['created'] or (existing_version and (result_version > existing_version)):
                successful += 1
        else:
            bad_paths.append((path, result['status'], result['response']))
    logger.debug('INDEXING COMPLETED')
    return {'total':len(paths), 'successful':successful, 'bad':bad_paths}
def identifier_from_path(path: Path) -> Identifier:
    return Identifier(oid_from_path(path))
def parent(self):
    i = Identifier(id=self.parent_id, base_path=self.identifier.basepath)
    return i.object()
def __init__(self, path_abs, id=None, identifier=None):
    """
    >>> c = Collection('/tmp/ddr-testing-123')
    >>> c.id
    'ddr-testing-123'
    >>> c.ead_path_rel
    'ead.xml'
    >>> c.ead_path
    '/tmp/ddr-testing-123/ead.xml'
    >>> c.json_path_rel
    'collection.json'
    >>> c.json_path
    '/tmp/ddr-testing-123/collection.json'
    """
    path_abs = os.path.normpath(path_abs)
    if identifier:
        i = identifier
    else:
        i = Identifier(path=path_abs)
    self.identifier = i
    self.id = i.id
    self.idparts = i.parts.values()
    self.path_abs = path_abs
    self.path = path_abs
    self.root = os.path.split(self.path)[0]
    self.json_path = i.path_abs('json')
    self.git_path = i.path_abs('git')
    self.gitignore_path = i.path_abs('gitignore')
    self.annex_path = i.path_abs('annex')
    self.changelog_path = i.path_abs('changelog')
    self.control_path = i.path_abs('control')
    self.ead_path = i.path_abs('ead')
    self.lock_path = i.path_abs('lock')
    self.files_path = i.path_abs('files')
    self.path_rel = i.path_rel()
    self.json_path_rel = i.path_rel('json')
    self.git_path_rel = i.path_rel('git')
    self.gitignore_path_rel = i.path_rel('gitignore')
    self.annex_path_rel = i.path_rel('annex')
    self.changelog_path_rel = i.path_rel('changelog')
    self.control_path_rel = i.path_rel('control')
    self.ead_path_rel = i.path_rel('ead')
    self.files_path_rel = i.path_rel('files')
    self.git_url = '{}:{}'.format(config.GITOLITE, self.id)
def _publishable(paths, parents, force=False):
    """Determines which paths represent publishable paths and which do not.

    @param paths
    @param parents
    @param force: boolean Just publish the damn collection already.
    @returns list of dicts, e.g. [{'path':'/PATH/TO/OBJECT', 'action':'publish'}]
    """
    path_dicts = []
    for path in paths:
        d = {
            'path': path,
            'identifier': Identifier(path=path),
            'action': 'UNSPECIFIED',
            'note': '',
        }
        if force:
            d['action'] = 'POST'
            path_dicts.append(d)
            continue
        # see if item's parents are incomplete or nonpublic
        # TODO Bad! Bad! Generalize this...
        UNPUBLISHABLE = []
        for parent_id in _file_parent_ids(d['identifier']):
            parent = parents.get(parent_id, {})
            for x in parent.values():
                if (x not in STATUS_OK) and (x not in PUBLIC_OK):
                    if parent_id not in UNPUBLISHABLE:
                        UNPUBLISHABLE.append(parent_id)
        if UNPUBLISHABLE:
            d['action'] = 'SKIP'
            d['note'] = 'parent unpublishable'
            path_dicts.append(d)
            continue
        # see if item itself is incomplete or nonpublic
        # TODO knows way too much about JSON data format
        public = None
        status = None
        jsonpath = d['identifier'].path_abs('json')
        document = load_json(jsonpath)
        for field in document:
            for k, v in field.items():
                if k == 'public':
                    public = v
                if k == 'status':
                    status = v
        if public and (public not in PUBLIC_OK):
            d['action'] = 'SKIP'
            d['note'] = 'not public'
            path_dicts.append(d)
            continue
        elif status and (status not in STATUS_OK):
            d['action'] = 'SKIP'
            d['note'] = 'status'
            path_dicts.append(d)
            continue
        if path and d['identifier'].model:
            d['action'] = 'POST'
            path_dicts.append(d)
    return path_dicts
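# Hypothetical _publishable() output for one publishable collection and one
# entity under a nonpublic parent (paths and IDs are invented); callers POST
# the 'POST' rows and report the 'SKIP' rows with their notes:
path_dicts = [
    {'path': '/var/ddr/ddr-testing-123/collection.json',
     'identifier': '<Identifier ddr-testing-123>',
     'action': 'POST', 'note': ''},
    {'path': '/var/ddr/ddr-testing-123/files/ddr-testing-123-1/entity.json',
     'identifier': '<Identifier ddr-testing-123-1>',
     'action': 'SKIP', 'note': 'parent unpublishable'},
]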
def __init__(self, path_abs, id=None, identifier=None):
    path_abs = os.path.normpath(path_abs)
    if identifier:
        i = identifier
    else:
        i = Identifier(path=path_abs)
    self.identifier = i
    self.id = i.id
    self.idparts = i.parts.values()
    self.collection_id = i.collection_id()
    self.parent_id = i.parent_id()
    self.path_abs = path_abs
    self.path = path_abs
    self.collection_path = i.collection_path()
    self.parent_path = i.parent_path()
    self.root = os.path.dirname(self.parent_path)
    self.json_path = i.path_abs('json')
    self.changelog_path = i.path_abs('changelog')
    self.control_path = i.path_abs('control')
    self.mets_path = i.path_abs('mets')
    self.lock_path = i.path_abs('lock')
    self.files_path = i.path_abs('files')
    self.path_rel = i.path_rel()
    self.json_path_rel = i.path_rel('json')
    self.changelog_path_rel = i.path_rel('changelog')
    self.control_path_rel = i.path_rel('control')
    self.mets_path_rel = i.path_rel('mets')
    self.files_path_rel = i.path_rel('files')
def __init__(self, path_abs, id=None, identifier=None):
    path_abs = os.path.normpath(path_abs)
    if identifier:
        i = identifier
    else:
        i = Identifier(path=path_abs)
    self.identifier = i
    self.id = i.id
    self.idparts = list(i.parts.values())
    self.collection_id = i.collection_id()
    self.parent_id = i.parent_id()
    self.path_abs = path_abs
    self.path = path_abs
    self.collection_path = i.collection_path()
    self.parent_path = i.parent_path()
    self.root = os.path.dirname(self.parent_path)
    self.json_path = i.path_abs('json')
    self.changelog_path = i.path_abs('changelog')
    self.control_path = i.path_abs('control')
    self.mets_path = i.path_abs('mets')
    self.lock_path = i.path_abs('lock')
    self.files_path = i.path_abs('files')
    self.path_rel = i.path_rel()
    self.json_path_rel = i.path_rel('json')
    self.changelog_path_rel = i.path_rel('changelog')
    self.control_path_rel = i.path_rel('control')
    self.mets_path_rel = i.path_rel('mets')
    self.files_path_rel = i.path_rel('files')
def post_multi(self, path, recursive=False, force=False, backblaze=None):
    """Publish (index) specified document and (optionally) its children.

    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that can
    not be published (e.g. object or its parent is unpublished).

    In the final pass, a list of public/publishable fields is chosen based
    on the model.  Additional fields not in the model (e.g. parent ID,
    parent organization/collection/entity ID) are packaged.  Then everything
    is sent off to post().

    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param force: boolean Just publish the damn collection already.
    @param backblaze: storage.Backblaze object Look in b2sync tmpdir and mark
        files uploaded to Backblaze.
    @returns: number successful,list of paths that didn't work out
    """
    logger.debug(f'post_multi({path}, {recursive}, {force}, {backblaze})')
    # check that path points at a collection (or something inside one)
    try:
        ci = Identifier(path).collection()
    except:
        raise Exception(
            'Docstore.post_multi path must point to a collection or subdirectory.'
        )
    ci_path = Path(ci.id)

    publicfields = _public_fields()

    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        logger.debug(f'Finding files in {path}')
        paths = util.find_meta_files(path, recursive, files_first=1)

    # Determine if paths are publishable or not
    logger.debug('Checking for publishability')
    identifiers = [Identifier(path) for path in paths]
    parents = {
        oid: oi.object()
        for oid, oi in _all_parents(identifiers).items()
    }
    paths = publishable(identifiers, parents, force=force)

    # list files in b2 bucket
    # TODO do this in parallel with util.find_meta_files?
    b2_files = []
    if backblaze:
        logger.debug(f'Checking Backblaze for uploaded files ({backblaze.bucketname})')
        b2_files = backblaze.list_files(folder=ci.id)
        logger.debug(f'{len(b2_files)} files')

    skipped = 0
    successful = 0
    bad_paths = []
    num = len(paths)
    for n, path in enumerate(paths):
        oi = path.get('identifier')
        if not oi:
            path['note'] = 'No identifier'
            bad_paths.append(path)
            continue
        try:
            document = oi.object()
        except Exception as err:
            path['note'] = f'Could not instantiate: {err}'
            bad_paths.append(path)
            continue
        if not document:
            path['note'] = 'No document'
            bad_paths.append(path)
            continue
        # see if file was uploaded to Backblaze
        b2_synced = False
        b2str = ''
        if (oi.model == 'file') and b2_files:
            dir_filename = str(ci_path / Path(document.path).name)
            if dir_filename in b2_files:
                b2_synced = True
                b2str = '(b2)'
                b2_files.remove(dir_filename)
        # TODO write logs instead of print
        now = datetime.now(config.TZ)
        action = path['action']
        path_note = path['note'].strip()
        print(f'{now} | {n+1}/{num} {action} {oi.id} {path_note}{b2str}')
        # see if document exists
        existing_v = None
        d = self.get(
            model=oi.model,
            es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
            document_id=oi.id)
        if d:
            existing_v = d.meta.version
        # post document
        if path['action'] == 'POST':
            try:
                # force=True bypasses publishable() in the post() function
                created = self.post(document, parents=parents, b2=b2_synced, force=True)
            except Exception as err:
                traceback.print_exc()
        # delete previously published items now marked incomplete/private
        elif existing_v and (path['action'] == 'SKIP'):
            print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n + 1, num))
            self.delete(oi.id)
        if path['action'] == 'SKIP':
            skipped += 1
            continue
        # version is incremented with each update
        posted_v = None
        # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
        d = self.get(
            model=oi.model,
            es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
            document_id=oi.id)
        if d:
            posted_v = d.meta.version
        # success: created, or version number incremented
        status = 'ERROR - unspecified'
        if posted_v and not existing_v:
            status = 'CREATED'
            successful += 1
        elif (existing_v and posted_v) and (existing_v < posted_v):
            status = 'UPDATED'
            successful += 1
        elif not posted_v:
            status = 'ERROR: not created'
            bad_paths.append(path)
        print(status)
    logger.debug('INDEXING COMPLETED')
    return {
        'total': len(paths),
        'skipped': skipped,
        'successful': successful,
        'bad': bad_paths,
    }