def __init__(self, path_abs, id=None, identifier=None):
    """Initialize object state from an Identifier.

    @param path_abs: Absolute path to the object directory.
    @param id: (unused here; kept for caller compatibility)
    @param identifier: Optional pre-built Identifier; if absent, one is
        derived from path_abs.
    """
    path_abs = os.path.normpath(path_abs)
    # Reuse the caller's Identifier when given; otherwise build one.
    i = identifier if identifier else Identifier(path=path_abs)
    self.identifier = i
    self.id = i.id
    self.idparts = list(i.parts.values())
    self.collection_id = i.collection_id()
    self.parent_id = i.parent_id()
    # Absolute paths to the object and its main metadata files.
    self.path_abs = path_abs
    self.path = path_abs
    self.collection_path = i.collection_path()
    self.parent_path = i.parent_path()
    self.root = os.path.dirname(self.parent_path)
    for attr in ['json', 'changelog', 'control', 'mets', 'lock', 'files']:
        setattr(self, '%s_path' % attr, i.path_abs(attr))
    # Paths relative to the collection root.
    self.path_rel = i.path_rel()
    for attr in ['json', 'changelog', 'control', 'mets', 'files']:
        setattr(self, '%s_path_rel' % attr, i.path_rel(attr))
def __init__(self, path_abs, id=None, identifier=None):
    """Initialize object state from an Identifier.

    @param path_abs: Absolute path to the object directory.
    @param id: (unused here; kept for caller compatibility)
    @param identifier: Optional pre-built Identifier; if absent, one is
        derived from path_abs.
    """
    path_abs = os.path.normpath(path_abs)
    if identifier:
        i = identifier
    else:
        i = Identifier(path=path_abs)
    self.identifier = i
    self.id = i.id
    # FIX: materialize as a list (matches the sibling __init__ variant).
    # In Python 3, dict.values() is a view: not indexable, not
    # JSON-serializable, and it mutates along with i.parts.
    self.idparts = list(i.parts.values())
    self.collection_id = i.collection_id()
    self.parent_id = i.parent_id()
    self.path_abs = path_abs
    self.path = path_abs
    self.collection_path = i.collection_path()
    self.parent_path = i.parent_path()
    self.root = os.path.dirname(self.parent_path)
    # Absolute paths to the object's metadata files.
    self.json_path = i.path_abs('json')
    self.changelog_path = i.path_abs('changelog')
    self.control_path = i.path_abs('control')
    self.mets_path = i.path_abs('mets')
    self.lock_path = i.path_abs('lock')
    self.files_path = i.path_abs('files')
    # Same paths, relative to the collection root.
    self.path_rel = i.path_rel()
    self.json_path_rel = i.path_rel('json')
    self.changelog_path_rel = i.path_rel('changelog')
    self.control_path_rel = i.path_rel('control')
    self.mets_path_rel = i.path_rel('mets')
    self.files_path_rel = i.path_rel('files')
def _repo_org(self, path, doctype, remove=False):
    """Index (or remove) a repository/organization document.

    seealso DDR.models.common.DDRObject.to_esobject

    @param path: Absolute path to the object's JSON data file.
    @param doctype: Elasticsearch doc_type for the document.
    @param remove: If True and the document exists, delete it instead.
    @returns: Elasticsearch client response
    @raises Exception: If the data file lacks 'id' or 'repo' keys.
    """
    # get and validate file
    data = load_json(path)
    if not (data.get('id') and data.get('repo')):
        raise Exception('Data file is not well-formed.')
    oi = Identifier(id=data['id'])
    d = OrderedDict()
    d['id'] = oi.id
    d['model'] = oi.model
    d['parent_id'] = oi.parent_id(stubs=1)
    # links
    d['links_html'] = oi.id
    d['links_json'] = oi.id
    d['links_img'] = '%s/logo.png' % oi.id
    d['links_thumb'] = '%s/logo.png' % oi.id
    d['links_parent'] = oi.parent_id(stubs=1)
    d['links_children'] = oi.id
    # title,description
    d['title'] = data['title']
    d['description'] = data['description']
    d['url'] = data['url']
    # ID components (repo, org, cid, ...) as separate fields
    idparts = deepcopy(oi.idparts)
    idparts.pop('model')
    for k in ID_COMPONENTS:
        d[k] = ''  # ensure all fields present
    # FIX: dict.iteritems() was removed in Python 3; use items().
    for k, v in idparts.items():
        d[k] = v
    # add/update
    if remove and self.exists(doctype, oi):
        results = self.es.delete(index=self.indexname, doc_type=doctype, id=oi.id)
    else:
        results = self.es.index(index=self.indexname, doc_type=doctype, id=oi.id, body=d)
    return results
def _repo_org(self, path, doctype, remove=False):
    """Index (or remove) a repository/organization document.

    seealso DDR.models.common.DDRObject.to_esobject

    @param path: Absolute path to the object's JSON data file.
    @param doctype: Elasticsearch doc_type for the document.
    @param remove: If True and the document exists, delete it instead.
    @returns: Elasticsearch client response
    @raises Exception: If the data file lacks 'id' or 'repo' keys.
    """
    # get and validate file
    data = load_json(path)
    if not (data.get('id') and data.get('repo')):
        raise Exception('Data file is not well-formed.')
    oi = Identifier(id=data['id'])
    d = OrderedDict()
    d['id'] = oi.id
    d['model'] = oi.model
    d['parent_id'] = oi.parent_id(stubs=1)
    # links
    d['links_html'] = oi.id
    d['links_json'] = oi.id
    d['links_img'] = '%s/logo.png' % oi.id
    d['links_thumb'] = '%s/logo.png' % oi.id
    d['links_parent'] = oi.parent_id(stubs=1)
    d['links_children'] = oi.id
    # title,description
    d['title'] = data['title']
    d['description'] = data['description']
    d['url'] = data['url']
    # ID components (repo, org, cid, ...) as separate fields
    idparts = deepcopy(oi.idparts)
    idparts.pop('model')
    for k in ID_COMPONENTS:
        d[k] = ''  # ensure all fields present
    # FIX: dict.iteritems() was removed in Python 3; use items().
    for k, v in idparts.items():
        d[k] = v
    # add/update
    if remove and self.exists(doctype, oi):
        results = self.es.delete(
            index=self.indexname, doc_type=doctype, id=oi.id
        )
    else:
        results = self.es.index(
            index=self.indexname, doc_type=doctype, id=oi.id, body=d
        )
    return results
def _repo_org(self, path, doctype, remove=False):
    """Index (or remove) a repository/organization document via the DSL classes.

    seealso DDR.models.common.DDRObject.to_esobject

    @param path: Absolute path to the object's JSON data file.
    @param doctype: Model key into ELASTICSEARCH_CLASSES_BY_MODEL.
    @param remove: If True and the document exists, delete it instead.
    @returns: elasticsearch-dsl save/delete result
    @raises Exception: If the data file lacks 'id' or 'repo' keys.
    """
    # Load and sanity-check the metadata file.
    data = load_json(path)
    if not (data.get('id') and data.get('repo')):
        raise Exception('Data file is not well-formed.')
    oi = Identifier(id=data['id'])
    # Build the typed document for this model.
    ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[doctype]
    document = ES_Class(id=oi.id)
    document.meta.id = oi.id
    document.model = oi.model
    parent = oi.parent_id(stubs=1)
    document.parent_id = parent
    # Link fields.
    logo = '%s/logo.png' % oi.id
    document.links_html = oi.id
    document.links_json = oi.id
    document.links_img = logo
    document.links_thumb = logo
    document.links_parent = parent
    document.links_children = oi.id
    # Descriptive fields copied straight from the data file.
    document.title = data['title']
    document.description = data['description']
    document.url = data['url']
    # ID components (repo, org, cid, ...) become separate attributes.
    components = deepcopy(oi.idparts)
    components.pop('model')
    for name, value in components.items():
        setattr(document, name, value)
    # Delete when requested and present; otherwise add/update.
    if remove and self.exists(doctype, oi):
        return document.delete(index=self.index_name(doctype), using=self.es)
    return document.save(index=self.index_name(doctype), using=self.es)
def _doc_version(doc):
    """Return the version of an ES response/doc dict ('version' or '_version'), or None."""
    return doc.get('version', None) or doc.get('_version', None)

def index(hosts, index, path, recursive=False, public=True):
    """(Re)index with data from the specified directory.

    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that can
    not be published (e.g. object or its parent is unpublished).

    The second pass goes through the files and assigns a signature file to
    each entity or collection ID.  There is some logic that tries to pick the
    first file of the first entity to be the collection signature, and so on.
    Mezzanine files are preferred over master files.

    In the final pass, a list of public/publishable fields is chosen based on
    the model.  Additional fields not in the model (e.g. parent ID, parent
    organization/collection/entity ID, the signature file) are packaged.
    Then everything is sent off to post().

    @param hosts: list of dicts containing host information.
    @param index: Name of the target index.
    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param public: For publication (fields not marked public will be ommitted).
    @returns: dict with number of paths, number successful, and list of paths
        that didn't work out.
    """
    logger.debug('index(%s, %s, %s)' % (hosts, index, path))
    publicfields = public_fields()
    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        paths = util.find_meta_files(path, recursive, files_first=1)
    # Store value of public,status for each collection,entity.
    # Values will be used by entities and files to inherit these values
    # from their parent.
    parents = _parents_status(paths)
    # Determine if paths are publishable or not
    successful_paths, bad_paths = _publishable_or_not(paths, parents)
    # iterate through paths, storing signature_url for each collection, entity
    # paths listed files first, then entities, then collections
    signature_files = _choose_signatures(successful_paths)
    print('Signature files')
    # FIX: dict.keys() returns a view in Python 3 which has no .sort();
    # sorted() works everywhere.
    for key in sorted(signature_files.keys()):
        print(key, signature_files[key])
    successful = 0
    for path in successful_paths:
        identifier = Identifier(path=path)
        parent_id = identifier.parent_id()
        document_pub_fields = []
        if public and identifier.model:
            document_pub_fields = publicfields[identifier.model]
        # Extra fields not in the model, keyed by model type.
        additional_fields = {'parent_id': parent_id}
        if identifier.model == 'collection':
            additional_fields['organization_id'] = parent_id
        if identifier.model == 'entity':
            additional_fields['collection_id'] = parent_id
        if identifier.model == 'file':
            additional_fields['entity_id'] = parent_id
        if identifier.model in ['collection', 'entity']:
            additional_fields['signature_file'] = signature_files.get(identifier.id, '')
        # HERE WE GO!
        document = load_document_json(path, identifier.model, identifier.id)
        try:
            existing = get(hosts, index, identifier.model, identifier.id, fields=[])
        except Exception:
            # FIX: was a bare except:, which also swallowed SystemExit and
            # KeyboardInterrupt.  Any failure to fetch means "not existing".
            existing = None
        result = post(hosts, index, document, document_pub_fields, additional_fields)
        # success: created, or version number incremented
        if result.get('_id', None):
            existing_version = _doc_version(existing) if existing else None
            result_version = _doc_version(result)
            if result['created'] or (existing_version and (result_version > existing_version)):
                successful += 1
        else:
            bad_paths.append((path, result['status'], result['response']))
    logger.debug('INDEXING COMPLETED')
    return {'total': len(paths), 'successful': successful, 'bad': bad_paths}