def file_info(repo):
    """Info from looking at set of metadata files

    @param repo: GitPython.Repository object
    @returns: dict
    """
    data = {}
    paths = util.find_meta_files(
        repo.working_dir, recursive=True, force_read=True
    )
    identifiers = [
        identifier.Identifier(path)
        for path in paths
    ]
    data['total objects'] = len(identifiers)
    # model totals
    for i in identifiers:
        key = '%s objects' % i.model
        if not data.get(key):
            data[key] = 0
        data[key] = data[key] + 1
    # role totals
    roles = identifier.VALID_COMPONENTS['role']
    for role in roles:
        key = '%s files' % role
        data[key] = 0
    for i in identifiers:
        if i.model == 'file':
            for role in roles:
                if role in i.path_abs():
                    key = '%s files' % role
                    data[key] = data[key] + 1
    return data
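A minimal usage sketch for file_info(), assuming GitPython (git.Repo) and a local DDR collection clone; the repository path below is hypothetical:

import git

# Hypothetical local collection path; point this at a real clone.
repo = git.Repo('/var/www/media/base/ddr-test-123')
info = file_info(repo)
for key, value in info.items():
    print('%s: %s' % (key, value))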
def test_pick_signatures():
    cpath = os.path.join(TESTING_BASE_DIR, COLLECTION_ID)
    paths = util.find_meta_files(cpath, recursive=True, force_read=True, testing=True)
    parents = signatures.choose(paths)
    updates = signatures.find_updates(parents)
    files_written = signatures.write_updates(updates)
    return files_written
def test_00_pick_signatures(tmpdir, collection):
    paths = util.find_meta_files(collection.path_abs, recursive=True, force_read=True)
    print('paths')
    for x in paths:
        print(x)
    parents = signatures.choose(paths)
    print('parents')
    for x in parents:
        print(x)
    updates = signatures.find_updates(parents)
    print('updates')
    for x in updates:
        print(x)
    files_written = signatures.write_updates(updates)
    print('files_written')
    for x in files_written:
        print(x)
    for oid, expected in SIGNATURES:
        oi = identifier.Identifier(id=oid, base_path=collection.identifier.basepath)
        o = oi.object()
        print('expected ', expected)
        print('o.signature_id ', o.signature_id)
        assert o.signature_id == expected
def file_info(repo):
    """Info from looking at set of metadata files

    @param repo: GitPython.Repository object
    @returns: dict
    """
    data = {}
    paths = util.find_meta_files(repo.working_dir, recursive=True, force_read=True)
    identifiers = [identifier.Identifier(path) for path in paths]
    data['total objects'] = len(identifiers)
    # model totals
    for i in identifiers:
        key = '%s objects' % i.model
        if not data.get(key):
            data[key] = 0
        data[key] = data[key] + 1
    # role totals
    roles = identifier.VALID_COMPONENTS['role']
    for role in roles:
        key = '%s files' % role
        data[key] = 0
    for i in identifiers:
        if i.model == 'file':
            for role in roles:
                if role in i.path_abs():
                    key = '%s files' % role
                    data[key] = data[key] + 1
    return data
def fieldcsv(collectionsdir, cidpattern, model, fieldname, csvfile):
    """Export value of specified field for all model objects in collections

    @param collectionsdir str:
    @param cidpattern str:
    @param model str:
    @param fieldname str:
    @param csvfile str:
    """
    collection_paths = [
        path
        for path in util.natural_sort(
            util.find_meta_files(
                basedir=collectionsdir, model='collection', recursive=1, force_read=1))
        if cidpattern in path
    ]
    for collection_path in collection_paths:
        print(collection_path)
        try:
            batch.Exporter.export_field_csv(
                collection=identifier.Identifier(collection_path).object(),
                model=model,
                fieldname=fieldname,
                csv_path=csvfile,
            )
        except Exception as err:
            print('ERROR: %s' % err)
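A minimal, hypothetical call to fieldcsv(); the base directory, ID pattern, field name, and output path are placeholders, not real data:

# Export the 'topics' field from every entity in collections matching 'ddr-test'.
fieldcsv(
    collectionsdir='/var/www/media/base',
    cidpattern='ddr-test',
    model='entity',
    fieldname='topics',
    csvfile='/tmp/ddr-test-entity-topics.csv',
)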
def export_files( collection_path, csv_path ):
    """
    @param collection_path: Absolute path to collection repo.
    @param csv_path: Absolute path to CSV data file.
    """
    started = datetime.now(settings.TZ)
    print('%s starting export' % started)
    make_tmpdir(CSV_TMPDIR)
    fieldnames = [field['name'] for field in filemodule.FILE_FIELDS]
    print(fieldnames)
    paths = []
    for path in util.find_meta_files(basedir=collection_path, recursive=True):
        if ('master' in path) or ('mezzanine' in path):
            paths.append(path)
    with open(csv_path, 'wb') as csvfile:
        writer = make_csv_writer(csvfile)
        # headers
        writer.writerow(fieldnames)
        # everything else
        for n,path in enumerate(paths):
            rowstarted = datetime.now(settings.TZ)
            # load file object
            filename = os.path.basename(path)
            file_id = os.path.splitext(filename)[0]
            file_ = DDRLocalFile.from_json(path)
            if file_:
                # seealso DDR.modules.Module.function
                values = []
                for f in filemodule.FILE_FIELDS:
                    value = ''
                    if hasattr(file_, f['name']):
                        key = f['name']
                        # run csvexport_* functions on field data if present
                        val = modules.Module(filemodule).function(
                            'csvexport_%s' % key,
                            getattr(file_, f['name'])
                        )
                        if not (isinstance(val, str) or isinstance(val, unicode)):
                            val = unicode(val)
                        if val:
                            value = val.encode('utf-8')
                    values.append(value)
                writer.writerow(values)
                rowfinished = datetime.now(settings.TZ)
                rowelapsed = rowfinished - rowstarted
                print('%s %s/%s %s (%s)' % (dtfmt(rowfinished), n+1, len(paths), file_id, rowelapsed))
            else:
                print('NO FILE FOR %s' % path)
    finished = datetime.now(settings.TZ)
    elapsed = finished - started
    print('%s DONE (%s files)' % (dtfmt(finished), len(paths)))
    print('%s elapsed' % elapsed)
    if os.path.exists(csv_path):
        return csv_path
    return 'no file written'
def export_entities( collection_path, csv_path ):
    """
    @param collection_path: Absolute path to collection repo.
    @param csv_path: Absolute path to CSV data file.
    """
    started = datetime.now(settings.TZ)
    print('%s starting export' % started)
    make_tmpdir(CSV_TMPDIR)
    fieldnames = [field['name'] for field in entitymodule.ENTITY_FIELDS]
    # exclude 'files' because it is not hard to convert to CSV
    # and is not different from the files export.
    fieldnames.remove('files')
    print(fieldnames)
    paths = []
    for path in util.find_meta_files(basedir=collection_path, recursive=True):
        if os.path.basename(path) == 'entity.json':
            paths.append(path)
    with open(csv_path, 'wb') as csvfile:
        writer = make_csv_writer(csvfile)
        # headers
        writer.writerow(fieldnames)
        # everything else
        for n,path in enumerate(paths):
            rowstarted = datetime.now(settings.TZ)
            entity_dir = os.path.dirname(path)
            entity_id = os.path.basename(entity_dir)
            entity = DDRLocalEntity.from_json(entity_dir)
            # seealso DDR.modules.Module.function
            values = []
            for f in entitymodule.ENTITY_FIELDS:
                value = ''
                if hasattr(entity, f['name']) and f.get('form',None):
                    key = f['name']
                    label = f['form']['label']
                    # run csvexport_* functions on field data if present
                    val = modules.Module(entitymodule).function(
                        'csvexport_%s' % key,
                        getattr(entity, f['name'])
                    )
                    if not (isinstance(val, str) or isinstance(val, unicode)):
                        val = unicode(val)
                    if val:
                        value = val.encode('utf-8')
                values.append(value)
            writer.writerow(values)
            rowfinished = datetime.now(settings.TZ)
            rowelapsed = rowfinished - rowstarted
            print('%s %s/%s %s (%s)' % (dtfmt(rowfinished), n+1, len(paths), entity_id, rowelapsed))
    finished = datetime.now(settings.TZ)
    elapsed = finished - started
    print('%s DONE (%s entities)' % (dtfmt(finished), len(paths)))
    print('%s elapsed' % elapsed)
    if os.path.exists(csv_path):
        return csv_path
    return 'no file written'
def all_paths(collection_path, model):
    """Get all .json paths for specified model.

    @param collection_path: str Absolute path to collection repo
    @param model: str One of ['collection', 'entity', 'file']
    """
    return util.find_meta_files(
        basedir=collection_path, model=model, recursive=1, force_read=1
    )
def check_encoding(repo_url, destdir, verbose=False, csv=False, headers=False, json=False):
    collection_id = extract_collection_id(repo_url)
    repo_path = os.path.join(destdir, collection_id)
    out(verbose, collection_id)
    out(verbose, repo_path)
    # if verbose, add marker to important lines
    if verbose:
        prefix = '%% '
    else:
        prefix = ''
    if csv and headers:
        print('{} collection id, files, defects, elapsed'.format(prefix))
    start = datetime.now()
    out(verbose, start)
    out(verbose, 'clone {} {}'.format(repo_url, repo_path))
    repo = clone(repo_url, repo_path)
    out(verbose, repo)
    out(verbose, 'analyzing')
    paths = util.find_meta_files(repo_path, recursive=True)
    defects = analyze_files(paths, verbose)
    out(verbose, 'cleaning up')
    clean(repo_path)
    end = datetime.now()
    elapsed = end - start
    out(verbose, end)
    if csv:
        print('{}{}'.format(
            prefix,
            ','.join([
                str(collection_id), str(len(paths)), str(len(defects)), str(elapsed)
            ])))
    elif json:
        data = {
            'collection id': collection_id,
            'files': len(paths),
            'defects': len(defects),
            'elapsed': str(elapsed),
        }
        # the 'json' argument shadows the stdlib json module,
        # so import it under another name for serialization
        import json as json_lib
        print('{}{}'.format(prefix, json_lib.dumps(data)))
    else:
        print('{}{}, {} bad, {} files, {} elapsed'.format(
            prefix, collection_id, len(defects), len(paths), elapsed))
def all_paths(collection_path, model):
    """Get all .json paths for specified model.

    @param collection_path: str Absolute path to collection repo
    @param model: str One of ['collection', 'entity', 'file']
    """
    return util.find_meta_files(basedir=collection_path, model=model, recursive=1, force_read=1)
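A minimal usage sketch for all_paths(); the collection path is hypothetical:

# List every file-level .json in a (hypothetical) local collection clone.
file_jsons = all_paths('/var/www/media/base/ddr-test-123', 'file')
print('%s file records' % len(file_jsons))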
def _child_jsons( path, testing=False ):
    """List all the .json files under path directory; excludes specified dir.

    @param path: Absolute directory path.
    @return list of paths
    """
    return [
        p for p in util.find_meta_files(basedir=path, recursive=True, testing=testing)
        if os.path.dirname(p) != path
    ]
def test_find_meta_files():
    basedir = os.path.join(TESTING_BASE_DIR, 'find-meta-files')
    if os.path.exists(basedir):
        shutil.rmtree(basedir, ignore_errors=1)
    # build sample repo
    sampledir = os.path.join(basedir, 'ddr-test-123')
    for d in SAMPLE_DIRS:
        path = os.path.join(sampledir, d)
        os.makedirs(path)
    for fn in SAMPLE_FILES:
        path = os.path.join(sampledir, fn)
        with open(path, 'w') as f:
            f.write('testing')
    # cache
    cache_path = os.path.join(basedir, 'cache')
    if os.path.exists(cache_path):
        os.remove(cache_path)
    assert not os.path.exists(cache_path)

    def clean(paths):
        base = '%s/' % sampledir
        cleaned = [path.replace(base, '') for path in paths]
        cleaned.sort()
        return cleaned

    paths0 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True, testing=1))
    assert paths0 == META_ALL
    for model in ['collection', 'entity', 'file']:
        paths2 = clean(util.find_meta_files(sampledir, model=model, recursive=True, force_read=True, testing=1))
        assert paths2 == META_MODEL[model]
    paths3 = clean(util.find_meta_files(sampledir, recursive=False, force_read=True, testing=1))
    assert paths3 == META_MODEL['collection']
    paths4 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True, files_first=True, testing=1))
    assert paths4 == META_ALL
    paths5 = clean(util.find_meta_files(sampledir, recursive=True, force_read=False, testing=1))
    assert paths5 == META_ALL
def test_find_meta_files(tmpdir):
    basedir = str(tmpdir / 'find-meta-files')
    if os.path.exists(basedir):
        shutil.rmtree(basedir, ignore_errors=1)
    # build sample repo
    sampledir = os.path.join(basedir, 'ddr-test-123')
    for d in SAMPLE_DIRS:
        path = os.path.join(sampledir, d)
        os.makedirs(path)
    for fn in SAMPLE_FILES:
        path = os.path.join(sampledir, fn)
        with open(path, 'w') as f:
            f.write('testing')
    # cache
    cache_path = os.path.join(basedir, 'cache')
    if os.path.exists(cache_path):
        os.remove(cache_path)
    assert not os.path.exists(cache_path)

    def clean(paths):
        base = '%s/' % sampledir
        cleaned = [path.replace(base, '') for path in paths]
        cleaned.sort()
        return cleaned

    paths0 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True))
    assert paths0 == META_ALL
    for model in ['collection', 'entity', 'file']:
        paths2 = clean(util.find_meta_files(sampledir, model=model, recursive=True, force_read=True))
        assert paths2 == META_MODEL[model]
    paths3 = clean(util.find_meta_files(sampledir, recursive=False, force_read=True))
    assert paths3 == META_MODEL['collection']
    paths4 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True, files_first=True))
    assert paths4 == META_ALL
    paths5 = clean(util.find_meta_files(sampledir, recursive=True, force_read=False))
    assert paths5 == META_ALL
def _child_jsons(path: str, testing: bool = False) -> List[str]:
    """List all the .json files under path directory; excludes specified dir.

    @param path: str Absolute directory path.
    @param testing: boolean
    @returns: list of paths
    """
    return [
        p for p in util.find_meta_files(basedir=path, recursive=True)
        if os.path.dirname(p) != path
    ]
def ddrcheckbinaries(repo, verbose=False):
    """ddrcheckbinaries - Find binaries that don't match metadata hashes.

    \b
    Example:
        $ ddrcheckbinaries /var/www/media/base/ddr-testing-141
    """
    filepaths = util.find_meta_files(
        repo, recursive=1, model='file', force_read=True
    )
    hits = check_files(filepaths, verbose)
def ddrcheck(help, collection_path):
    print('Gathering files in %s' % collection_path)
    paths = util.find_meta_files(
        collection_path,
        recursive=1,
        model=None,
        files_first=False,
        force_read=False,
        testing=0
    )
    print('Checking files...')
    for item in util.validate_paths(paths):
        n,path,err = item
        print('%s/%s ERROR %s - %s' % (n, len(paths), path, err))
    print('Checked %s files' % len(paths))
def _child_jsons(path, testing=False):
    """List all the .json files under path directory; excludes specified dir.

    @param path: Absolute directory path.
    @return list of paths
    """
    return [
        p for p in util.find_meta_files(
            basedir=path, recursive=True, testing=testing)
        if os.path.dirname(p) != path
    ]
def ddrcheck(collection_path):
    print('Gathering files in %s' % collection_path)
    paths = util.find_meta_files(
        collection_path,
        recursive=1,
        model=None,
        files_first=False,
        force_read=False,
        testing=0
    )
    print('Checking files...')
    for item in util.validate_paths(paths):
        n,path,err = item
        print('%s/%s ERROR %s - %s' % (n, len(paths), path, err))
    print('Checked %s files' % len(paths))
def _children_paths(self):
    """Searches fs for (entity) childrens' .jsons, returns natsorted paths

    @returns: list
    """
    if os.path.exists(self.files_path):
        return natsorted([
            f for f in util.find_meta_files(self.files_path, recursive=True)
            # only direct children, no descendants
            if Identifier(f).parent_id() == self.id
        ])
    return []
def check_file_hashes(collection_path):
    """Check that hashes are present in file JSONs
    """
    paths = util.find_meta_files(collection_path, recursive=True, model='file', force_read=True)
    for path in paths:
        f = identifier.Identifier(path).object()
        if not (f.sha1 and f.sha256 and f.md5 and f.size):
            print('f.sha1 %s' % f.sha1)
            print('f.sha256 %s' % f.sha256)
            print('f.md5 %s' % f.md5)
            print('f.size %s' % f.size)
            raise Exception('Hash data missing')
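A minimal usage sketch for check_file_hashes(); the collection path is a placeholder, and the wrapper simply reports whether any file record lacks hash data:

# Hypothetical collection path.
try:
    check_file_hashes('/var/www/media/base/ddr-test-123')
    print('all file hashes present')
except Exception as err:
    print('hash check failed: %s' % err)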
def _file_paths(self, rel=False):
    """Searches filesystem for childrens' metadata files, returns relative paths.

    @param rel: bool Return relative paths
    @returns: list
    """
    if os.path.exists(self.files_path):
        prefix_path = 'THISWILLNEVERMATCHANYTHING'
        if rel:
            prefix_path = '{}/'.format(os.path.normpath(self.files_path))
        return sorted([
            f.replace(prefix_path, '')
            for f in util.find_meta_files(self.files_path, recursive=False)
        ], key=lambda f: util.natural_order_string(f))
    return []
def identifiers(self, model=None, force_read=False):
    """Lists Identifiers for all or subset of Collection's descendents.

    TODO how is this different from children?

    >>> c = Collection.from_json('/tmp/ddr-testing-123')
    >>> c.descendants()
    [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]

    @param model: str Restrict list to model.
    @returns: list of Identifiers
    """
    return [
        Identifier(path)
        for path in util.find_meta_files(
            self.path, recursive=1, model=model, force_read=force_read)
    ]
def _file_paths(self, rel=False):
    """Searches filesystem for childrens' metadata files, returns relative paths.

    @param rel: bool Return relative paths
    @returns: list
    """
    if os.path.exists(self.files_path):
        prefix_path = 'THISWILLNEVERMATCHANYTHING'
        if rel:
            prefix_path = '{}/'.format(os.path.normpath(self.files_path))
        return sorted(
            [
                f.replace(prefix_path, '')
                for f in util.find_meta_files(self.files_path, recursive=False)
            ],
            key=lambda f: util.natural_order_string(f)
        )
    return []
def csv_export_model( collection_path, model ):
    """Export collection {model} metadata to CSV file.

    @param collection_path: Absolute path to collection.
    @param model: 'entity' or 'file'.
    """
    collection = Collection.from_identifier(Identifier(path=collection_path))
    csv_path = settings.CSV_EXPORT_PATH[model] % collection.id
    logger.info('All paths in %s' % collection_path)
    paths = util.find_meta_files(
        basedir=collection_path, model=model, recursive=1, force_read=1
    )
    logger.info('Exporting %s paths' % len(paths))
    batch.Exporter.export(
        paths, model, csv_path, required_only=False
    )
    return csv_path
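A minimal usage sketch for csv_export_model(); the collection path is hypothetical and the CSV destination is taken from settings.CSV_EXPORT_PATH:

# Export entity metadata for a (hypothetical) collection to CSV.
csv_path = csv_export_model('/var/www/media/base/ddr-test-123', 'entity')
print('wrote %s' % csv_path)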
def child_field_values(self, model, fieldname):
    """Get all values of fieldname from specified model in collection.

    @param model str
    @param fieldname str
    """
    rows = []
    paths = util.find_meta_files(self.path_abs, model=model, recursive=True)
    for path in paths:
        o = Identifier(path).object()
        if getattr(o, fieldname):
            rows.append([
                o.id,
                fieldname,
                getattr(o, fieldname),
            ])
    return rows
def collect_hashes(collection_path):
    """Make dict of existing file hash data

    @param collection_path: str
    @returns: dict {file_id: {'sha1':..., 'sha256':..., 'md5':..., 'size':...}}
    """
    paths = util.find_meta_files(collection_path, recursive=True, model='file', force_read=True)
    data = OrderedDict()
    for path in paths:
        o = identifier.Identifier(path).object()
        data[o.id] = OrderedDict()
        data[o.id]['sha1'] = o.sha1
        data[o.id]['sha256'] = o.sha256
        data[o.id]['md5'] = o.md5
        data[o.id]['size'] = o.size
    return data
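A minimal usage sketch for collect_hashes(); the path is hypothetical, and the loop just flags records with any missing value:

# Gather hash data for a (hypothetical) collection and report gaps.
hashes = collect_hashes('/var/www/media/base/ddr-test-123')
for file_id, fields in hashes.items():
    missing = [key for key, value in fields.items() if not value]
    if missing:
        print('%s missing %s' % (file_id, ', '.join(missing)))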
def _ids_in_local_repo(rowds, model, collection_path):
    """Lists which IDs in CSV are present in local repo.

    @param rowds: list of dicts
    @param model: str
    @param collection_path: str Absolute path to collection repo.
    @returns: list of IDs.
    """
    metadata_paths = util.find_meta_files(
        collection_path, model=model, recursive=True, force_read=True
    )
    # compare string IDs, not Identifier objects
    existing_ids = [
        identifier.Identifier(path=path).id
        for path in metadata_paths
    ]
    new_ids = [rowd['id'] for rowd in rowds]
    already = [i for i in new_ids if i in existing_ids]
    return already
def check( collection_path ):
    if not os.path.exists(settings.MEDIA_BASE):
        raise Exception('base_dir does not exist: %s' % settings.MEDIA_BASE)
    paths = util.find_meta_files(
        collection_path,
        recursive=1,
        model=None,
        files_first=False,
        force_read=False,
        testing=0
    )
    bad_files = util.validate_paths(paths)
    output = [
        'Checked %s files' % len(paths),
    ]
    if bad_files:
        for item in bad_files:
            n,path,err = item
            output.append(
                '%s/%s ERROR %s - %s' % (n, len(paths), path, err)
            )
    else:
        output.append('No bad files.')
    output.append('DONE')
    return '\n'.join(output)
def ddrsignatures(collection, nowrite, nocommit, user, mail):
    """ddrsignatures - Picks signature files for each collection object.
    """
    if not nocommit:
        if not (user and mail):
            logging.debug('You must specify a user and email address! >:-0')
            sys.exit(1)

    logging.debug('-----------------------------------------------')
    logging.debug('Loading collection')
    collection = identifier.Identifier(path=collection).object()
    logging.debug(collection)

    # Read data files, gather *published* Identifiers, map parents->nodes
    # assign signatures, write files
    updates = signatures.find_updates(
        signatures.choose(
            util.find_meta_files(collection.path, recursive=True, force_read=True)
        )
    )

    if nowrite:
        logging.debug('Not writing changes')
        files_written = []
    else:
        files_written = signatures.write_updates(updates)

    if nocommit:
        logging.debug('Not committing changes')
    elif files_written:
        if (not user) or (not mail):
            logging.debug('You must specify a user and email address! >:-0')
            sys.exit(1)
        status,msg = signatures.commit_updates(
            collection, files_written, user, mail, agent='ddr-signature', commit=True
        )
    logging.debug('DONE')
def delete(self, document_id, recursive=False):
    """Delete a document and optionally its children.

    TODO refactor after upgrading Elasticsearch past 2.4.

    delete_by_query was removed sometime during elasticsearch-py 2.*
    I think it was added back in a later version so the code stays for now.

    For now, instead of deleting based on document_id, we start with
    document_id, find all paths beneath it in the filesystem,
    and curl DELETE url each individual document from Elasticsearch.

    @param document_id:
    @param recursive: True or False
    """
    logger.debug('delete(%s, %s)' % (document_id, recursive))
    oi = Identifier(document_id, config.MEDIA_BASE)
    if recursive:
        paths = util.find_meta_files(oi.path_abs(), recursive=recursive, files_first=1)
    else:
        paths = [oi.path_abs()]
    identifiers = [Identifier(path) for path in paths]
    num = len(identifiers)
    for n, oi in enumerate(identifiers):
        # TODO hard-coded models here!
        if oi.model == 'segment':
            model = 'entity'
        else:
            model = oi.model
        try:
            result = self.es.delete(index=self.index_name(model), id=oi.id)
            print(
                f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> {result["result"]}'
            )
        except docstore.NotFoundError as err:
            print(
                f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> 404 Not Found'
            )
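A minimal usage sketch for delete(); it assumes `ds` is an already-constructed docstore/indexer instance that exposes this method, and the object ID is hypothetical:

# Remove a (hypothetical) entity and everything beneath it from the index.
ds.delete('ddr-test-123-45', recursive=True)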
def find_missing_annex_binaries(repo):
    """Find binaries that are not placed in .git/annex
    """
    files = [
        identifier.Identifier(path_json).object()
        for path_json in util.find_meta_files(
            repo.working_dir, recursive=True, model='file', force_read=True)
    ]
    binaries_missing = [
        f.path_rel for f in files
        if (not f.external) and (not dvcs.file_in_git_annex(repo, f.path_rel))
    ]
    accessfiles_missing = [
        f.access_rel for f in files
        if (os.path.exists(f.access_abs)) and (not dvcs.file_in_git_annex(repo, f.path_rel))
    ]
    missing = binaries_missing + accessfiles_missing
    if missing:
        print('Binaries missing from git-annex')
        for path in missing:
            print(path)
    return missing
def find_binaries_in_git_objects(repo):
    """Find binaries mistakenly placed into .git/objects
    """
    files = [
        identifier.Identifier(path_json).object()
        for path_json in util.find_meta_files(
            repo.working_dir, recursive=True, model='file', force_read=True)
    ]
    binaries_in_git_objects = [
        f.path_rel for f in files
        if (not f.external) and dvcs.file_in_git_objects(repo, f.path_rel)
    ]
    accessfiles_in_git_objects = [
        f.access_rel for f in files
        if os.path.exists(f.access_abs) and dvcs.file_in_git_objects(repo, f.path_rel)
    ]
    binaries = binaries_in_git_objects + accessfiles_in_git_objects
    if binaries:
        print('Found binaries in %s' % os.path.join(repo.working_dir, '.git/objects/'))
        for path in binaries:
            print(path)
    return binaries
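A minimal usage sketch covering the two git/annex checks above; GitPython (git.Repo) is assumed and the repository path is hypothetical:

import git

# Hypothetical local collection clone.
repo = git.Repo('/var/www/media/base/ddr-test-123')
missing = find_missing_annex_binaries(repo)
misplaced = find_binaries_in_git_objects(repo)
print('%s missing from annex, %s in .git/objects' % (len(missing), len(misplaced)))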
def post_multi(self, path, recursive=False, force=False):
    """Publish (index) specified document and (optionally) its children.

    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that
    can not be published (e.g. object or its parent is unpublished).

    In the final pass, a list of public/publishable fields is chosen based
    on the model.  Additional fields not in the model (e.g. parent ID, parent
    organization/collection/entity ID) are packaged.  Then everything is sent
    off to post().

    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param force: boolean Just publish the damn collection already.
    @returns: number successful,list of paths that didn't work out
    """
    logger.debug('index(%s, %s, %s, %s)' % (self.indexname, path, recursive, force))

    publicfields = _public_fields()

    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        paths = util.find_meta_files(path, recursive, files_first=1)

    # Store value of public,status for each collection,entity.
    # Values will be used by entities and files to inherit these values
    # from their parent.
    parents = _parents_status(paths)

    # Determine if paths are publishable or not
    paths = _publishable(paths, parents, force=force)

    skipped = 0
    successful = 0
    bad_paths = []

    num = len(paths)
    for n, path in enumerate(paths):
        oi = path.get('identifier')
        # TODO write logs instead of print
        print('%s | %s/%s %s %s %s' % (
            datetime.now(config.TZ), n + 1, num, path['action'], oi.id, path['note']))

        if not oi:
            path['note'] = 'No identifier'
            bad_paths.append(path)
            continue
        try:
            document = oi.object()
        except Exception as err:
            path['note'] = 'Could not instantiate: %s' % err
            bad_paths.append(path)
            continue
        if not document:
            path['note'] = 'No document'
            bad_paths.append(path)
            continue

        # see if document exists
        existing_v = None
        d = self.get(oi.model, oi.id)
        if d:
            existing_v = d.meta.version

        # post document
        if path['action'] == 'POST':
            created = self.post(document, parents=parents, force=True)
            # force=True bypasses _publishable in post() function
        # delete previously published items now marked incomplete/private
        elif existing_v and (path['action'] == 'SKIP'):
            print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n + 1, num))
            self.delete(oi.id)

        if path['action'] == 'SKIP':
            skipped += 1
            continue

        # version is incremented with each updated
        posted_v = None
        # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
        es_model = ELASTICSEARCH_CLASSES_BY_MODEL[oi.model]._doc_type.name
        d = self.get(es_model, oi.id)
        if d:
            posted_v = d.meta.version

        # success: created, or version number incremented
        status = 'ERROR - unspecified'
        if posted_v and not existing_v:
            status = 'CREATED'
            successful += 1
        elif (existing_v and posted_v) and (existing_v < posted_v):
            status = 'UPDATED'
            successful += 1
        elif not posted_v:
            status = 'ERROR: not created'
            bad_paths.append(path)
        print(status)

    logger.debug('INDEXING COMPLETED')
    return {
        'total': len(paths),
        'skipped': skipped,
        'successful': successful,
        'bad': bad_paths
    }
def transform(collection, filter='', models='', topics=None, created=None, commit=None, user=None, mail=None):
    if commit and ((not user) or (not mail)):
        logging.error('You must specify a user and email address! >:-0')
        sys.exit(1)
    else:
        logging.info('Not committing changes')

    start = datetime.now()

    if filter:
        logging.info('FILTER: "%s"' % filter)

    ONLY_THESE = []
    if models:
        logging.info('MODELS: "%s"' % models)
        ONLY_THESE = models.split(',')

    logging.info('Loading collection')
    cidentifier = identifier.Identifier(os.path.normpath(collection))
    collection = cidentifier.object()
    logging.info(collection)

    logging.info('Finding metadata files')
    paths = util.find_meta_files(collection.identifier.path_abs(), recursive=True, force_read=True)
    logging.info('%s paths' % len(paths))

    TOPICS = vocab.get_vocabs(config.VOCABS_URL)['topics']

    # filter out paths
    these_paths = []
    for path in paths:
        oi = identifier.Identifier(path)
        if filter and (not fnmatch.fnmatch(oi.id, filter)):
            continue
        if models and (oi.model not in ONLY_THESE):
            continue
        these_paths.append(path)
    if len(these_paths) != len(paths):
        logging.info('%s after filters' % len(these_paths))

    logging.info('Writing')
    num = len(these_paths)
    for n, path in enumerate(these_paths):
        logging.info('%s/%s %s' % (n, num, path))
        o = identifier.Identifier(path).object()
        if filter and (not fnmatch.fnmatch(o.id, filter)):
            continue
        if models and (o.identifier.model not in ONLY_THESE):
            continue
        if o.identifier.model in ['entity', 'segment']:
            o.children(force_read=True)
        if topics and o.identifier.model in ['entity', 'segment']:
            before = o.topics
            after = vocab.repair_topicdata(o.topics, TOPICS)
            o.topics = after
        if created and hasattr(o, 'record_created'):
            record_created_before = o.record_created
            # use a separate name so the 'commit' flag is not clobbered
            earliest_commit = dvcs.earliest_commit(path, parsed=True)
            o.record_created = earliest_commit['ts']
        o.write_json()

    if commit:
        logging.info('Committing changes')
        status, msg = commands.update(user, mail, collection, paths, agent='ddr-transform')
        logging.info('ok')
    else:
        logging.info('Changes not committed')

    end = datetime.now()
    elapsed = end - start
    per = elapsed / num
    logging.info('DONE (%s elapsed, %s per object)' % (elapsed, per))
def index( hosts, index, path, recursive=False, public=True ):
    """(Re)index with data from the specified directory.

    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that
    can not be published (e.g. object or its parent is unpublished).

    The second pass goes through the files and assigns a signature file
    to each entity or collection ID.
    There is some logic that tries to pick the first file of the first
    entity to be the collection signature, and so on.  Mezzanine files
    are preferred over master files.

    In the final pass, a list of public/publishable fields is chosen based
    on the model.  Additional fields not in the model (e.g. parent ID, parent
    organization/collection/entity ID, the signature file) are packaged.
    Then everything is sent off to post().

    @param hosts: list of dicts containing host information.
    @param index: Name of the target index.
    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param public: For publication (fields not marked public will be omitted).
    @param paths: Absolute paths to directory containing collections.
    @returns: number successful,list of paths that didn't work out
    """
    logger.debug('index(%s, %s, %s)' % (hosts, index, path))

    publicfields = public_fields()

    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        paths = util.find_meta_files(path, recursive, files_first=1)

    # Store value of public,status for each collection,entity.
    # Values will be used by entities and files to inherit these values from their parent.
    parents = _parents_status(paths)

    # Determine if paths are publishable or not
    successful_paths,bad_paths = _publishable_or_not(paths, parents)

    # iterate through paths, storing signature_url for each collection, entity
    # paths listed files first, then entities, then collections
    signature_files = _choose_signatures(successful_paths)
    print('Signature files')
    keys = sorted(signature_files.keys())
    for key in keys:
        print(key, signature_files[key])

    successful = 0
    for path in successful_paths:
        identifier = Identifier(path=path)
        parent_id = identifier.parent_id()

        document_pub_fields = []
        if public and identifier.model:
            document_pub_fields = publicfields[identifier.model]

        additional_fields = {'parent_id': parent_id}
        if identifier.model == 'collection':
            additional_fields['organization_id'] = parent_id
        if identifier.model == 'entity':
            additional_fields['collection_id'] = parent_id
        if identifier.model == 'file':
            additional_fields['entity_id'] = parent_id
        if identifier.model in ['collection', 'entity']:
            additional_fields['signature_file'] = signature_files.get(identifier.id, '')

        # HERE WE GO!
        document = load_document_json(path, identifier.model, identifier.id)
        try:
            existing = get(hosts, index, identifier.model, identifier.id, fields=[])
        except:
            existing = None
        result = post(hosts, index, document, document_pub_fields, additional_fields)
        # success: created, or version number incremented
        if result.get('_id', None):
            if existing:
                existing_version = existing.get('version', None)
                if not existing_version:
                    existing_version = existing.get('_version', None)
            else:
                existing_version = None
            result_version = result.get('version', None)
            if not result_version:
                result_version = result.get('_version', None)
            if result['created'] or (existing_version and (result_version > existing_version)):
                successful += 1
        else:
            bad_paths.append((path, result['status'], result['response']))
            #print(status_code)

    logger.debug('INDEXING COMPLETED')
    return {'total':len(paths), 'successful':successful, 'bad':bad_paths}
def post_multi(self, path, recursive=False, force=False):
    """Publish (index) specified document and (optionally) its children.

    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that
    can not be published (e.g. object or its parent is unpublished).

    In the final pass, a list of public/publishable fields is chosen based
    on the model.  Additional fields not in the model (e.g. parent ID, parent
    organization/collection/entity ID) are packaged.  Then everything is sent
    off to post().

    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param force: boolean Just publish the damn collection already.
    @returns: number successful,list of paths that didn't work out
    """
    logger.debug('index(%s, %s, %s, %s)' % (self.indexname, path, recursive, force))

    publicfields = _public_fields()

    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        paths = util.find_meta_files(path, recursive, files_first=1)

    # Store value of public,status for each collection,entity.
    # Values will be used by entities and files to inherit these values
    # from their parent.
    parents = _parents_status(paths)

    # Determine if paths are publishable or not
    paths = _publishable(paths, parents, force=force)

    skipped = 0
    successful = 0
    bad_paths = []

    num = len(paths)
    for n,path in enumerate(paths):
        oi = path.get('identifier')
        # TODO write logs instead of print
        print('%s | %s/%s %s %s %s' % (
            datetime.now(config.TZ), n+1, num, path['action'], oi.id, path['note'])
        )

        if not oi:
            path['note'] = 'No identifier'
            bad_paths.append(path)
            continue
        try:
            document = oi.object()
        except Exception as err:
            path['note'] = 'Could not instantiate: %s' % err
            bad_paths.append(path)
            continue
        if not document:
            path['note'] = 'No document'
            bad_paths.append(path)
            continue

        # see if document exists
        existing_v = None
        d = self.get(oi.model, oi.id)
        if d:
            existing_v = d.meta.version

        # post document
        if path['action'] == 'POST':
            created = self.post(document, parents=parents, force=True)
            # force=True bypasses _publishable in post() function
        # delete previously published items now marked incomplete/private
        elif existing_v and (path['action'] == 'SKIP'):
            print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n+1, num))
            self.delete(oi.id)

        if path['action'] == 'SKIP':
            skipped += 1
            continue

        # version is incremented with each updated
        posted_v = None
        # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
        es_model = ELASTICSEARCH_CLASSES_BY_MODEL[oi.model]._doc_type.name
        d = self.get(es_model, oi.id)
        if d:
            posted_v = d.meta.version

        # success: created, or version number incremented
        status = 'ERROR - unspecified'
        if posted_v and not existing_v:
            status = 'CREATED'
            successful += 1
        elif (existing_v and posted_v) and (existing_v < posted_v):
            status = 'UPDATED'
            successful += 1
        elif not posted_v:
            status = 'ERROR: not created'
            bad_paths.append(path)
        print(status)

    logger.debug('INDEXING COMPLETED')
    return {'total':len(paths), 'skipped':skipped, 'successful':successful, 'bad':bad_paths}
def post_multi(self, path, recursive=False, force=False, backblaze=None):
    """Publish (index) specified document and (optionally) its children.

    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that
    can not be published (e.g. object or its parent is unpublished).

    In the final pass, a list of public/publishable fields is chosen based
    on the model.  Additional fields not in the model (e.g. parent ID, parent
    organization/collection/entity ID) are packaged.  Then everything is sent
    off to post().

    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param force: boolean Just publish the damn collection already.
    @param backblaze: storage.Backblaze object Look in b2sync tmpdir and mark
                      files uploaded to Backblaze.
    @returns: number successful,list of paths that didn't work out
    """
    logger.debug(f'post_multi({path}, {recursive}, {force}, {backblaze})')
    # Check that path
    try:
        ci = Identifier(path).collection()
    except:
        raise Exception(
            'Docstore.post_multi path must point to a collection or subdirectory.'
        )
    ci_path = Path(ci.id)

    publicfields = _public_fields()

    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        logger.debug(f'Finding files in {path}')
        paths = util.find_meta_files(path, recursive, files_first=1)

    # Determine if paths are publishable or not
    logger.debug(f'Checking for publishability')
    identifiers = [Identifier(path) for path in paths]
    parents = {
        oid: oi.object()
        for oid, oi in _all_parents(identifiers).items()
    }
    paths = publishable(identifiers, parents, force=force)

    # list files in b2 bucket
    # TODO do this in parallel with util.find_meta_files?
    b2_files = []
    if backblaze:
        logger.debug(
            f'Checking Backblaze for uploaded files ({backblaze.bucketname})'
        )
        b2_files = backblaze.list_files(folder=ci.id)
        logger.debug(f'{len(b2_files)} files')

    skipped = 0
    successful = 0
    bad_paths = []

    num = len(paths)
    for n, path in enumerate(paths):
        oi = path.get('identifier')
        if not oi:
            path['note'] = 'No identifier'
            bad_paths.append(path)
            continue
        try:
            document = oi.object()
        except Exception as err:
            path['note'] = f'Could not instantiate: {err}'
            bad_paths.append(path)
            continue
        if not document:
            path['note'] = 'No document'
            bad_paths.append(path)
            continue

        # see if file uploaded to Backblaze
        b2_synced = False
        b2str = ''
        if (oi.model == 'file') and b2_files:
            dir_filename = str(ci_path / Path(document.path).name)
            if dir_filename in b2_files:
                b2_synced = True
                b2str = '(b2)'
                b2_files.remove(dir_filename)

        # TODO write logs instead of print
        now = datetime.now(config.TZ)
        action = path['action']
        path_note = path['note'].strip()
        print(f'{now} | {n+1}/{num} {action} {oi.id} {path_note}{b2str}')

        # see if document exists
        existing_v = None
        d = self.get(
            model=oi.model,
            es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
            document_id=oi.id
        )
        if d:
            existing_v = d.meta.version

        # post document
        if path['action'] == 'POST':
            try:
                created = self.post(document, parents=parents, b2=b2_synced, force=True)
            except Exception as err:
                traceback.print_exc()
            # force=True bypasses publishable in post() function
        # delete previously published items now marked incomplete/private
        elif existing_v and (path['action'] == 'SKIP'):
            print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n + 1, num))
            self.delete(oi.id)

        if path['action'] == 'SKIP':
            skipped += 1
            continue

        # version is incremented with each updated
        posted_v = None
        # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
        d = self.get(
            model=oi.model,
            es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
            document_id=oi.id
        )
        if d:
            posted_v = d.meta.version

        # success: created, or version number incremented
        status = 'ERROR - unspecified'
        if posted_v and not existing_v:
            status = 'CREATED'
            successful += 1
        elif (existing_v and posted_v) and (existing_v < posted_v):
            status = 'UPDATED'
            successful += 1
        elif not posted_v:
            status = 'ERROR: not created'
            bad_paths.append(path)
        print(status)

    logger.debug('INDEXING COMPLETED')
    return {
        'total': len(paths),
        'skipped': skipped,
        'successful': successful,
        'bad': bad_paths
    }
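A minimal usage sketch for post_multi(); it assumes `ds` is an already-constructed Docstore-like instance exposing this method, and the collection path is hypothetical:

# Index a (hypothetical) collection and everything beneath it.
result = ds.post_multi('/var/www/media/base/ddr-test-123', recursive=True, force=True)
print('%s indexed, %s skipped, %s bad' % (
    result['successful'], result['skipped'], len(result['bad'])))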