def fieldcsv(collectionsdir, cidpattern, model, fieldname, csvfile):
    """Export one field's values for every matching collection.

    Collection metadata files are located under collectionsdir with
    util.find_meta_files, ordered with util.natural_sort, and kept only if
    cidpattern occurs in their path.  Each remaining collection is passed
    to batch.Exporter.export_field_csv.  An exception raised while
    handling one collection is printed and the loop moves on to the next.

    @param collectionsdir str: Base directory searched for collections.
    @param cidpattern str: Substring a collection path must contain.
    @param model str: Passed through to export_field_csv.
    @param fieldname str: Passed through to export_field_csv.
    @param csvfile str: Passed through to export_field_csv as csv_path.
    """
    found = util.find_meta_files(
        basedir=collectionsdir, model='collection', recursive=1, force_read=1
    )
    # Filter everything first, then export, so the selection is complete
    # before any export starts.
    matching = []
    for path in util.natural_sort(found):
        if cidpattern in path:
            matching.append(path)
    for collection_path in matching:
        print(collection_path)
        try:
            collection = identifier.Identifier(collection_path).object()
            batch.Exporter.export_field_csv(
                collection=collection,
                model=model,
                fieldname=fieldname,
                csv_path=csvfile,
            )
        except Exception as err:
            print('ERROR: %s' % err)
def _store( signatures, object_id, file_id ): if signatures.get(object_id,None): filenames = [signatures[object_id], file_id] first = util.natural_sort(filenames)[0] if file_id == first: signatures[object_id] = file_id else: signatures[object_id] = file_id
def collection_paths(collections_root, repository, organization):
    """Returns collection paths.

    An entry of collections_root is reported when its name matches
    "<repository>-<organization>-<digits>", it is a directory, and that
    directory directly contains an entry named collection.json.

    TODO use util.find_meta_files()

    @param collections_root: str Directory whose entries are scanned.
    @param repository: str First component of the collection IDs to match.
    @param organization: str Second component of the collection IDs to match.
    @returns: list of matching directory paths as ordered by util.natural_sort
    """
    # NOTE(review): repository and organization are interpolated into the
    # pattern unescaped, so any regex metacharacters in them are interpreted.
    cid_pattern = re.compile('^{}-{}-[0-9]+$'.format(repository, organization))
    paths = []
    for name in os.listdir(collections_root):
        if not cid_pattern.search(name):
            continue
        colldir = os.path.join(collections_root, name)
        # A stray regular file whose name looks like a collection ID would
        # make os.listdir() raise; skip anything that is not a directory.
        if not os.path.isdir(colldir):
            continue
        if 'collection.json' in os.listdir(colldir):
            paths.append(colldir)
    return util.natural_sort(paths)
def children(self, quick=False):
    """List the Entity objects found under this Collection's files_path.

    >>> c = Collection.from_json('/tmp/ddr-testing-123')
    >>> c.children()
    [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]

    TODO use util.find_meta_files()

    @param quick: Boolean Build lightweight ListEntity objects (id, title,
        signature) by scanning entity.json text instead of loading each
        Entity in full.
    @returns: list of Entities or ListEntity
    """
    if os.path.exists(self.files_path):
        # TODO use cached list if available
        candidate_dirs = [
            os.path.join(self.files_path, name)
            for name in os.listdir(self.files_path)
        ]
    else:
        candidate_dirs = []

    results = []
    for entity_dir in util.natural_sort(candidate_dirs):
        if not quick:
            loaded = Entity.from_identifier(Identifier(path=entity_dir))
            for pair in loaded.labels_values():
                if pair['label'] == 'title':
                    loaded.title = pair['value']
            results.append(loaded)
            continue

        # fake Entity with just enough info for lists
        json_path = os.path.join(entity_dir, 'entity.json')
        if not os.path.exists(json_path):
            continue
        stub = ListEntity()
        stub.identifier = Identifier(path=entity_dir)
        stub.id = stub.identifier.id
        for text in fileio.read_text(json_path).split('\n'):
            # Each interesting line is wrapped in braces and parsed as a
            # one-key JSON object.
            if '"title":' in text:
                stub.title = json.loads('{%s}' % text)['title']
            elif '"signature_id":' in text:
                stub.signature_id = json.loads('{%s}' % text)['signature_id']
                stub.signature_abs = common.signature_abs(
                    stub, self.identifier.basepath)
            if stub.title and stub.signature_id:
                # stop once we have what we need so we don't waste time
                # and have entity.children as separate ghost entities
                break
        results.append(stub)
    return results
def export(json_paths, model, csv_path, required_only=False):
    """Write one CSV row per object described by json_paths.

    IMPORTANT: All objects in json_paths must have the same set of fields!

    The first row holds the headers returned by the model module's
    csv_export_fields(); 'id' is put in front if it is not among them.
    Paths are ordered with models.sort_file_paths when the object class
    has an 'xmp' attribute and no 'mets' attribute, otherwise with
    util.natural_sort.

    TODO let user specify which fields to write
    TODO confirm that each identifier's class matches object_class

    @param json_paths: list of .json files
    @param model: str
    @param csv_path: Absolute path to CSV data file.
    @param required_only: boolean Only required fields.
    @returns: csv_path
    """
    class_info = identifier.MODEL_CLASSES[model]
    object_class = identifier.class_for_name(
        class_info['module'], class_info['class']
    )
    fields_module = identifier.module_for_name(
        identifier.MODEL_REPO_MODELS[model]['module']
    )
    module = modules.Module(fields_module)

    has_xmp_only = hasattr(object_class, 'xmp') and not hasattr(object_class, 'mets')
    if has_xmp_only:
        # File or subclass
        ordered_paths = models.sort_file_paths(json_paths)
    else:
        # Entity or subclass
        ordered_paths = util.natural_sort(json_paths)
    total = len(ordered_paths)

    Exporter._make_tmpdir(os.path.dirname(csv_path))
    headers = module.csv_export_fields(required_only)
    # make sure we export 'id' if it's not in model FIELDS (ahem, files)
    if 'id' not in headers:
        headers.insert(0, 'id')

    with codecs.open(csv_path, 'wb', 'utf-8') as csvfile:
        writer = fileio.csv_writer(csvfile)
        # headers in first line
        writer.writerow(headers)
        for position, json_path in enumerate(ordered_paths, start=1):
            oid = identifier.Identifier(json_path)
            logging.info('%s/%s - %s' % (position, total, oid.id))
            obj = object_class.from_identifier(oid)
            if obj:
                writer.writerow(obj.dump_csv(headers=headers))
    return csv_path
def children(self, quick=False):
    """List the Entity objects found under this Collection's files_path.

    >>> c = Collection.from_json('/tmp/ddr-testing-123')
    >>> c.children()
    [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]

    TODO use util.find_meta_files()

    @param quick: Boolean Build lightweight ListEntity objects (id, title,
        signature) from the parsed entity.json instead of loading each
        Entity in full.
    @returns: list of Entities or ListEntity
    """
    if os.path.exists(self.files_path):
        # TODO use cached list if available
        candidate_dirs = [
            os.path.join(self.files_path, name)
            for name in os.listdir(self.files_path)
        ]
    else:
        candidate_dirs = []

    results = []
    for entity_dir in util.natural_sort(candidate_dirs):
        if not quick:
            loaded = Entity.from_identifier(Identifier(path=entity_dir))
            for pair in loaded.labels_values():
                if pair['label'] == 'title':
                    loaded.title = pair['value']
            results.append(loaded)
            continue

        # fake Entity with just enough info for lists
        json_path = os.path.join(entity_dir, 'entity.json')
        if not os.path.exists(json_path):
            continue
        with open(json_path, 'r') as f:
            records = json.load(f)
        stub = ListEntity()
        stub.identifier = Identifier(path=entity_dir)
        stub.id = stub.identifier.id
        # The first record is skipped; the remaining ones are scanned for
        # the two keys of interest.
        for record in records[1:]:
            if 'title' in record.keys():
                stub.title = record['title']
            elif 'signature_id' in record.keys():
                stub.signature_id = record['signature_id']
                stub.signature_abs = common.signature_abs(
                    stub, self.identifier.basepath)
        results.append(stub)
    return results
def sort_file_paths(json_paths, rank='role-eid-sort'):
    """Sort file JSON paths in human-friendly order.

    TODO this belongs in DDR.identifier

    A sort key is built for every path from the role, eid and sha1 parts
    of its Identifier plus a "sort" value scraped from the file's text.
    Keys are ordered with util.natural_sort and paths are emitted in the
    order their keys come off the END of that sorted list.  Paths that
    produce the same key collapse to a single entry.

    @param json_paths: list of paths to file .json files; left unmodified.
    @param rank: 'role-eid-sort' or 'eid-sort-role'
    @returns: list of paths
    @raises ValueError: rank is not one of the two supported orderings.
    """
    # Fail clearly up front instead of hitting an unbound `key` later on.
    if rank not in ('role-eid-sort', 'eid-sort-role'):
        raise ValueError('Unknown rank: %r' % (rank,))
    paths = {}
    keys = []
    # Walk the input back-to-front (the visiting order of repeated
    # list.pop()) without emptying the caller's list.
    for path in reversed(list(json_paths)):
        parts = Identifier(path=path).parts
        # All key components are stringified so a missing part becomes
        # 'None' rather than breaking the join below.
        eid = str(parts.get('eid', None))
        role = str(parts.get('role', None))
        sha1 = str(parts.get('sha1', None))
        sort = 0
        with open(path, 'r') as f:
            for line in f:
                # NOTE(review): this matches any line containing "sort",
                # not only the "sort" field; the last match wins.
                if 'sort' in line and ':' in line:
                    sort = line.split(':')[1].replace('"', '').strip()
        sort = str(sort)
        if rank == 'eid-sort-role':
            key = '-'.join([eid, sort, role, sha1])
        else:
            key = '-'.join([role, eid, sort, sha1])
        paths[key] = path
        keys.append(key)
    paths_sorted = []
    # Keys are consumed from the end of the sorted list, as before.
    for key in reversed(list(util.natural_sort(keys))):
        val = paths.pop(key, None)
        if val:
            paths_sorted.append(val)
    return paths_sorted
def sort_file_paths(json_paths, rank='role-eid-sort'):
    """Sort file JSON paths in human-friendly order.

    TODO this belongs in DDR.identifier

    A sort key is built for every path from the role, eid and sha1 parts
    of its Identifier plus a "sort" value scraped from the file's text
    (read with fileio.read_text).  Keys are ordered with
    util.natural_sort and paths are emitted in the order their keys come
    off the END of that sorted list.  Paths that produce the same key
    collapse to a single entry.

    @param json_paths: list of paths to file .json files; left unmodified.
    @param rank: 'role-eid-sort' or 'eid-sort-role'
    @returns: list of paths
    @raises ValueError: rank is not one of the two supported orderings.
    """
    # Fail clearly up front instead of hitting an unbound `key` later on.
    if rank not in ('role-eid-sort', 'eid-sort-role'):
        raise ValueError('Unknown rank: %r' % (rank,))
    paths = {}
    keys = []
    # Walk the input back-to-front (the visiting order of repeated
    # list.pop()) without emptying the caller's list.
    for path in reversed(list(json_paths)):
        parts = Identifier(path=path).parts
        # All key components are stringified so a missing part becomes
        # 'None' rather than breaking the join below.
        eid = str(parts.get('eid', None))
        role = str(parts.get('role', None))
        sha1 = str(parts.get('sha1', None))
        sort = 0
        for line in fileio.read_text(path).splitlines():
            # NOTE(review): this matches any line containing "sort", not
            # only the "sort" field; the last match wins.
            if 'sort' in line and ':' in line:
                sort = line.split(':')[1].replace('"', '').strip()
        sort = str(sort)
        if rank == 'eid-sort-role':
            key = '-'.join([eid, sort, role, sha1])
        else:
            key = '-'.join([role, eid, sort, sha1])
        paths[key] = path
        keys.append(key)
    paths_sorted = []
    # Keys are consumed from the end of the sorted list, as before.
    for key in reversed(list(util.natural_sort(keys))):
        val = paths.pop(key, None)
        if val:
            paths_sorted.append(val)
    return paths_sorted
def test_natural_sort():
    """natural_sort orders numeric strings by value, sorting the list in place."""
    values = ['11', '1', '12', '2', '13', '3']
    expected = ['1', '2', '3', '11', '12', '13']
    util.natural_sort(values)
    assert values == expected
def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
    """Adds or updates files from a CSV file
    
    Rows are split by Importer._file_is_new().  Rows for existing files
    are loaded into the file object, written back when load_csv() reports
    a modification, and their JSON/changelog paths are staged.  Rows for
    new files are passed to ingest.add_file() together with a source file
    expected in the same directory as the CSV.
    
    TODO how to handle excluded fields like XMP???
    
    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
        (not referenced in this function body)
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param log_path: str Absolute path to addfile log for all files
    @param dryrun: boolean If true, the write_json/changelog, staging and
        ingest.add_file steps are skipped.
    @returns: list of relative paths collected for staging while updating
        existing files
    @raises Exception: if a parent entity's directory does not exist, or
        if the source file for a new row does not exist
    """
    logging.info('batch import files ----------------------------')
    
    # TODO hard-coded model name...
    # NOTE: model is assigned but not read again in this function.
    model = 'file'
    
    # Source files for new rows are looked up relative to this directory.
    csv_dir = os.path.dirname(csv_path)
    logging.debug('csv_dir %s' % csv_dir)

    # TODO this still knows too much about entities and files...
    # NOTE: entity_class is only logged below; it is not used afterwards.
    entity_class = identifier.class_for_name(
        identifier.MODEL_CLASSES['entity']['module'],
        identifier.MODEL_CLASSES['entity']['class']
    )
    logging.debug('entity_class %s' % entity_class)
    
    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))
    
    # check for modified or uncommitted files in repo
    repository = dvcs.repository(cidentifier.path_abs())
    logging.debug(repository)

    # Map each row's file ID to its Identifier.
    fidentifiers = {
        rowd['id']: identifier.Identifier(
            id=rowd['id'],
            base_path=cidentifier.basepath
        )
        for rowd in rowds
    }
    # Map each file ID to the Identifier of its parent.
    fidentifier_parents = {
        fi.id: Importer._fidentifier_parent(fi)
        for fi in fidentifiers.itervalues()
    }
    # eidentifiers, removing duplicates
    eidentifiers = list(set([e for e in fidentifier_parents.itervalues()]))
    # Load every parent entity up front; abort if any is missing on disk.
    entities = {}
    bad_entities = []
    for eidentifier in eidentifiers:
        if os.path.exists(eidentifier.path_abs()):
            entity = eidentifier.object()
            entities[eidentifier.id] = entity
        else:
            if eidentifier.id not in bad_entities:
                bad_entities.append(eidentifier.id)
    if bad_entities:
        for f in bad_entities:
            logging.error(' %s missing' % f)
        raise Exception('%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))

    # separate into new and existing lists
    rowds_new = []
    rowds_existing = []
    for n,rowd in enumerate(rowds):
        if Importer._file_is_new(fidentifiers[rowd['id']]):
            rowds_new.append(rowd)
        else:
            rowds_existing.append(rowd)
    
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Updating existing files')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds_updates = []
    staged = []
    obj_metadata = None
    for n,rowd in enumerate(rowds_existing):
        # NOTE(review): the denominator is len(rowds) (all rows), not
        # len(rowds_existing) - confirm this is intended.
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()
        
        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        file_ = fidentifier.object()
        # load_csv() runs even in a dry run; only the write below is skipped.
        modified = file_.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                fidentifier.fields_module(),
                repository.working_dir
            )
        
        if dryrun:
            pass
        elif modified:
            logging.debug(' writing %s' % file_.json_path)
            file_.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(file_.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(file_)
        
        elapsed_round = datetime.now() - start_round
        elapsed_rounds_updates.append(elapsed_round)
        logging.debug('| %s (%s)' % (fidentifier, elapsed_round))
    
    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))
    
    if dryrun:
        pass
    elif git_files:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        # Log everything currently staged: '+' marks paths staged by this
        # run, '|' marks any other staged path.
        staged = util.natural_sort(dvcs.list_staged(repository))
        for path in staged:
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
        logging.debug('%s staged in %s' % (len(staged), elapsed_stage))
    
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Adding new files')
    start_adds = datetime.now()
    elapsed_rounds_adds = []
    logging.info('Checking source files')
    # Verify all source files before ingesting any of them.  This check is
    # not guarded by dryrun, so a missing file raises in a dry run too.
    for rowd in rowds_new:
        rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
        logging.debug('| %s' % rowd['src_path'])
        if not os.path.exists(rowd['src_path']):
            raise Exception('Missing file: %s' % rowd['src_path'])
    if log_path:
        logging.info('addfile logging to %s' % log_path)
    for n,rowd in enumerate(rowds_new):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()
        
        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        logging.debug('| %s' % (entity))
        
        if dryrun:
            pass
        elif Importer._file_is_new(fidentifier):
            # ingest
            # TODO make sure this updates entity.files
            file_,repo2,log2 = ingest.add_file(
                entity,
                rowd['src_path'],
                fidentifier.parts['role'],
                rowd,
                git_name, git_mail, agent,
                log_path=log_path,
                show_staged=False
            )
        
        elapsed_round = datetime.now() - start_round
        elapsed_rounds_adds.append(elapsed_round)
        # NOTE(review): when the branch above is skipped (e.g. dry run),
        # file_ is whatever an earlier iteration or the update loop bound,
        # or is unbound if nothing did - confirm and guard if needed.
        logging.debug('| %s (%s)' % (file_, elapsed_round))
    
    elapsed_adds = datetime.now() - start_adds
    logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    
    # Only paths collected in the update phase are returned.
    return git_files
def import_entities(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, dryrun=False):
    """Adds or updates entities from a CSV file
    
    Running function multiple times with the same CSV file is idempotent.
    After the initial pass, files will only be modified if the CSV data
    has been updated.
    
    This function writes and stages files but does not commit them!
    That is left to the user or to another function.
    
    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
        (not referenced in this function body)
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param dryrun: boolean If true, nothing is written or staged.
    @returns: list of updated entities
    """
    logging.info('------------------------------------------------------------------------')
    logging.info('batch import entity')
    # NOTE: model is assigned but not read again in this function.
    model = 'entity'
    
    repository = dvcs.repository(cidentifier.path_abs())
    logging.info(repository)
    
    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))
    
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Importing')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds = []
    obj_metadata = None
    
    if dryrun:
        logging.info('Dry run - no modifications')
    for n,rowd in enumerate(rowds):
        logging.info('%s/%s - %s' % (n+1, len(rowds), rowd['id']))
        start_round = datetime.now()
        
        eidentifier = identifier.Identifier(id=rowd['id'], base_path=cidentifier.basepath)
        # if there is an existing object it will be loaded
        entity = eidentifier.object()
        if not entity:
            entity = models.Entity.create(eidentifier.path_abs(), eidentifier)
        # load_csv() runs even in a dry run; only the write below is skipped.
        modified = entity.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                eidentifier.fields_module(),
                repository.working_dir
            )
        
        if dryrun:
            pass
        elif modified:
            # write files
            if not os.path.exists(entity.path_abs):
                os.makedirs(entity.path_abs)
            logging.debug(' writing %s' % entity.json_path)
            entity.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            # TODO write all additions to changelog at one time
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(entity.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(entity)
        
        elapsed_round = datetime.now() - start_round
        elapsed_rounds.append(elapsed_round)
        logging.debug('| %s (%s)' % (eidentifier, elapsed_round))

    if dryrun:
        logging.info('Dry run - no modifications')
    elif updated:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        # Log everything currently staged: '+' marks paths staged by this
        # run, '|' marks any other staged path.
        for path in util.natural_sort(dvcs.list_staged(repository)):
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
    
    elapsed_updates = datetime.now() - start_updates
    # The count logged here is the number of rows processed
    # (len(elapsed_rounds)), not the number of entities in `updated`.
    logging.debug('%s updated in %s' % (len(elapsed_rounds), elapsed_updates))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    
    return updated