def export(json_paths, model, csv_path, required_only=False):
    """Write the specified objects' data to CSV.

    IMPORTANT: All objects in json_paths must have the same set of fields!

    TODO let user specify which fields to write
    TODO confirm that each identifier's class matches object_class

    @param json_paths: list of .json files
    @param model: str
    @param csv_path: Absolute path to CSV data file.
    @param required_only: boolean Only required fields.
    """
    object_class = identifier.class_for_name(
        identifier.MODEL_CLASSES[model]['module'],
        identifier.MODEL_CLASSES[model]['class']
    )
    module = modules.Module(identifier.module_for_name(
        identifier.MODEL_REPO_MODELS[model]['module']
    ))

    if hasattr(object_class, 'xmp') and not hasattr(object_class, 'mets'):
        # File or subclass
        json_paths = models.sort_file_paths(json_paths)
    else:
        # Entity or subclass
        json_paths = util.natural_sort(json_paths)
    json_paths_len = len(json_paths)

    Exporter._make_tmpdir(os.path.dirname(csv_path))

    headers = module.csv_export_fields(required_only)
    # make sure we export 'id' if it's not in model FIELDS (ahem, files)
    if 'id' not in headers:
        headers.insert(0, 'id')

    with codecs.open(csv_path, 'wb', 'utf-8') as csvfile:
        writer = fileio.csv_writer(csvfile)
        # headers in first line
        writer.writerow(headers)
        for n,json_path in enumerate(json_paths):
            i = identifier.Identifier(json_path)
            logging.info('%s/%s - %s' % (n+1, json_paths_len, i.id))
            obj = object_class.from_identifier(i)
            if obj:
                writer.writerow(obj.dump_csv(headers=headers))

    return csv_path
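# Hedged usage sketch (not from the source): one way export() above might be
# called to dump a collection's entity records to CSV. The collection path,
# glob pattern, model name, and output CSV path are hypothetical examples;
# glob from the standard library is used for file discovery so that no extra
# DDR-specific helper API is assumed.
def _example_export_entities():
    import glob
    json_paths = glob.glob(
        '/var/www/media/ddr/ddr-testing-123/files/*/entity.json'
    )
    return export(
        json_paths, 'entity', '/tmp/ddr-testing-123-entities.csv',
        required_only=False
    )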
@pytest.fixture
def publishable_objects(tmpdir_factory):
    fn = tmpdir_factory.mktemp('repo').join(COLLECTION_IDS[0])
    repo_path = str(fn)
    repo = dvcs.initialize_repository(repo_path, GIT_USER, GIT_MAIL)
    basepath = os.path.dirname(repo_path)
    objects = []
    for oid in COLLECTION_IDS:
        oi = identifier.Identifier(oid, basepath)
        model_class = identifier.class_for_name(
            identifier.MODEL_CLASSES[oi.model]['module'],
            identifier.MODEL_CLASSES[oi.model]['class']
        )
        o = model_class.new(oi)
        if o.identifier.model == 'file':
            o.sha1 = o.identifier.idparts['sha1']
        o.save(GIT_USER, GIT_MAIL, AGENT)
        objects.append(o)
    return objects
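# Hedged sketch (not from the source): a test consuming the fixture above.
# The assertions are illustrative and only rely on attributes used elsewhere
# in these modules (o.identifier, Identifier.id); COLLECTION_IDS is assumed
# to be the module-level list of IDs the fixture iterates over.
def test_publishable_objects_created(publishable_objects):
    # the fixture saves and returns one object per requested ID
    assert len(publishable_objects) == len(COLLECTION_IDS)
    for o in publishable_objects:
        assert o.identifier.id in COLLECTION_IDS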
def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
    """Adds or updates files from a CSV file

    TODO how to handle excluded fields like XMP???

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param log_path: str Absolute path to addfile log for all files
    @param dryrun: boolean
    """
    logging.info('batch import files ----------------------------')

    # TODO hard-coded model name...
    model = 'file'

    csv_dir = os.path.dirname(csv_path)
    logging.debug('csv_dir %s' % csv_dir)

    # TODO this still knows too much about entities and files...
    entity_class = identifier.class_for_name(
        identifier.MODEL_CLASSES['entity']['module'],
        identifier.MODEL_CLASSES['entity']['class']
    )
    logging.debug('entity_class %s' % entity_class)

    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))

    # check for modified or uncommitted files in repo
    repository = dvcs.repository(cidentifier.path_abs())
    logging.debug(repository)

    fidentifiers = {
        rowd['id']: identifier.Identifier(
            id=rowd['id'],
            base_path=cidentifier.basepath
        )
        for rowd in rowds
    }
    fidentifier_parents = {
        fi.id: Importer._fidentifier_parent(fi)
        for fi in fidentifiers.itervalues()
    }
    # eidentifiers, removing duplicates
    eidentifiers = list(set([e for e in fidentifier_parents.itervalues()]))
    entities = {}
    bad_entities = []
    for eidentifier in eidentifiers:
        if os.path.exists(eidentifier.path_abs()):
            entity = eidentifier.object()
            entities[eidentifier.id] = entity
        else:
            if eidentifier.id not in bad_entities:
                bad_entities.append(eidentifier.id)
    if bad_entities:
        for f in bad_entities:
            logging.error('    %s missing' % f)
        raise Exception('%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))

    # separate into new and existing lists
    rowds_new = []
    rowds_existing = []
    for n,rowd in enumerate(rowds):
        if Importer._file_is_new(fidentifiers[rowd['id']]):
            rowds_new.append(rowd)
        else:
            rowds_existing.append(rowd)

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Updating existing files')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds_updates = []
    staged = []
    obj_metadata = None
    for n,rowd in enumerate(rowds_existing):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        file_ = fidentifier.object()
        modified = file_.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                fidentifier.fields_module(),
                repository.working_dir
            )

        if dryrun:
            pass
        elif modified:
            logging.debug('    writing %s' % file_.json_path)
            file_.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(file_.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(file_)

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_updates.append(elapsed_round)
        logging.debug('| %s (%s)' % (fidentifier, elapsed_round))

    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))

    if dryrun:
        pass
    elif git_files:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        staged = util.natural_sort(dvcs.list_staged(repository))
        for path in staged:
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
        logging.debug('%s staged in %s' % (len(staged), elapsed_stage))

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Adding new files')
    start_adds = datetime.now()
    elapsed_rounds_adds = []
    logging.info('Checking source files')
    for rowd in rowds_new:
        rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
        logging.debug('| %s' % rowd['src_path'])
        if not os.path.exists(rowd['src_path']):
            raise Exception('Missing file: %s' % rowd['src_path'])
    if log_path:
        logging.info('addfile logging to %s' % log_path)
    for n,rowd in enumerate(rowds_new):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        logging.debug('| %s' % (entity))

        if dryrun:
            pass
        elif Importer._file_is_new(fidentifier):
            # ingest
            # TODO make sure this updates entity.files
            file_,repo2,log2 = ingest.add_file(
                entity,
                rowd['src_path'],
                fidentifier.parts['role'],
                rowd,
                git_name, git_mail, agent,
                log_path=log_path,
                show_staged=False
            )

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_adds.append(elapsed_round)
        logging.debug('| %s (%s)' % (file_, elapsed_round))

    elapsed_adds = datetime.now() - start_adds
    logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')

    return git_files
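# Hedged usage sketch (not from the source): a typical dry-run call to
# import_files() above. The collection ID, base path, CSV path, vocab dir,
# git identity, and agent string are hypothetical; the collection Identifier
# is constructed with the same id/base_path keywords the function itself uses
# when building file identifiers from CSV rows.
def _example_import_files_dryrun():
    cidentifier = identifier.Identifier(
        id='ddr-testing-123',
        base_path='/var/www/media/ddr'
    )
    return import_files(
        csv_path='/tmp/ddr-testing-123-files.csv',
        cidentifier=cidentifier,
        vocabs_path='/opt/ddr/vocab',
        git_name='Example User',
        git_mail='user@example.com',
        agent='example-agent',
        log_path='/tmp/addfile.log',
        dryrun=True,  # dry-run: skips writing JSON, staging, and ingest
    )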