Example #1
 def export(json_paths, model, csv_path, required_only=False):
     """Write the specified objects' data to CSV.
     
     IMPORTANT: All objects in json_paths must have the same set of fields!
     
     TODO let user specify which fields to write
     TODO confirm that each identifier's class matches object_class
     
     @param json_paths: list of str Absolute paths to the objects' .json files.
     @param model: str Model name, e.g. 'entity' or 'file'.
     @param csv_path: str Absolute path to the output CSV file.
     @param required_only: boolean If True, export only required fields.
     @returns: str csv_path
     """
     object_class = identifier.class_for_name(
         identifier.MODEL_CLASSES[model]['module'],
         identifier.MODEL_CLASSES[model]['class']
     )
     module = modules.Module(identifier.module_for_name(
         identifier.MODEL_REPO_MODELS[model]['module']
     ))
     
     if hasattr(object_class, 'xmp') and not hasattr(object_class, 'mets'):
         # File or subclass
         json_paths = models.sort_file_paths(json_paths)
     else:
         # Entity or subclass
         json_paths = util.natural_sort(json_paths)
     json_paths_len = len(json_paths)
     
     Exporter._make_tmpdir(os.path.dirname(csv_path))
     
     headers = module.csv_export_fields(required_only)
     # make sure we export 'id' if it's not in model FIELDS (ahem, files)
     if 'id' not in headers:
         headers.insert(0, 'id')
     
     with codecs.open(csv_path, 'wb', 'utf-8') as csvfile:
         writer = fileio.csv_writer(csvfile)
         # headers in first line
         writer.writerow(headers)
         for n,json_path in enumerate(json_paths):
             i = identifier.Identifier(json_path)
             logging.info('%s/%s - %s' % (n+1, json_paths_len, i.id))
             obj = object_class.from_identifier(i)
             if obj:
                 writer.writerow(obj.dump_csv(headers=headers))
     
     return csv_path
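
A minimal usage sketch for export(). The collection ID and paths below are hypothetical, and export() is assumed to be reachable on the Exporter class that the body references:

# Hypothetical inputs: entity .json files from a collection on disk
json_paths = [
    '/var/www/media/ddr/ddr-test-123/files/ddr-test-123-1/entity.json',
    '/var/www/media/ddr/ddr-test-123/files/ddr-test-123-2/entity.json',
]
csv_path = Exporter.export(
    json_paths,
    'entity',
    '/tmp/ddr-test-123-entities.csv',
    required_only=True
)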
Example #2
@pytest.fixture(scope='session')  # assumed: consumed as a pytest fixture (tmpdir_factory is pytest's built-in factory fixture)
def publishable_objects(tmpdir_factory):
    fn = tmpdir_factory.mktemp('repo').join(COLLECTION_IDS[0])
    repo_path = str(fn)
    repo = dvcs.initialize_repository(repo_path, GIT_USER, GIT_MAIL)
    basepath = os.path.dirname(repo_path)
    objects = []
    for oid in COLLECTION_IDS:
        oi = identifier.Identifier(oid, basepath)
        model_class = identifier.class_for_name(
            identifier.MODEL_CLASSES[oi.model]['module'],
            identifier.MODEL_CLASSES[oi.model]['class'])
        o = model_class.new(oi)
        if o.identifier.model == 'file':
            o.sha1 = o.identifier.idparts['sha1']
        o.save(GIT_USER, GIT_MAIL, AGENT)
        objects.append(o)
    return objects
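
A sketch of how a test might consume this fixture; the test name and assertions are illustrative, not from the source:

def test_publishable_objects(publishable_objects):
    # pytest injects the fixture's return value by argument name
    assert len(publishable_objects) == len(COLLECTION_IDS)
    for o in publishable_objects:
        assert o.identifier.id in COLLECTION_IDS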
Example #3
 def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
     """Adds or updates files from a CSV file
     
     TODO how to handle excluded fields like XMP???
     
     @param csv_path: Absolute path to CSV data file.
     @param cidentifier: Identifier
     @param vocabs_path: Absolute path to vocab dir
     @param git_name: str
     @param git_mail: str
     @param agent: str
     @param log_path: str Absolute path to addfile log for all files
     @param dryrun: boolean If True, parse and check but do not write or stage anything.
     @returns: list Relative paths of updated files staged to the repository.
     """
     logging.info('batch import files ----------------------------')
     
     # TODO hard-coded model name...
     model = 'file'
     
     csv_dir = os.path.dirname(csv_path)
     logging.debug('csv_dir %s' % csv_dir)
 
     # TODO this still knows too much about entities and files...
     entity_class = identifier.class_for_name(
         identifier.MODEL_CLASSES['entity']['module'],
         identifier.MODEL_CLASSES['entity']['class']
     )
     logging.debug('entity_class %s' % entity_class)
     
     logging.info('Reading %s' % csv_path)
     headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
     logging.info('%s rows' % len(rowds))
     
     # open the collection repository; staged files are checked against it below
     repository = dvcs.repository(cidentifier.path_abs())
     logging.debug(repository)
 
     # map row IDs to file Identifiers, resolved against the collection's base path
     fidentifiers = {
         rowd['id']: identifier.Identifier(
             id=rowd['id'],
             base_path=cidentifier.basepath
         )
         for rowd in rowds
     }
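     # map each file Identifier to its parent entity Identifier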
     fidentifier_parents = {
         fi.id: Importer._fidentifier_parent(fi)
         for fi in fidentifiers.itervalues()
     }
     # parent entity Identifiers, with duplicates removed
     eidentifiers = list(set(fidentifier_parents.itervalues()))
     entities = {}
     bad_entities = []
     for eidentifier in eidentifiers:
         if os.path.exists(eidentifier.path_abs()):
             entity = eidentifier.object()
             entities[eidentifier.id] = entity
         else:
             if eidentifier.id not in bad_entities:
                 bad_entities.append(eidentifier.id)
     if bad_entities:
         for eid in bad_entities:
             logging.error('    %s missing' % eid)
         raise Exception('%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))
 
     # separate into new and existing lists
     rowds_new = []
     rowds_existing = []
     for rowd in rowds:
         if Importer._file_is_new(fidentifiers[rowd['id']]):
             rowds_new.append(rowd)
         else:
             rowds_existing.append(rowd)
     
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     logging.info('Updating existing files')
     start_updates = datetime.now()
     git_files = []
     updated = []
     elapsed_rounds_updates = []
     staged = []
     obj_metadata = None
     for n,rowd in enumerate(rowds_existing):
         logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds_existing), rowd['id'], rowd['basename_orig']))
         start_round = datetime.now()
         
         fidentifier = fidentifiers[rowd['id']]
         eidentifier = fidentifier_parents[fidentifier.id]
         entity = entities[eidentifier.id]
         file_ = fidentifier.object()
         modified = file_.load_csv(rowd)
         # Getting obj_metadata takes about 1sec each time
         # TODO caching works as long as all objects have same metadata...
         if not obj_metadata:
             obj_metadata = models.object_metadata(
                 fidentifier.fields_module(),
                 repository.working_dir
             )
         
         if dryrun:
             pass
         elif modified:
             logging.debug('    writing %s' % file_.json_path)
             file_.write_json(obj_metadata=obj_metadata)
             # TODO better to write to collection changelog?
             Importer._write_entity_changelog(entity, git_name, git_mail, agent)
             # stage
             git_files.append(file_.json_path_rel)
             git_files.append(entity.changelog_path_rel)
             updated.append(file_)
         
         elapsed_round = datetime.now() - start_round
         elapsed_rounds_updates.append(elapsed_round)
         logging.debug('| %s (%s)' % (fidentifier, elapsed_round))
     
     elapsed_updates = datetime.now() - start_updates
     logging.debug('%s updated in %s' % (len(updated), elapsed_updates))
             
     if dryrun:
         pass
     elif git_files:
         logging.info('Staging %s modified files' % len(git_files))
         start_stage = datetime.now()
         dvcs.stage(repository, git_files)
         staged = util.natural_sort(dvcs.list_staged(repository))
         for path in staged:
             if path in git_files:
                 logging.debug('+ %s' % path)
             else:
                 logging.debug('| %s' % path)
         elapsed_stage = datetime.now() - start_stage
         logging.debug('ok (%s)' % elapsed_stage)
         logging.debug('%s staged in %s' % (len(staged), elapsed_stage))
     
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     logging.info('Adding new files')
     start_adds = datetime.now()
     elapsed_rounds_adds = []
     logging.info('Checking source files')
     for rowd in rowds_new:
         rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
         logging.debug('| %s' % rowd['src_path'])
         if not os.path.exists(rowd['src_path']):
             raise Exception('Missing file: %s' % rowd['src_path'])
     if log_path:
         logging.info('addfile logging to %s' % log_path)
     for n,rowd in enumerate(rowds_new):
         logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds_new), rowd['id'], rowd['basename_orig']))
         start_round = datetime.now()
         
         fidentifier = fidentifiers[rowd['id']]
         eidentifier = fidentifier_parents[fidentifier.id]
         entity = entities[eidentifier.id]
         logging.debug('| %s' % (entity))
 
         if dryrun:
             pass
         elif Importer._file_is_new(fidentifier):
             # ingest
             # TODO make sure this updates entity.files
             file_,repo2,log2 = ingest.add_file(
                 entity,
                 rowd['src_path'],
                 fidentifier.parts['role'],
                 rowd,
                 git_name, git_mail, agent,
                 log_path=log_path,
                 show_staged=False
             )
         
         elapsed_round = datetime.now() - start_round
         elapsed_rounds_adds.append(elapsed_round)
         # log the identifier, not file_, which is unbound on dryrun rows
         logging.debug('| %s (%s)' % (fidentifier, elapsed_round))
     
     elapsed_adds = datetime.now() - start_adds
     logging.debug('%s new files processed in %s' % (len(elapsed_rounds_adds), elapsed_adds))
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     
     return git_files
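
A hedged invocation sketch for import_files(); the collection ID, paths, and credentials are placeholders:

# Hypothetical collection Identifier (positional args as in Example #2)
cidentifier = identifier.Identifier('ddr-test-123', '/var/www/media/ddr')
git_files = import_files(
    '/tmp/ddr-test-123-files.csv',
    cidentifier,
    '/opt/ddr/vocab',
    'gitname', 'gitmail@example.com', 'ddr-cmdln',
    log_path='/tmp/addfile.log',
    dryrun=True  # parse and validate the CSV without writing or staging anything
)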