예제 #1
0
 def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
     """Adds or updates files from a CSV file
     
     TODO how to handle excluded fields like XMP???
     
     @param csv_path: Absolute path to CSV data file.
     @param cidentifier: Identifier
     @param vocabs_path: Absolute path to vocab dir
     @param git_name: str
     @param git_mail: str
     @param agent: str
     @param log_path: str Absolute path to addfile log for all files
     @param dryrun: boolean
     """
     logging.info('batch import files ----------------------------')
     
     # TODO hard-coded model name...
     model = 'file'
     
     csv_dir = os.path.dirname(csv_path)
     logging.debug('csv_dir %s' % csv_dir)
 
     # TODO this still knows too much about entities and files...
     entity_class = identifier.class_for_name(
         identifier.MODEL_CLASSES['entity']['module'],
         identifier.MODEL_CLASSES['entity']['class']
     )
     logging.debug('entity_class %s' % entity_class)
     
     logging.info('Reading %s' % csv_path)
     headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
     logging.info('%s rows' % len(rowds))
     
     # check for modified or uncommitted files in repo
     repository = dvcs.repository(cidentifier.path_abs())
     logging.debug(repository)
 
     fidentifiers = {
         rowd['id']: identifier.Identifier(
             id=rowd['id'],
             base_path=cidentifier.basepath
         )
         for rowd in rowds
     }
     fidentifier_parents = {
         fi.id: Importer._fidentifier_parent(fi)
         for fi in fidentifiers.itervalues()
     }
     # eidentifiers, removing duplicates
     eidentifiers = list(set([e for e in fidentifier_parents.itervalues()]))
     entities = {}
     bad_entities = []
     for eidentifier in eidentifiers:
         if os.path.exists(eidentifier.path_abs()):
             entity = eidentifier.object()
             entities[eidentifier.id] = entity
         else:
             if eidentifier.id not in bad_entities:
                 bad_entities.append(eidentifier.id)
     if bad_entities:
         for f in bad_entities:
             logging.error('    %s missing' % f)
         raise Exception('%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))
 
     # separate into new and existing lists
     rowds_new = []
     rowds_existing = []
     for n,rowd in enumerate(rowds):
         if Importer._file_is_new(fidentifiers[rowd['id']]):
             rowds_new.append(rowd)
         else:
             rowds_existing.append(rowd)
     
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     logging.info('Updating existing files')
     start_updates = datetime.now()
     git_files = []
     updated = []
     elapsed_rounds_updates = []
     staged = []
     obj_metadata = None
     for n,rowd in enumerate(rowds_existing):
         logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
         start_round = datetime.now()
         
         fidentifier = fidentifiers[rowd['id']]
         eidentifier = fidentifier_parents[fidentifier.id]
         entity = entities[eidentifier.id]
         file_ = fidentifier.object()
         modified = file_.load_csv(rowd)
         # Getting obj_metadata takes about 1sec each time
         # TODO caching works as long as all objects have same metadata...
         if not obj_metadata:
             obj_metadata = models.object_metadata(
                 fidentifier.fields_module(),
                 repository.working_dir
             )
         
         if dryrun:
             pass
         elif modified:
             logging.debug('    writing %s' % file_.json_path)
             file_.write_json(obj_metadata=obj_metadata)
             # TODO better to write to collection changelog?
             Importer._write_entity_changelog(entity, git_name, git_mail, agent)
             # stage
             git_files.append(file_.json_path_rel)
             git_files.append(entity.changelog_path_rel)
             updated.append(file_)
         
         elapsed_round = datetime.now() - start_round
         elapsed_rounds_updates.append(elapsed_round)
         logging.debug('| %s (%s)' % (fidentifier, elapsed_round))
     
     elapsed_updates = datetime.now() - start_updates
     logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))
             
     if dryrun:
         pass
     elif git_files:
         logging.info('Staging %s modified files' % len(git_files))
         start_stage = datetime.now()
         dvcs.stage(repository, git_files)
         staged = util.natural_sort(dvcs.list_staged(repository))
         for path in staged:
             if path in git_files:
                 logging.debug('+ %s' % path)
             else:
                 logging.debug('| %s' % path)
         elapsed_stage = datetime.now() - start_stage
         logging.debug('ok (%s)' % elapsed_stage)
         logging.debug('%s staged in %s' % (len(staged), elapsed_stage))
     
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     logging.info('Adding new files')
     start_adds = datetime.now()
     elapsed_rounds_adds = []
     logging.info('Checking source files')
     for rowd in rowds_new:
         rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
         logging.debug('| %s' % rowd['src_path'])
         if not os.path.exists(rowd['src_path']):
             raise Exception('Missing file: %s' % rowd['src_path'])
     if log_path:
         logging.info('addfile logging to %s' % log_path)
     for n,rowd in enumerate(rowds_new):
         logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
         start_round = datetime.now()
         
         fidentifier = fidentifiers[rowd['id']]
         eidentifier = fidentifier_parents[fidentifier.id]
         entity = entities[eidentifier.id]
         logging.debug('| %s' % (entity))
 
         if dryrun:
             pass
         elif Importer._file_is_new(fidentifier):
             # ingest
             # TODO make sure this updates entity.files
             file_,repo2,log2 = ingest.add_file(
                 entity,
                 rowd['src_path'],
                 fidentifier.parts['role'],
                 rowd,
                 git_name, git_mail, agent,
                 log_path=log_path,
                 show_staged=False
             )
         
         elapsed_round = datetime.now() - start_round
         elapsed_rounds_adds.append(elapsed_round)
         logging.debug('| %s (%s)' % (file_, elapsed_round))
     
     elapsed_adds = datetime.now() - start_adds
     logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     
     return git_files
예제 #2
0
 def import_entities(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, dryrun=False):
     """Adds or updates entities from a CSV file
     
     Running function multiple times with the same CSV file is idempotent.
     After the initial pass, files will only be modified if the CSV data
     has been updated.
     
     This function writes and stages files but does not commit them!
     That is left to the user or to another function.
     
     @param csv_path: Absolute path to CSV data file.
     @param cidentifier: Identifier
     @param vocabs_path: Absolute path to vocab dir
     @param git_name: str
     @param git_mail: str
     @param agent: str
     @param dryrun: boolean
     @returns: list of updated entities
     """
     logging.info('------------------------------------------------------------------------')
     logging.info('batch import entity')
     model = 'entity'
     
     repository = dvcs.repository(cidentifier.path_abs())
     logging.info(repository)
     
     logging.info('Reading %s' % csv_path)
     headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
     logging.info('%s rows' % len(rowds))
     
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     logging.info('Importing')
     start_updates = datetime.now()
     git_files = []
     updated = []
     elapsed_rounds = []
     obj_metadata = None
     
     if dryrun:
         logging.info('Dry run - no modifications')
     for n,rowd in enumerate(rowds):
         logging.info('%s/%s - %s' % (n+1, len(rowds), rowd['id']))
         start_round = datetime.now()
         
         eidentifier = identifier.Identifier(id=rowd['id'], base_path=cidentifier.basepath)
         # if there is an existing object it will be loaded
         entity = eidentifier.object()
         if not entity:
             entity = models.Entity.create(eidentifier.path_abs(), eidentifier)
         modified = entity.load_csv(rowd)
         # Getting obj_metadata takes about 1sec each time
         # TODO caching works as long as all objects have same metadata...
         if not obj_metadata:
             obj_metadata = models.object_metadata(
                 eidentifier.fields_module(),
                 repository.working_dir
             )
         
         if dryrun:
             pass
         elif modified:
             # write files
             if not os.path.exists(entity.path_abs):
                 os.makedirs(entity.path_abs)
             logging.debug('    writing %s' % entity.json_path)
             entity.write_json(obj_metadata=obj_metadata)
             # TODO better to write to collection changelog?
             # TODO write all additions to changelog at one time
             Importer._write_entity_changelog(entity, git_name, git_mail, agent)
             # stage
             git_files.append(entity.json_path_rel)
             git_files.append(entity.changelog_path_rel)
             updated.append(entity)
         
         elapsed_round = datetime.now() - start_round
         elapsed_rounds.append(elapsed_round)
         logging.debug('| %s (%s)' % (eidentifier, elapsed_round))
 
     if dryrun:
         logging.info('Dry run - no modifications')
     elif updated:
         logging.info('Staging %s modified files' % len(git_files))
         start_stage = datetime.now()
         dvcs.stage(repository, git_files)
         for path in util.natural_sort(dvcs.list_staged(repository)):
             if path in git_files:
                 logging.debug('+ %s' % path)
             else:
                 logging.debug('| %s' % path)
         elapsed_stage = datetime.now() - start_stage
         logging.debug('ok (%s)' % elapsed_stage)
     
     elapsed_updates = datetime.now() - start_updates
     logging.debug('%s updated in %s' % (len(elapsed_rounds), elapsed_updates))
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     
     return updated