Example #1
def set_change_from_cache(change, change_dir):
    # Each change category is cached in its own file: list-style categories
    # hold one path per line, dict-style categories hold 'key: value' lines.
    change_types = {'ADDED': list, 'DELETED': list,
                    'MODIFIED': dict, 'METACHANGE': dict,
                    'UNCHANGED': list}
    change_data = {}
    for change_type in change_types:
        change_file = os.path.join(change_dir, change_type)
        if change_types[change_type] == list:
            with open(change_file, 'r') as f:
                change_data[change_type] = [line.strip() for line in f]
        else:
            with open(change_file, 'r') as f:
                change_dict = {}
                for line in f:
                    # split on the first ':' only, so values may contain colons
                    key, value = line.split(':', 1)
                    change_dict[key] = value.strip()
                change_data[change_type] = change_dict

    change.add_changes(change_data['ADDED'], change_data['DELETED'],
                       change_data['MODIFIED'], change_data['UNCHANGED'],
                       change_data['METACHANGE'])
    change.calculate_degree()

    metainfo = dacman_utils.load_yaml(os.path.join(change_dir, 'META_INFO'))
    change.old_nfiles = metainfo['base']['nfiles']
    change.new_nfiles = metainfo['revision']['nfiles']
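
For context, below is a minimal sketch of the cache layout this function reads. The directory path and file contents are hypothetical; only the file names and their line formats follow from the code above.

import os

# Hypothetical cache directory produced by an earlier diff run.
change_dir = '/tmp/dacman_cache/example'
os.makedirs(change_dir, exist_ok=True)

# List-style categories: one path per line.
for name, entries in [('ADDED', ['new.dat']),
                      ('DELETED', []),
                      ('UNCHANGED', ['same.dat'])]:
    with open(os.path.join(change_dir, name), 'w') as f:
        f.write('\n'.join(entries))

# Dict-style categories: one 'key: value' pair per line.
with open(os.path.join(change_dir, 'MODIFIED'), 'w') as f:
    f.write('run1.dat: run0.dat\n')
open(os.path.join(change_dir, 'METACHANGE'), 'w').close()

# META_INFO holds the file counts that are read back into the Change object.
with open(os.path.join(change_dir, 'META_INFO'), 'w') as f:
    f.write('base:\n  nfiles: 10\nrevision:\n  nfiles: 11\n')

# set_change_from_cache(my_change, change_dir)  # my_change: a Change instance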
Example #2
def append(datapath, usermeta, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    stagingdir = custom_stagingdir or dacman_utils.DACMAN_STAGING_LOC

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))

    if not os.path.exists(indexdir):
        logger.error(
            'Data is not indexed... please index before adding metadata!')
        sys.exit()

    if not usermeta:
        logger.warning('No user metadata provided. Exiting...')
        return

    meta_file = os.path.join(indexdir, 'METADATA')

    metadata = dacman_utils.load_yaml(meta_file)
    if not metadata:
        metadata = {}
    if 'USER_DEFINED_METADATA' in metadata:
        # extend the previously recorded user metadata
        newmeta = metadata['USER_DEFINED_METADATA'] + ', ' + usermeta
    else:
        newmeta = usermeta
    # update in place so any other keys in the metadata file are preserved
    metadata['USER_DEFINED_METADATA'] = newmeta
    dacman_utils.dump_yaml(metadata, meta_file)

    logger.info('New user metadata added')
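
A usage sketch; the dataset path and metadata string below are made up, and the path must already be indexed for the call to succeed:

# Attach free-form user metadata to an already-indexed dataset.
append('/data/experiments/run42', 'instrument=ALS, operator=jdoe')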
Example #3
def retrieve(datapath, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    stagingdir = custom_stagingdir or dacman_utils.DACMAN_STAGING_LOC

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))

    if not os.path.exists(indexdir):
        logger.error(
            'Data is not indexed... please index before retrieving metadata!')
        sys.exit()

    meta_file = os.path.join(indexdir, 'METADATA')

    metadata = dacman_utils.load_yaml(meta_file)
    if metadata and 'USER_DEFINED_METADATA' in metadata:
        usermeta = metadata['USER_DEFINED_METADATA']
        print(usermeta)
    else:
        print('No user-defined metadata available for the dataset')

    logger.info('User metadata retrieved')
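
The matching retrieval, again with a hypothetical path:

# Print the user-defined metadata recorded for the dataset, if any.
retrieve('/data/experiments/run42')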
Example #4
def load_comparator(cls, data_type):
    COMPARATORS_MAP = get_comparators_map()
    plugin_config = os.path.join(os.getenv('HOME'),
                                 '.dacman/config/plugins.yaml')
    if os.path.exists(plugin_config):
        plugin_info = dacman_utils.load_yaml(plugin_config)
        LOG.debug(f'COMPARATORS_MAP={COMPARATORS_MAP}')
        if plugin_info is not None and data_type in plugin_info:
            plugin_name = plugin_info[data_type]
            # check if it's one of the default plugins for the data type
            for comparator in COMPARATORS_MAP['default']:
                if plugin_name == comparator.__class__.__name__:
                    return comparator
            # check if the data-type-specific plugin is available
            if data_type in COMPARATORS_MAP:
                for comparator in COMPARATORS_MAP[data_type]:
                    if comparator.__class__.__name__ == plugin_name:
                        return comparator
                print('Configured plugin {} not found. Using available plugins.'
                      .format(plugin_name))
            else:
                print('Plugin for {} not found. Using default plugin.'
                      .format(data_type))
    # fall back to the first registered comparator for the data type,
    # else to the first default comparator
    if data_type in COMPARATORS_MAP:
        return COMPARATORS_MAP[data_type][0]
    return COMPARATORS_MAP['default'][0]
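
For reference, a sketch of creating the plugins.yaml file this loader consults. The 'csv'-to-'CSVPlugin' mapping is hypothetical; only the config path and the data-type-to-class-name structure follow from the code:

import os
import yaml

# Map a data type to the comparator class name that should handle it.
config_path = os.path.join(os.getenv('HOME'), '.dacman/config/plugins.yaml')
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w') as f:
    yaml.safe_dump({'csv': 'CSVPlugin'}, f)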
Example #5
def clean(datadirs):
    logger = logging.getLogger(__name__)
    logger.info('Removing indexes for %s', ', '.join(datadirs))
    indexdir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'indexes')
    cachedir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'cache')
    cachefile = os.path.join(cachedir, 'ENTRIES')
    if os.path.exists(cachefile):
        cache = dacman_utils.load_yaml(cachefile)
        for datadir in datadirs:
            path = os.path.abspath(datadir)
            if path in cache:
                for comp in cache[path]:
                    cache_data = os.path.join(cachedir, cache[path][comp])
                    shutil.rmtree(cache_data)
                del cache[path]
            else:
                to_delete = []
                for k in cache:
                    for s in cache[k]:
                        if s == path:
                            to_delete.append((k, s))
                for k, s in to_delete:
                    cache_data = os.path.join(cachedir, cache[k][s])
                    shutil.rmtree(cache_data)
                    del cache[k][s]
        dacman_utils.dump_yaml(cache, cachefile)

    for datadir in datadirs:
        path = os.path.abspath(datadir)
        indexes = os.path.join(indexdir, get_hash_id(path))
        if os.path.exists(indexes):
            index_file = os.path.join(indexdir, 'INDEXED_PATHS')
            shutil.rmtree(indexes)
            index_metadata = dacman_utils.load_yaml(index_file)
            # pop() avoids a KeyError if the path is missing from the metadata
            index_metadata.pop(path, None)
            dacman_utils.dump_yaml(index_metadata, index_file)
            logger.info('Indexes removed for %s', datadir)
        elif os.path.exists(datadir):
            logger.warning('Indexes and metadata for directory %s are not staged',
                           datadir)
        else:
            logger.error('Data directory %s does not exist', datadir)
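
A usage sketch with made-up directories:

# Drop staged indexes and cached diff results for these directories.
clean(['/data/experiments/run41', '/data/experiments/run42'])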
Example #6
def index(datapath, custom_stagingdir=None, manager='python'):
    logger = logging.getLogger(__name__)
    logger.info('Indexing %s', datapath)
    stagingdir = check_stagingdir(custom_stagingdir, datapath)
    if manager == 'tigres':
        if not TIGRES_IMPORT:
            logger.error('Tigres is not installed or not in path')
            sys.exit()
        logger.info('Using Tigres for parallel indexing')
        indexdir = tigres_index(stagingdir, datapath)
    else:
        logger.info('Using Python multiprocessing for parallel indexing')
        indexdir = mp_index(stagingdir, datapath)

    index_metafile = os.path.join(os.path.dirname(indexdir), 'INDEXED_PATHS')
    index_metadata = {}
    if os.path.exists(index_metafile):
        index_metadata = dacman_utils.load_yaml(index_metafile)
    index_metadata[datapath] = os.path.basename(indexdir)
    dacman_utils.dump_yaml(index_metadata, index_metafile)
    return indexdir
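
A usage sketch (hypothetical path; the default 'python' manager indexes with multiprocessing):

# Build indexes for a dataset and report where they were written.
indexdir = index('/data/experiments/run42')
print('indexes stored under', indexdir)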
Example #7
    def get_change_pairs(self):
        if not (self.old_path and self.new_path):
            self.logger.error('Old and new datapaths are not specified!')
            sys.exit()

        change_pairs = []

        old_base = self.old_path
        new_base = self.new_path
        self.logger.info('Starting diff calculation')
        if self.old_path_is_file and self.new_path_is_file:
            change_pairs.append((self.old_path, self.new_path))
            return change_pairs
        elif self.old_path_is_file != self.new_path_is_file:
            self.logger.error('Datapaths are of different types')
            sys.exit()
        '''
        check if indexes on the data are present
        else, check for data types and invoke parallel comparison
        '''
        old_index_path = None
        new_index_path = None
        is_indexed = False
        indexdir = os.path.join(self.stagingdir, 'indexes')
        index_metafile = os.path.join(indexdir, 'INDEXED_PATHS')

        if os.path.exists(index_metafile):
            indexed_paths = dacman_utils.load_yaml(index_metafile)
            paths_indexed = [False, False]
            for path in indexed_paths:
                p = path + os.sep
                if self.old_path.startswith(p) or self.old_path == path:
                    old_index_path = os.path.join(
                        indexdir, get_hash_id(os.path.abspath(path)))
                    paths_indexed[0] = True
                if self.new_path.startswith(p) or self.new_path == path:
                    new_index_path = os.path.join(
                        indexdir, get_hash_id(os.path.abspath(path)))
                    paths_indexed[1] = True
                if all(paths_indexed):
                    is_indexed = True
                    break

        if is_indexed:
            changeManager = ChangeManager(self.old_path, self.new_path, False,
                                          self.stagingdir)
            status, cached_old_path, cached_new_path = \
                changeManager.get_cached_paths()
            change_data = changeManager.get_changes(status, cached_old_path,
                                                    cached_new_path)

            old_datapath_file = os.path.join(old_index_path, 'DATAPATH')
            new_datapath_file = os.path.join(new_index_path, 'DATAPATH')

            old_filelist = os.path.join(old_index_path, 'FILEPATHS')
            new_filelist = os.path.join(new_index_path, 'FILEPATHS')

            with open(old_datapath_file) as f:
                old_basepath = f.readline().rstrip('\n')

            with open(new_datapath_file) as f:
                new_basepath = f.readline().rstrip('\n')

            with open(old_filelist) as f:
                for relpath in f:
                    filepath = os.path.join(old_basepath, relpath.strip())
                    if filepath == self.old_path:
                        self.old_path_is_file = True
                        break

            with open(new_filelist) as f:
                for relpath in f:
                    filepath = os.path.join(new_basepath, relpath.strip())
                    if filepath == self.new_path:
                        self.new_path_is_file = True
                        break
        else:
            self.logger.warning(
                'Datapaths are not indexed. Trying to locate and index the data...'
            )
            '''
            the code below allows checking for a diff between any two arbitrary files
            '''
            # change_data = change.changes(old_base, new_base, False, self.stagingdir)
            changeManager = ChangeManager(old_base, new_base, False,
                                          self.stagingdir)
            status, cached_old_path, cached_new_path = \
                changeManager.get_cached_paths()
            change_data = changeManager.get_changes(status, cached_old_path,
                                                    cached_new_path)

        changes = change_data.modified

        self.logger.info('Searching for path indexes')
        '''
        find the old and new base directories through which the data is indexed
        '''
        path_prefix_new = cached_new_path
        path_prefix_old = cached_old_path
        '''
        save the metadata about the high-level diff between the directories
        '''
        if not self.old_path_is_file:
            if self.save_changes:
                self._save_dir_diff(change_data)
                self.logger.info('Change summary saved in: %s', self.outdir)
            change.display(change_data)
            '''
            for each file-level change, a detailed change analysis is required
            '''
            for change_key in changes:
                new_path = os.path.join(path_prefix_new, change_key)
                old_path = os.path.join(path_prefix_old, changes[change_key])
                change_pairs.append((new_path, old_path))
        else:
            rel_new_path = os.path.relpath(self.new_path, path_prefix_new)
            rel_old_path = os.path.relpath(self.old_path, path_prefix_old)
            if rel_new_path in changes and changes[rel_new_path] == rel_old_path:
                change_pairs.append((self.new_path, self.old_path))

        return change_pairs
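
Downstream, each returned (new, old) pair is the unit handed to a file-level comparison. A minimal, hypothetical driver loop ('differ' stands in for whatever object provides this method):

# Each pair marks two file versions that need a detailed diff.
for new_file, old_file in differ.get_change_pairs():
    print('detailed diff needed:', old_file, '->', new_file)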