def set_change_from_cache(change, change_dir):
    """Populate a change object from the per-category files cached in *change_dir*.

    Each change category (ADDED/DELETED/UNCHANGED as plain lists,
    MODIFIED/METACHANGE as ``key: value`` mappings) is stored in a file named
    after the category inside *change_dir*.  File counts for both dataset
    versions are read from the cached META_INFO yaml.

    Args:
        change: change object exposing ``add_changes()``/``calculate_degree()``
            and ``old_nfiles``/``new_nfiles`` attributes (project type).
        change_dir: directory containing the cached change files.
    """
    change_types = {'ADDED': list, 'DELETED': list, 'MODIFIED': dict,
                    'METACHANGE': dict, 'UNCHANGED': list}
    change_data = {}
    for change_type in change_types:
        change_file = os.path.join(change_dir, change_type)
        if change_types[change_type] == list:
            # list-style categories: one entry per line
            with open(change_file, 'r') as f:
                change_data[change_type] = [line.strip() for line in f]
        else:
            # dict-style categories: "key: value" per line.
            with open(change_file, 'r') as f:
                change_dict = {}
                for line in f:
                    # BUGFIX: split only on the FIRST ':' so values that
                    # themselves contain ':' are not truncated.
                    kv = line.split(':', 1)
                    change_dict[kv[0]] = kv[1].strip()
                change_data[change_type] = change_dict
    change.add_changes(change_data['ADDED'], change_data['DELETED'],
                       change_data['MODIFIED'], change_data['UNCHANGED'],
                       change_data['METACHANGE'])
    change.calculate_degree()
    metainfo = dacman_utils.load_yaml(os.path.join(change_dir, 'META_INFO'))
    change.old_nfiles = metainfo['base']['nfiles']
    change.new_nfiles = metainfo['revision']['nfiles']
def append(datapath, usermeta, custom_stagingdir=None):
    """Append user-defined metadata to an already-indexed dataset.

    Args:
        datapath: path of the indexed dataset.
        usermeta: free-text metadata string to attach; a falsy value is a no-op.
        custom_stagingdir: optional staging directory overriding the default.

    Exits the process if *datapath* has not been indexed yet.
    """
    logger = logging.getLogger(__name__)
    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir
    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    if not os.path.exists(indexdir):
        logger.error(
            'Data is not indexed... please index before adding metadata!')
        sys.exit()
    if not usermeta:
        # logger.warn() is deprecated in favor of logger.warning()
        logger.warning('No user metadata provided. Exiting...')
        return
    meta_file = os.path.join(indexdir, 'METADATA')
    metadata = dacman_utils.load_yaml(meta_file)
    if not metadata:
        metadata = {}
    if 'USER_DEFINED_METADATA' in metadata:
        # append to any existing user metadata, comma-separated
        newmeta = metadata['USER_DEFINED_METADATA'] + ', ' + usermeta
    else:
        # BUGFIX: previously the very first entry was stored with a spurious
        # leading ', ' because the separator was added unconditionally.
        newmeta = usermeta
    metadata['USER_DEFINED_METADATA'] = newmeta
    # BUGFIX: dump the full metadata mapping; the previous code wrote only the
    # USER_DEFINED_METADATA key, discarding all other loaded metadata entries.
    dacman_utils.dump_yaml(metadata, meta_file)
    logger.info('New user metadata added')
def retrieve(datapath, custom_stagingdir=None):
    """Print and return the user-defined metadata attached to *datapath*.

    Args:
        datapath: path of the indexed dataset.
        custom_stagingdir: optional staging directory overriding the default.

    Returns:
        The user-defined metadata string, or None when none is recorded.

    Exits the process if *datapath* has not been indexed yet.
    """
    logger = logging.getLogger(__name__)
    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir
    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    if not os.path.exists(indexdir):
        logger.error(
            'Data is not indexed... please index before retrieving metadata!')
        sys.exit()
    meta_file = os.path.join(indexdir, 'METADATA')
    # BUGFIX: drop the dead `metadata = {}` pre-assignment and unify the
    # branches -- previously an empty/missing METADATA file printed nothing
    # at all instead of the "no user-defined metadata" message.
    metadata = dacman_utils.load_yaml(meta_file) or {}
    usermeta = None
    if 'USER_DEFINED_METADATA' in metadata:
        usermeta = metadata['USER_DEFINED_METADATA']
        print(usermeta)
    else:
        print('No user-defined metadata available for the dataset')
    logger.info('User metadata retrieved')
    return usermeta
def load_comparator(cls, data_type):
    """Return the comparator plugin to use for *data_type*.

    Resolution order:
      1. If ~/.dacman/config/plugins.yaml names a plugin for *data_type*,
         return the matching instance from the 'default' list or from the
         data-type-specific list in the comparators map.
      2. Otherwise fall back to the first registered comparator for
         *data_type*, or to the first 'default' comparator.
    """
    COMPARATORS_MAP = get_comparators_map()
    # NOTE(review): os.getenv('HOME') may be None (e.g. on Windows), which
    # would make os.path.join raise -- presumably only POSIX is supported;
    # confirm before hardening.
    plugin_config = os.path.join(os.getenv('HOME'),
                                 '.dacman/config/plugins.yaml')
    if os.path.exists(plugin_config):
        plugin_info = dacman_utils.load_yaml(plugin_config)
        LOG.debug(f'COMPARATORS_MAP={COMPARATORS_MAP}')
        if plugin_info is not None:
            if data_type in plugin_info:
                # check if it's one of the default plugins for the data type
                for comparator in COMPARATORS_MAP['default']:
                    if plugin_info[
                            data_type] == comparator.__class__.__name__:
                        return comparator
                # check if the data type plugin is available or not
                if data_type in COMPARATORS_MAP:
                    # for/else: the else branch runs only when the loop
                    # finishes without returning a matching comparator.
                    for comparator in COMPARATORS_MAP[data_type]:
                        if comparator.__class__.__name__ == plugin_info[
                                data_type]:
                            return comparator
                    else:
                        print(
                            "Configured plugin {} not found. Using available plugins."
                            .format(plugin_info[data_type]))
                else:
                    print("Plugin for {} not found. Using default plugin.".
                          format(data_type))
    # fallback: first registered comparator for the data type, else default
    if data_type in COMPARATORS_MAP:
        return COMPARATORS_MAP[data_type][0]
    else:
        return COMPARATORS_MAP['default'][0]
def clean(datadirs):
    """Remove cached comparison data and staged indexes for *datadirs*.

    For each directory: delete any cache entries where it appears as either
    the base or the revision side of a cached comparison, then delete its
    index tree and unregister it from the INDEXED_PATHS metadata.

    Args:
        datadirs: iterable of data directory paths to clean.
    """
    logger = logging.getLogger(__name__)
    logger.info('Removing indexes for %s', ', '.join(datadirs))
    indexdir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'indexes')
    cachedir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'cache')
    cachefile = os.path.join(cachedir, 'ENTRIES')
    if os.path.exists(cachefile):
        cache = dacman_utils.load_yaml(cachefile)
        for datadir in datadirs:
            path = os.path.abspath(datadir)
            if path in cache:
                # path is the "base" side of one or more cached comparisons
                for comp in cache[path]:
                    cache_data = os.path.join(cachedir, cache[path][comp])
                    # ROBUSTNESS: a stale/missing cache dir must not abort
                    # the whole cleanup run
                    shutil.rmtree(cache_data, ignore_errors=True)
                del cache[path]
            else:
                # path may appear as the "revision" side under other bases;
                # collect first, then delete, to avoid mutating while iterating
                to_delete = []
                for k in cache:
                    for s in cache[k]:
                        if s == path:
                            to_delete.append((k, s))
                for k, s in to_delete:
                    cache_data = os.path.join(cachedir, cache[k][s])
                    shutil.rmtree(cache_data, ignore_errors=True)
                    del cache[k][s]
        dacman_utils.dump_yaml(cache, cachefile)
    for datadir in datadirs:
        path = os.path.abspath(datadir)
        indexes = os.path.join(indexdir, get_hash_id(path))
        if os.path.exists(indexes):
            index_file = os.path.join(indexdir, 'INDEXED_PATHS')
            shutil.rmtree(indexes)
            index_metadata = dacman_utils.load_yaml(index_file)
            # BUGFIX: pop() instead of del -- an INDEXED_PATHS file out of
            # sync with the index tree no longer raises KeyError
            index_metadata.pop(path, None)
            dacman_utils.dump_yaml(index_metadata, index_file)
            logger.info('Indexes removed for %s', datadir)
        elif os.path.exists(datadir):
            # logger.warn() is deprecated in favor of logger.warning()
            logger.warning(
                'Indexes and metadata for directory %s are not staged',
                datadir)
        else:
            logger.error('Data directory %s does not exist', datadir)
def index(datapath, custom_stagingdir=None, manager='python'):
    """Index *datapath* in parallel and register it in INDEXED_PATHS.

    Args:
        datapath: path of the dataset to index.
        custom_stagingdir: optional staging directory overriding the default.
        manager: 'tigres' for Tigres-based indexing, anything else uses
            Python multiprocessing.

    Returns:
        The directory where the new index was written.
    """
    logger.info('Indexing %s', datapath)
    stagingdir = check_stagingdir(custom_stagingdir, datapath)

    use_tigres = (manager == 'tigres')
    if use_tigres and not TIGRES_IMPORT:
        # Tigres was requested but could not be imported at module load time
        logger.error('Tigres is not installed or not in path')
        sys.exit()

    if use_tigres:
        logger.info('Using Tigres for parallel indexing')
        indexdir = tigres_index(stagingdir, datapath)
    else:
        logger.info('Using Python multiprocessing for parallel indexing')
        indexdir = mp_index(stagingdir, datapath)

    # record the datapath -> index-dir mapping in the shared registry file
    registry_file = os.path.join(os.path.dirname(indexdir), 'INDEXED_PATHS')
    registry = {}
    if os.path.exists(registry_file):
        registry = dacman_utils.load_yaml(registry_file)
    registry[datapath] = os.path.basename(indexdir)
    dacman_utils.dump_yaml(registry, registry_file)
    return indexdir
def get_change_pairs(self):
    """Compute the list of (path, path) pairs whose contents must be diffed.

    For a file/file comparison the single pair is returned directly.  For
    directories, the cached/indexed change metadata is used to find the set
    of modified files, and one pair per modified file is returned.

    Returns:
        list of 2-tuples of paths to compare.

    Exits the process if either datapath is missing or the two datapaths are
    of different types (one file, one directory).
    """
    if not (self.old_path and self.new_path):
        self.logger.error('Old and new datapaths are not specified!')
        sys.exit()

    change_pairs = []
    old_base = self.old_path
    new_base = self.new_path
    self.logger.info('Starting diff calculation')
    if self.old_path_is_file and self.new_path_is_file:
        # trivial case: two plain files, compare them directly
        change_pairs.append((self.old_path, self.new_path))
        return change_pairs
    elif self.old_path_is_file != self.new_path_is_file:
        self.logger.error('Datapaths are of different types')
        sys.exit()

    '''
    check if indexes on the data are present
    else, check for data types and invoke parallel comparison
    '''
    old_index_path = None
    new_index_path = None
    is_indexed = False
    indexdir = os.path.join(self.stagingdir, 'indexes')
    index_metafile = os.path.join(indexdir, 'INDEXED_PATHS')
    if os.path.exists(index_metafile):
        indexed_paths = dacman_utils.load_yaml(index_metafile)
        paths_indexed = [False, False]
        for path in indexed_paths:
            # a datapath is covered if it equals an indexed path or lives
            # underneath it (hence the trailing separator on the prefix)
            p = path + os.sep
            if self.old_path.startswith(p) or self.old_path == path:
                old_index_path = os.path.join(
                    indexdir, get_hash_id(os.path.abspath(path)))
                paths_indexed[0] = True
            if self.new_path.startswith(p) or self.new_path == path:
                new_index_path = os.path.join(
                    indexdir, get_hash_id(os.path.abspath(path)))
                paths_indexed[1] = True
            if all(paths_indexed):
                is_indexed = True
                break

    if is_indexed:
        changeManager = ChangeManager(self.old_path, self.new_path, False,
                                      self.stagingdir)
        status, cached_old_path, cached_new_path = \
            changeManager.get_cached_paths()
        change_data = changeManager.get_changes(status, cached_old_path,
                                                cached_new_path)
        old_datapath_file = os.path.join(old_index_path, 'DATAPATH')
        new_datapath_file = os.path.join(new_index_path, 'DATAPATH')
        old_filelist = os.path.join(old_index_path, 'FILEPATHS')
        new_filelist = os.path.join(new_index_path, 'FILEPATHS')
        with open(old_datapath_file) as f:
            old_basepath = f.readline().split('\n')[0]
        with open(new_datapath_file) as f:
            new_basepath = f.readline().split('\n')[0]
        # determine whether each datapath is actually a single indexed file
        with open(old_filelist) as f:
            for relpath in f:
                # BUGFIX: strip the trailing newline, otherwise the joined
                # path could never equal self.old_path
                filepath = os.path.join(old_basepath, relpath.strip())
                if filepath == self.old_path:
                    self.old_path_is_file = True
                    break
        with open(new_filelist) as f:
            for relpath in f:
                filepath = os.path.join(new_basepath, relpath.strip())
                # BUGFIX: compare against self.new_path; the old code
                # compared the new filelist against self.old_path
                if filepath == self.new_path:
                    self.new_path_is_file = True
                    break
    else:
        self.logger.warning(
            'Datapaths are not indexed. Trying to locate and index the data...'
        )
        '''
        The code below allows to check for a diff between any two random files
        '''
        # change_data = change.changes(old_base, new_base, False, self.stagingdir)
        changeManager = ChangeManager(old_base, new_base, False,
                                      self.stagingdir)
        status, cached_old_path, cached_new_path = \
            changeManager.get_cached_paths()
        change_data = changeManager.get_changes(status, cached_old_path,
                                                cached_new_path)

    changes = change_data.modified
    self.logger.info('Searching for path indexes')
    '''
    find the old and new base directories which are indexed through
    '''
    path_prefix_new = cached_new_path
    path_prefix_old = cached_old_path
    '''
    save the metadata about the high-level diff between the directories
    '''
    if not self.old_path_is_file:
        if self.save_changes:
            self._save_dir_diff(change_data)
            self.logger.info('Change summary saved in: %s', self.outdir)
        change.display(change_data)
        '''
        for each file level change, a detailed change analysis is reqd
        '''
        for change_key in changes:
            new_path = os.path.join(path_prefix_new, change_key)
            old_path = os.path.join(path_prefix_old, changes[change_key])
            change_pairs.append((new_path, old_path))
    else:
        # single-file comparison inside indexed directories: only pair the
        # two files if the diff metadata says they correspond
        rel_new_path = os.path.relpath(self.new_path, path_prefix_new)
        rel_old_path = os.path.relpath(self.old_path, path_prefix_old)
        if rel_new_path in changes and changes[
                rel_new_path] == rel_old_path:
            change_pairs.append((self.new_path, self.old_path))
    return change_pairs