Example #1
   def get_changes(self, cache_status, cached_old_path, cached_new_path):
      logger = logging.getLogger(__name__)

      is_subdir_oldpath = False
      is_subdir_newpath = False

      logger.info('Checking for changes between %s and %s', self.old_datapath, self.new_datapath)

      if cache_status == CacheStatus.NOT_CACHED:
         change_dir = comparator.compare(self.old_datapath, self.new_datapath, self.stagingdir)
      else:
         with open(self.cache_entries, 'r') as f:
            cache = yaml.safe_load(f)
            change_dir = cache[cached_new_path][cached_old_path]

      if cached_old_path != self.old_datapath:
         is_subdir_oldpath = True
         
      if cached_new_path != self.new_datapath:
         is_subdir_newpath = True
         
      logger.info('Retrieving changes between %s and %s', self.old_datapath, self.new_datapath)
      
      change = FilesystemChange(cached_old_path, cached_new_path, self.stagingdir)

      if is_subdir_newpath:
         indexdir = os.path.join(self.stagingdir, 'indexes', get_hash_id(cached_new_path))
         subdir_nfiles = get_subdir_nfiles(self.new_datapath, indexdir)
         change.new_nfiles = subdir_nfiles

      if is_subdir_oldpath:
         indexdir = os.path.join(self.stagingdir, 'indexes', get_hash_id(cached_old_path))
         subdir_nfiles = get_subdir_nfiles(self.old_datapath, indexdir)
         change.old_nfiles = subdir_nfiles

      change_data_dir = os.path.join(self.cachedir, change_dir)
      if not (is_subdir_oldpath or is_subdir_newpath):
         set_change_from_cache(change, change_data_dir)
      else:
         compare_hash = dacman_utils.hash_comparison_id(self.old_datapath, self.new_datapath)
         change_data_subdir = os.path.join(self.cachedir, compare_hash)
         if os.path.exists(change_data_subdir):
            set_change_from_cache(change, change_data_subdir)
         else:
            save_subdir_changes_to_cache(change, self.stagingdir,
                                         cached_old_path, cached_new_path,
                                         self.old_datapath, self.new_datapath,
                                         is_subdir_oldpath, is_subdir_newpath,
                                         change_data_dir, change_data_subdir)

            logger.info('Updating change cache entries')
            change_id = dacman_utils.hash_comparison_id(self.old_datapath, self.new_datapath)
            change_info = {self.new_datapath : {self.old_datapath: change_id}}
            dacman_utils.update_yaml(change_info, self.cache_entries)

      logger.info('Change retrieval completed')

      return change
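Example #1 leans on two helpers that are not shown in this listing: get_subdir_nfiles, which counts how many indexed files fall under a given subdirectory, and set_change_from_cache, which fills the FilesystemChange object from a cached change directory. A minimal sketch of what get_subdir_nfiles could look like, assuming only the DATAPATH and FILEPATHS index files written by the scanner in Example #8 (the actual implementation may differ):

import os


def get_subdir_nfiles(subdir_path, indexdir):
    # Hypothetical sketch: count indexed files that live under subdir_path.
    # DATAPATH holds the indexed base directory; FILEPATHS lists files
    # relative to it (both are written by the scanner in Example #8).
    with open(os.path.join(indexdir, 'DATAPATH')) as f:
        basepath = f.readline().strip()

    prefix = os.path.abspath(subdir_path) + os.sep
    nfiles = 0
    with open(os.path.join(indexdir, 'FILEPATHS')) as f:
        for relpath in f:
            filepath = os.path.abspath(os.path.join(basepath, relpath.strip()))
            if filepath.startswith(prefix):
                nfiles += 1
    return nfiles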
Example #2
def append(datapath, usermeta, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))

    if not os.path.exists(indexdir):
        logger.error(
            'Data is not indexed... please index before adding metadata!')
        sys.exit()

    if not usermeta:
        logger.warning('No user metadata provided. Exiting...')
        return

    meta_file = os.path.join(indexdir, 'METADATA')

    metadata = dacman_utils.load_yaml(meta_file)
    if not metadata:
        metadata = {}
    if 'USER_DEFINED_METADATA' in metadata:
        newmeta = metadata['USER_DEFINED_METADATA'] + ', ' + usermeta
    else:
        newmeta = usermeta
    extended_metadata = {'USER_DEFINED_METADATA': newmeta}
    dacman_utils.dump_yaml(extended_metadata, meta_file)

    logger.info('New user metadata added')
Example #3
def retrieve(datapath, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))

    if not os.path.exists(indexdir):
        logger.error(
            'Data is not indexed... please index before retrieving metadata!')
        sys.exit()

    meta_file = os.path.join(indexdir, 'METADATA')

    metadata = dacman_utils.load_yaml(meta_file)
    if metadata and 'USER_DEFINED_METADATA' in metadata:
        usermeta = metadata['USER_DEFINED_METADATA']
        print(usermeta)
    else:
        print('No user-defined metadata available for the dataset')

    logger.info('User metadata retrieved')
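A minimal usage sketch for the two metadata functions above, assuming the dataset is already indexed; the import path is hypothetical and should point at whatever module defines append() and retrieve():

# Hypothetical import path; adjust to wherever append() and retrieve() live.
from dacman.core.metadata import append, retrieve

datapath = '/data/als/run42'  # example dataset path (already indexed)
append(datapath, 'instrument=ALS, operator=alice')

# Prints the accumulated USER_DEFINED_METADATA string for the dataset.
retrieve(datapath)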
Example #4
def clean(datadirs):
    logger = logging.getLogger(__name__)
    logger.info('Removing indexes for %s', ', '.join(datadirs))
    indexdir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'indexes')
    cachedir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'cache')
    cachefile = os.path.join(cachedir, 'ENTRIES')
    if os.path.exists(cachefile):
        cache = dacman_utils.load_yaml(cachefile)
        for datadir in datadirs:
            path = os.path.abspath(datadir)
            if path in cache:
                for comp in cache[path]:
                    cache_data = os.path.join(cachedir, cache[path][comp])
                    shutil.rmtree(cache_data)
                del cache[path]
            else:
                to_delete = []
                for k in cache:
                    for s in cache[k]:
                        if s == path:
                            to_delete.append([k, s])
                for elem in to_delete:
                    k, s = elem[0], elem[1]
                    cache_data = os.path.join(cachedir, cache[k][s])
                    shutil.rmtree(cache_data)
                    del cache[k][s]
        dacman_utils.dump_yaml(cache, cachefile)

    for datadir in datadirs:
        path = os.path.abspath(datadir)
        indexes = os.path.join(indexdir, get_hash_id(path))
        if os.path.exists(indexes):
            index_file = os.path.join(indexdir, 'INDEXED_PATHS')
            shutil.rmtree(indexes)
            index_metadata = dacman_utils.load_yaml(index_file)
            del index_metadata[path]
            dacman_utils.dump_yaml(index_metadata, index_file)
            logger.info('Indexes removed for %s', datadir)
        elif os.path.exists(datadir):
            logger.warning('Indexes and metadata for directory %s are not staged',
                           datadir)
        else:
            logger.error('Data directory %s does not exist', datadir)
Example #5
def tigres_index(stagingdir, datapath):
    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    deduce_file = os.path.join(indexdir, 'FILEPATHS')
    if not os.path.exists(deduce_file):
        scanner.scan(datapath, os.path.dirname(stagingdir))

    filelist = read_filelist(deduce_file)
    
    exec_name = 'EXECUTION_DISTRIBUTE_PROCESS'
    exec_plugin = tigres.utils.Execution.get(exec_name)

    try:
        logfile = 'deduce_index_{}.log'.format(round(time.time()))
        tigres.start(name='deduce_index', log_dest=logfile, execution=exec_plugin)
        tigres.set_log_level(tigres.Level.ERROR)

        task_array = tigres.TaskArray(tasks=[])
        
        task_hash = tigres.Task("hash_index", task_type=tigres.FUNCTION, impl_name=calculate_hash)
        task_array.append(task_hash)
        
        input_list = []
        for file in filelist:
            input_list.append([datapath, file])
        input_array = tigres.InputArray(values=input_list)

        logger.info('Indexing %d files', len(filelist))
        indexes = tigres.parallel('index_files', input_array=input_array, task_array=task_array)

        save_indexes(indexdir, indexes)

    except tigres.utils.TigresException as e:
        logger.error(str(e))
        return_code = 1
    finally:
        tigres.end()

    return indexdir
Example #6
def mp_index(stagingdir, datapath):
    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    deduce_file = os.path.join(indexdir, 'FILEPATHS')
    if not os.path.exists(deduce_file):
        scanner.scan(datapath, stagingdir)

    filelist = read_filelist(deduce_file)
    
    logger.info('Indexing %d files', len(filelist))
    num_procs = multiprocessing.cpu_count()
    results = []
    pool = multiprocessing.Pool(processes=num_procs)
    for filename in filelist:
        result = pool.apply_async(calculate_hash, args=(datapath, filename))
        results.append(result)

    pool.close()
    pool.join()
    indexes = [result.get() for result in results]

    save_indexes(indexdir, indexes)

    return indexdir
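Examples #5, #6 and #9 all hand the same per-file work unit, calculate_hash(datapath, filename), to their workers, but the helper itself is not part of this listing. A minimal sketch of what it might do, assuming MD5 content hashes (the zero-byte MD5 constant in Example #7 points that way) and a (relative path, digest) result for save_indexes() to persist:

import hashlib
import os


def calculate_hash(datapath, filename):
    # Hypothetical sketch: hash one file, reading in chunks so large
    # files do not have to fit in memory.
    filepath = os.path.join(datapath, filename)
    md5 = hashlib.md5()
    with open(filepath, 'rb') as f:
        for block in iter(lambda: f.read(1 << 20), b''):
            md5.update(block)
    # The real return type is whatever save_indexes() expects; a
    # (relative path, digest) pair is assumed here.
    return filename, md5.hexdigest()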
Example #7
def compare(old_datapath, new_datapath, custom_stagingdir):
    logger = logging.getLogger(__name__)

    logger.info('Starting directory comparison')
    if not custom_stagingdir:
        #old_stagingdir = os.path.join(old_datapath, '.deduce')
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    old_indexdir = os.path.join(stagingdir, 'indexes',
                                get_hash_id(old_datapath))
    new_indexdir = os.path.join(stagingdir, 'indexes',
                                get_hash_id(new_datapath))

    old_index_file = os.path.join(old_indexdir, 'PATH.idx')
    new_index_file = os.path.join(new_indexdir, 'PATH.idx')

    if not os.path.exists(old_index_file):
        indexer.index(old_datapath, stagingdir)
    if not os.path.exists(new_index_file):
        indexer.index(new_datapath, stagingdir)

    old_data_index_file = os.path.join(old_indexdir, 'DATA.idx')
    old_pathname_map_file = os.path.join(old_indexdir, 'PATHNAME.map')

    old_metafile = os.path.join(old_indexdir, 'METADATA')
    new_metafile = os.path.join(new_indexdir, 'METADATA')

    #cprint(__modulename__, 'Loading Indexes')
    logger.info('Loading indexes for fast comparison')
    old_path_indexes = dacman_utils.file_to_dict(old_index_file)
    new_path_indexes = dacman_utils.file_to_dict(new_index_file)
    old_data_indexes = dacman_utils.file_to_dict(old_data_index_file)
    name_path_map = dacman_utils.file_to_dict_list(old_pathname_map_file)

    old_metadata = dacman_utils.file_to_dict(old_metafile)
    new_metadata = dacman_utils.file_to_dict(new_metafile)

    _unchanged = []
    _metachange = {}
    _added = []
    _deleted = []
    _modified = {}

    #cprint(__modulename__, 'Comparing {} and {}'.format(old_datapath, new_datapath))

    # MD5 hash for a zero-byte file
    __MAGIC_HASH__ = 'd41d8cd98f00b204e9800998ecf8427e'

    logger.info('Comparing files in %s and %s', old_datapath, new_datapath)
    for filepath in new_path_indexes:
        datahash = new_path_indexes[filepath]
        if filepath in old_path_indexes:
            '''
            if filepaths are same, but data or metadata changed
            '''
            if datahash == old_path_indexes[filepath]:
                if filepath in old_metadata and filepath in new_metadata:
                    if old_metadata[filepath] == new_metadata[filepath]:
                        _unchanged.append(filepath)
                    else:
                        _metachange[filepath] = filepath
                else:
                    _unchanged.append(filepath)
            else:
                _modified[filepath] = filepath
            old_path_indexes.pop(filepath)
            basename = os.path.basename(filepath)
            if basename in name_path_map:
                if filepath in name_path_map[basename]:
                    name_path_map[basename].remove(filepath)
                    if len(name_path_map[basename]) == 0:
                        name_path_map.pop(basename)
        elif os.path.basename(filepath) in name_path_map:
            '''
            if filenames are same, but filepaths and data changed
            '''
            filename = os.path.basename(filepath)
            old_filepaths = name_path_map[filename]
            for old_filepath in old_filepaths:
                if datahash == old_path_indexes[old_filepath]:
                    _metachange[filepath] = old_filepath
                    old_path_indexes.pop(old_filepath)
                    name_path_map[filename].remove(old_filepath)
                    break
            if filepath not in _metachange:
                old_filepath = old_filepaths[0]
                _modified[filepath] = old_filepath
                old_path_indexes.pop(old_filepath)
                del name_path_map[filename][0]
            if len(name_path_map[filename]) == 0:
                name_path_map.pop(filename)
        elif datahash in old_data_indexes and datahash != __MAGIC_HASH__:
            '''
            if data remains same, but filepath changes
            '''
            old_filepath = old_data_indexes[datahash]
            if old_filepath in old_path_indexes:
                _metachange[filepath] = old_filepath
                old_path_indexes.pop(old_filepath)
                old_data_indexes.pop(datahash)
                basename = os.path.basename(old_filepath)
                if basename in name_path_map:
                    if old_filepath in name_path_map[basename]:
                        name_path_map[basename].remove(old_filepath)
                        if len(name_path_map[basename]) == 0:
                            name_path_map.pop(basename)
            else:
                _added.append(filepath)
        else:
            _added.append(filepath)

    for old_filepath in old_path_indexes:
        _deleted.append(old_filepath)
    '''
    Saving change information in cache
    '''
    logger.info('Updating change cache entries')
    change_id = dacman_utils.hash_comparison_id(old_datapath, new_datapath)
    cachedir = os.path.join(stagingdir, 'cache')
    if not os.path.exists(cachedir):
        os.makedirs(cachedir)
    change_file = os.path.join(cachedir, 'ENTRIES')
    change_info = {new_datapath: {old_datapath: change_id}}
    dacman_utils.update_yaml(change_info, change_file)

    logger.info('Saving change measurements')

    change_dir = os.path.join(cachedir, change_id)
    if not os.path.exists(change_dir):
        os.makedirs(change_dir)

    _meta_info = {
        'base': {
            'dataset_id': old_datapath,
            'nfiles': dacman_utils.get_nfiles(old_datapath, stagingdir)
        },
        'revision': {
            'dataset_id': new_datapath,
            'nfiles': dacman_utils.get_nfiles(new_datapath, stagingdir)
        }
    }
    _metafile = os.path.join(change_dir, 'META_INFO')
    _ufile = os.path.join(change_dir, 'UNCHANGED')
    _afile = os.path.join(change_dir, 'ADDED')
    _dfile = os.path.join(change_dir, 'DELETED')
    _mfile = os.path.join(change_dir, 'MODIFIED')
    _mcfile = os.path.join(change_dir, 'METACHANGE')
    dacman_utils.dump_yaml(_meta_info, _metafile)
    dacman_utils.list_to_file(_unchanged, _ufile)
    dacman_utils.list_to_file(_added, _afile)
    dacman_utils.list_to_file(_deleted, _dfile)
    dacman_utils.dict_to_file(_modified, _mfile)
    dacman_utils.dict_to_file(_metachange, _mcfile)

    logger.info('Directory comparison complete')

    return change_id
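compare() leaves its results under <stagingdir>/cache: the ENTRIES YAML maps a new path to {old path: change_id}, and the directory named after change_id holds META_INFO, UNCHANGED, ADDED, DELETED, MODIFIED and METACHANGE. A minimal sketch of reading that layout back, assuming one path per line in the list files (MODIFIED and METACHANGE use whatever format dacman_utils.dict_to_file produces, so they are left out here):

import os
import yaml


def summarize_change(stagingdir, old_datapath, new_datapath):
    # Hypothetical sketch: locate a cached comparison and report counts.
    cachedir = os.path.join(stagingdir, 'cache')
    with open(os.path.join(cachedir, 'ENTRIES')) as f:
        entries = yaml.safe_load(f)
    change_id = entries[new_datapath][old_datapath]
    change_dir = os.path.join(cachedir, change_id)

    counts = {}
    for name in ('UNCHANGED', 'ADDED', 'DELETED'):
        with open(os.path.join(change_dir, name)) as f:
            counts[name.lower()] = sum(1 for line in f if line.strip())
    return change_id, counts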
Example #8
def scan(datapath,
         custom_stagingdir=None,
         nonrecursive=False,
         symlinks=False,
         details=False,
         ignorelist=None):
    logger = logging.getLogger(__name__)

    # guard against the mutable-default-argument pitfall
    if ignorelist is None:
        ignorelist = []

    if not os.path.exists(datapath):
        #cprint(__modulename__, 'Datapath `{}` does not exist!'.format(datapath))
        logger.error('Datapath %s does not exist!', datapath)
        sys.exit()

    if not os.path.isdir(datapath):
        #print('Indexing currently allowed only for data in a directory.')
        #cprint(__modulename__, 'Indexing currently allowed only for data in a directory.')
        logger.error(
            'Indexing currently allowed only for data in a directory.')
        sys.exit()

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))

    if not os.path.exists(indexdir):
        os.makedirs(indexdir)

    entries = []
    follow_symlinks = symlinks
    excluded_dirs = {'.dacman': True}
    metainfo = {}

    # keeping this for feature enhancements and future optimizations
    #dirtree = DirectoryTree(datapath, stagingdir)
    #cprint(__modulename__, 'Scanning datapath {}'.format(datapath))
    logger.info('Scanning datapath %s', datapath)

    ### Doing this is too slow ###
    """
   for entry in scantree(datapath, excluded_dirs, follow_symlinks):
      filepath = entry.path
      relative_path = os.path.relpath(filepath, datapath)
      '''
      ### Doing this is too slow ###
      # saving only info about the files and not directories
      if not entry.is_dir(follow_symlinks=False):
         metainfo[relative_path] = get_metadata(entry)
      '''
      # keeping this for feature enhancements and future optimizations
      #dirtree.add(entry.path)

   #dirtree.save()
   #dirtree.close()

   '''
   meta_path = os.path.join(stagingdir, 'METADATA')    
   cprint(__modulename__, 'Dumping metadata')
   dump(metainfo, meta_path)
   '''
   """
    scan_funcs = {False: scantree, True: scan_only_dir}
    scan_fn = scan_funcs[nonrecursive]

    paths_file = os.path.join(indexdir, 'FILEPATHS')
    meta_file = os.path.join(indexdir, 'METADATA')
    # open the metadata file
    mf = open(meta_file, 'w')

    if nonrecursive:
        logger.info(
            'Ignoring subdirectory scans: scanning files only in the present directory'
        )
    '''
   if there is no file to ignore
   '''
    if len(ignorelist) == 0:
        with open(paths_file, 'w') as f:
            for entry in scan_fn(datapath, excluded_dirs, follow_symlinks):
                filepath = entry.path
                relative_path = os.path.relpath(filepath, datapath)
                '''
            only save the file paths and not dir paths
            '''
                if not entry.is_dir(follow_symlinks=symlinks):
                    line = '{}\n'.format(relative_path)
                    f.write(line)
                    if details:
                        file_stats = entry.stat()
                        owner = pwd.getpwuid(file_stats.st_uid).pw_name
                        group = grp.getgrgid(file_stats.st_gid).gr_name
                        size = file_stats.st_size
                        #mtime = datetime.fromtimestamp(file_stats.st_mtime).strftime("%d %B %Y %I:%M:%S")
                        # File modification time doesn't make sense here, because we compare two versions
                        #mtime = file_stats.st_mtime
                        metadata = '{}:owner={},group={},size={}\n'.format(
                            relative_path, owner, group, size)
                        #metadata = relative_path+':owner='+owner+',group='+group+\
                        #    ',size='+str(size)+',mtime='+str(mtime)+'\n'
                        mf.write(metadata)
    else:
        with open(paths_file, 'w') as f:
            for entry in scan_fn(datapath, excluded_dirs, follow_symlinks):
                filepath = entry.path
                relative_path = os.path.relpath(filepath, datapath)
                ignore_file = False
                for ignore_pattern in ignorelist:
                    if fnmatch.fnmatch(relative_path, ignore_pattern):
                        ignore_file = True
                        break
                '''
            only save the file paths and not dir paths
            '''
                if not (ignore_file or entry.is_dir(follow_symlinks=symlinks)):
                    line = '{}\n'.format(relative_path)
                    f.write(line)
                    if details:
                        file_stats = entry.stat()
                        owner = pwd.getpwuid(file_stats.st_uid).pw_name
                        group = grp.getgrgid(file_stats.st_gid).gr_name
                        size = file_stats.st_size
                        #mtime = datetime.fromtimestamp(file_stats.st_mtime).strftime("%d %B %Y %I:%M:%S")
                        #mtime = file_stats.st_mtime
                        metadata = '{}:owner={},group={},size={}\n'.format(
                            relative_path, owner, group, size)
                        #metadata = relative_path+':owner='+owner+',group='+group+\
                        #    ',size='+str(size)+',mtime='+str(mtime)+'\n'
                        mf.write(metadata)

    logger.info('Saving path metadata and directory scan information')

    basepath_file = os.path.join(indexdir, 'DATAPATH')
    with open(basepath_file, 'w') as f:
        f.write('{}\n'.format(datapath))

    # close the metadata file
    mf.close()

    #cprint(__modulename__, 'Scan complete')
    logger.info('Directory scan complete')

    return indexdir
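scan() iterates over scantree(datapath, excluded_dirs, follow_symlinks) (or scan_only_dir when nonrecursive is set) and expects os.DirEntry objects with .path, .is_dir() and .stat(). Neither generator appears in this listing; a minimal sketch of a recursive scantree built on os.scandir, assuming excluded_dirs maps directory names to skip:

import os


def scantree(datapath, excluded_dirs, follow_symlinks):
    # Hypothetical sketch: yield every os.DirEntry under datapath,
    # recursing into subdirectories while skipping excluded names
    # (for example '.dacman').
    for entry in os.scandir(datapath):
        if entry.is_dir(follow_symlinks=follow_symlinks):
            if entry.name in excluded_dirs:
                continue
            yield entry
            yield from scantree(entry.path, excluded_dirs, follow_symlinks)
        else:
            yield entry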
Example #9
def mpi_index(custom_stagingdir, datapath):    
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    status = MPI.Status()
    
    class States():
        READY = 0
        START = 1
        DONE = 2
        EXIT = 3

    if rank == 0:
        stagingdir = check_stagingdir(custom_stagingdir, datapath)
        '''
        deduce_metadata = os.path.join(stagingdir, 'METADATA')
        if not os.path.exists(deduce_metadata):
        scanner.scan(datapath, stagingdir)
        '''
        indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
        deduce_file = os.path.join(indexdir, 'FILEPATHS')
        if not os.path.exists(deduce_file):
            scanner.scan(datapath, stagingdir)

        filelist = []
        file_num = 0
        closed_workers = 0
        num_workers = size - 1
        indexes = []

        filelist = read_filelist(deduce_file)

        logger.info('Indexing %d files', len(filelist))
        while closed_workers < num_workers:
            result = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
            source = status.Get_source()
            tag = status.Get_tag()
            
            if tag == States.READY:
                if file_num < len(filelist):
                    comm.send(filelist[file_num], dest=source, tag=States.START)
                    file_num += 1
                else:
                    comm.send(None, dest=source, tag=States.EXIT)
            elif tag == States.DONE:
                indexes.append(result)
            elif tag == States.EXIT:
                closed_workers += 1

        save_indexes(indexdir, indexes)

        return indexdir
    else:
        while True:
            comm.send(None, dest=0, tag=States.READY)
            filename = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
            tag = status.Get_tag()

            if tag == States.START:
                index = calculate_hash(datapath, filename)
                comm.send(index, dest=0, tag=States.DONE)
            elif tag == States.EXIT:
                comm.send(None, dest=0, tag=States.EXIT)
                break
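mpi_index() assumes it runs under an MPI launcher: rank 0 coordinates, every other rank hashes files, and only rank 0 returns the index directory. A minimal driver sketch, assuming mpi4py is installed and that the function lives in a module named indexer (a hypothetical name):

# index_driver.py -- launch with, e.g.: mpirun -n 8 python index_driver.py
from indexer import mpi_index  # hypothetical module name

if __name__ == '__main__':
    # Only rank 0 returns the index directory; worker ranks return None.
    indexdir = mpi_index('/tmp/dacman_staging', '/data/als/run42')
    if indexdir is not None:
        print('Indexes written to', indexdir)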
Example #10
def changes(old_datapath, new_datapath, force=False, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    #old_datapath = os.path.abspath(old_path)
    #new_datapath = os.path.abspath(new_path)

    if old_datapath == new_datapath:
       logger.error('Comparison paths are the same')
       sys.exit()

    if not custom_stagingdir:
       stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
       stagingdir = custom_stagingdir

    is_subdir_oldpath = False
    is_subdir_newpath = False
    cached_old_path = old_datapath
    cached_new_path = new_datapath

    cachedir = os.path.join(stagingdir, 'cache')
    cache_entries = os.path.join(cachedir, 'ENTRIES')

    logger.info('Checking for changes between %s and %s', old_datapath, new_datapath)

    '''
    This is the caching logic where change information is saved and subsequently retrieved
    If no high-level diff exists for the data, then do a comparison
    - do the comparison for the all the indexed data
    - at runtime, decide if the comparison is between any subdirectories of the total diff
    '''
    if not os.path.exists(cache_entries) or force:
        logger.info('Cache is empty... starting dataset comparison')
        change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)
    else:
        logger.info('Checking for pre-calculated and cached changes.')
        '''
        if the high-level diff exists, then check if it exists for the two data versions provided here
        '''
        with open(cache_entries, 'r') as f:
            cache = yaml.safe_load(f)
            '''
            if changes for the newpath are in cache, then
            check if they are for the compared oldpath 
            '''
            if new_datapath in cache:
               '''
               if the diff paths are already compared, then get the corresponding directory;
               else, do the comparisons/diff
               '''
               if old_datapath in cache[new_datapath]:
                  '''
                  if both oldpath and newpath are in the cache
                  '''
                  logger.info('Changes are present in cache... fetching change information.')
                  change_dir = cache[new_datapath][old_datapath]
               else:
                  '''
                  check if the oldpath is a subdirectory of a cached path change
                  '''
                  for o in cache[new_datapath]:
                     parent_path = o + os.sep
                     if old_datapath.startswith(parent_path):
                        logger.info('Changes can be derived from the cache.')
                        change_dir = cache[new_datapath][o]
                        cached_old_path = os.path.abspath(parent_path)
                        break
                     '''
                     if the oldpath is neither in cache nor is a subdir of a cache entry,
                     then it's a new comparison
                     '''
                  else:
                     logger.info('Changes are not cached... initiating dataset comparison.')
                     change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)
            else:
               '''
               if changes for the original newpath are not in cache,
               check if any parent directory changes are in cache
               '''
               d = os.path.dirname(new_datapath)
               '''
               check if any parent dir changes are calculated and cached
               '''
               while d != '/' and d not in cache:
                  d = os.path.dirname(d)
                  
               '''
               if changes for a matching parent are found,
               then check if oldpath changes are cached 
               '''
               if d != '/':
                  if old_datapath in cache[d]:
                     change_dir = cache[d][old_datapath]
                     cached_new_path = d
                  else:
                     for o in cache[d]:
                        parent_path = o + os.sep
                        if old_datapath.startswith(parent_path):
                           logger.info('Subdirectory changes can be derived from cache.')
                           change_dir = cache[d][o]
                           cached_old_path = os.path.abspath(parent_path)
                           cached_new_path = d
                           break
                     else:
                        logger.info('Changes are not pre-calculated... initiating dataset comparison.')
                        change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)
               else:
                  '''
                  if changes are not present in the cache, then compare
                  '''
                  logger.info('Changes are not pre-calculated... initiating dataset comparison.')
                  change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)

    if cached_old_path != old_datapath:
       is_subdir_oldpath = True

    if cached_new_path != new_datapath:
       is_subdir_newpath = True

    logger.info('Retrieving changes between %s and %s', old_datapath, new_datapath)

    #change = FilesystemChange(old_datapath, new_datapath, stagingdir)
    change = FilesystemChange(cached_old_path, cached_new_path, stagingdir)

    if is_subdir_newpath:
       indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(cached_new_path))
       subdir_nfiles = get_subdir_nfiles(new_datapath, indexdir)
       change.new_nfiles = subdir_nfiles

    if is_subdir_oldpath:
       indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(cached_old_path))
       subdir_nfiles = get_subdir_nfiles(old_datapath, indexdir)
       change.old_nfiles = subdir_nfiles

    change_data_dir = os.path.join(cachedir, change_dir)
    #print(change_data_dir)
    if not (is_subdir_oldpath or is_subdir_newpath):
       set_change_from_cache(change, change_data_dir)
    else:
       compare_hash = dacman_utils.hash_comparison_id(old_datapath, new_datapath)
       change_data_subdir = os.path.join(cachedir, compare_hash)
       if os.path.exists(change_data_subdir):
          set_change_from_cache(change, change_data_subdir)
       else:
          save_subdir_changes_to_cache(change, stagingdir,
                                       cached_old_path, cached_new_path,
                                       old_datapath, new_datapath,
                                       is_subdir_oldpath, is_subdir_newpath,
                                       change_data_dir, change_data_subdir)

          logger.info('Updating change cache entries')
          change_id = dacman_utils.hash_comparison_id(old_datapath, new_datapath)
          change_file = os.path.join(cachedir, 'ENTRIES')
          change_info = {new_datapath : {old_datapath: change_id}}
          dacman_utils.update_yaml(change_info, change_file)

    logger.info('Change retrieval completed')

    return change
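A minimal usage sketch for changes(), with a hypothetical import path. The returned FilesystemChange exposes a modified mapping from new-version relative paths to old-version relative paths, which is how Example #11 consumes it; nothing else about the object is assumed here:

# Hypothetical import path; adjust to wherever changes() is defined.
from dacman.core.change import changes

change = changes('/data/als/run42_v1', '/data/als/run42_v2')

# modified maps each changed file in the new version to its counterpart
# in the old version (relative paths), as consumed by Example #11.
for new_rel, old_rel in change.modified.items():
    print('{} -> {}'.format(old_rel, new_rel))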
Example #11
    def get_change_pairs(self):
        if not (self.old_path and self.new_path):
            self.logger.error('Old and new datapaths are not specified!')
            sys.exit()

        change_pairs = []

        old_base = self.old_path
        new_base = self.new_path
        self.logger.info('Starting diff calculation')
        if self.old_path_is_file and self.new_path_is_file:
            change_pairs.append((self.old_path, self.new_path))
            return change_pairs
        elif self.old_path_is_file != self.new_path_is_file:
            self.logger.error('Datapaths are of different types')
            sys.exit()
        '''
        check if indexes on the data are present
        else, check for data types and invoke parallel comparison
        '''
        old_index_path = None
        new_index_path = None
        is_indexed = False
        indexdir = os.path.join(self.stagingdir, 'indexes')
        index_metafile = os.path.join(indexdir, 'INDEXED_PATHS')

        if os.path.exists(index_metafile):
            indexed_paths = dacman_utils.load_yaml(index_metafile)
            paths_indexed = [False, False]
            for path in indexed_paths:
                p = path + os.sep
                if self.old_path.startswith(p) or self.old_path == path:
                    old_index_path = os.path.join(
                        indexdir, get_hash_id(os.path.abspath(path)))
                    paths_indexed[0] = True
                if self.new_path.startswith(p) or self.new_path == path:
                    new_index_path = os.path.join(
                        indexdir, get_hash_id(os.path.abspath(path)))
                    paths_indexed[1] = True
                if all(paths_indexed):
                    is_indexed = True
                    break

        if is_indexed:
            changeManager = ChangeManager(self.old_path, self.new_path, False,
                                          self.stagingdir)
            status, cached_old_path, cached_new_path = changeManager.get_cached_paths()
            change_data = changeManager.get_changes(status, cached_old_path,
                                                    cached_new_path)

            old_datapath_file = os.path.join(old_index_path, 'DATAPATH')
            new_datapath_file = os.path.join(new_index_path, 'DATAPATH')

            old_filelist = os.path.join(old_index_path, 'FILEPATHS')
            new_filelist = os.path.join(new_index_path, 'FILEPATHS')

            with open(old_datapath_file) as f:
                old_basepath = f.readline().split('\n')[0]

            with open(new_datapath_file) as f:
                new_basepath = f.readline().split('\n')[0]

            with open(old_filelist) as f:
                for relpath in f:
                    filepath = os.path.join(old_basepath, relpath.strip())
                    if filepath == self.old_path:
                        self.old_path_is_file = True
                        break

            with open(new_filelist) as f:
                for relpath in f:
                    filepath = os.path.join(new_basepath, relpath.strip())
                    if filepath == self.new_path:
                        self.new_path_is_file = True
                        break
        else:
            self.logger.warning(
                'Datapaths are not indexed. Trying to locate and index the data...'
            )
            '''
            The code below allows to check for a diff between any two random files
            '''
            # change_data = change.changes(old_base, new_base, False, self.stagingdir)
            changeManager = ChangeManager(old_base, new_base, False,
                                          self.stagingdir)
            status, cached_old_path, cached_new_path = changeManager.get_cached_paths()
            change_data = changeManager.get_changes(status, cached_old_path,
                                                    cached_new_path)

        changes = change_data.modified

        self.logger.info('Searching for path indexes')
        '''
        find the old and new base directories which are indexed through
        '''
        path_prefix_new = cached_new_path
        path_prefix_old = cached_old_path
        '''
        save the metadata about the high-level diff between the directories
        '''
        if not self.old_path_is_file:
            if self.save_changes:
                self._save_dir_diff(change_data)
                self.logger.info('Change summary saved in: %s', self.outdir)
            change.display(change_data)
            '''
            for each file level change, a detailed change analysis is reqd
            '''
            for change_key in changes:
                new_path = os.path.join(path_prefix_new, change_key)
                old_path = os.path.join(path_prefix_old, changes[change_key])
                change_pairs.append((new_path, old_path))
        else:
            rel_new_path = os.path.relpath(self.new_path, path_prefix_new)
            rel_old_path = os.path.relpath(self.old_path, path_prefix_old)
            if rel_new_path in changes and changes[
                    rel_new_path] == rel_old_path:
                change_pairs.append((self.new_path, self.old_path))

        return change_pairs