def extract_netcdf_variable_recursive(output, data, level_desc, tree, options,
                                      check_empty=False, hdf5=None):
    """Recursively descend the netCDF group tree of ``data`` and replicate
    the requested branch(es) into ``output``.

    Parameters
    ----------
    output : netCDF4 Dataset/Group written into.
    data : netCDF4 Dataset/Group read from.
    level_desc : (level_name, group_name) pair for the current tree level.
        ``group_name`` of None or a list means "every matching subgroup";
        '' means "stay at this level"; otherwise it names one subgroup.
    tree : remaining (level_name, group_name) pairs to descend.
    options : options object passed to the nc_Database level filters.
    check_empty : forwarded to read_netCDF_pointers.replicate.
    hdf5 : optional parallel HDF5 group tree, indexed by group name at
        each level when provided.
    """
    level_name = level_desc[0]
    group_name = level_desc[1]
    if group_name is None or isinstance(group_name, list):
        # Wildcard level: visit every subgroup that passes the filters.
        for group in data.groups.keys():
            if (nc_Database.is_level_name_included_and_not_excluded(level_name, options, group)
                    and nc_Database.retrieve_tree_recursive_check_not_empty(options, data.groups[group])):
                output_grp = replicate_group(output, data, group)
                # BUGFIX: the original indexed hdf5[group] unconditionally in the
                # recursive call, raising TypeError when hdf5 is None; guard it
                # like the other branches do.
                sub_hdf5 = hdf5[group] if hdf5 is not None else None
                if len(tree) > 0:
                    extract_netcdf_variable_recursive(output_grp, data.groups[group],
                                                      tree[0], tree[1:], options,
                                                      check_empty=check_empty,
                                                      hdf5=sub_hdf5)
                else:
                    netcdf_pointers = read_soft_links.read_netCDF_pointers(data.groups[group])
                    if sub_hdf5 is not None:
                        netcdf_pointers.replicate(output_grp, check_empty=check_empty, hdf5=sub_hdf5)
                    else:
                        netcdf_pointers.replicate(output_grp, check_empty=check_empty)
    else:
        if len(tree) > 0:
            if group_name == '':
                # Empty name: consume this level without changing group.
                extract_netcdf_variable_recursive(output, data, tree[0], tree[1:],
                                                  options, check_empty=check_empty,
                                                  hdf5=hdf5)
            elif group_name in data.groups.keys():
                if hdf5 is not None:
                    extract_netcdf_variable_recursive(output, data.groups[group_name],
                                                      tree[0], tree[1:], options,
                                                      check_empty=check_empty,
                                                      hdf5=hdf5[group_name])
                else:
                    extract_netcdf_variable_recursive(output, data.groups[group_name],
                                                      tree[0], tree[1:], options,
                                                      check_empty=check_empty)
        else:
            # Leaf level: replicate the soft-linked pointers of the named group.
            netcdf_pointers = read_soft_links.read_netCDF_pointers(data.groups[group_name])
            if hdf5 is not None:
                netcdf_pointers.replicate(output, check_empty=check_empty, hdf5=hdf5[group_name])
            else:
                netcdf_pointers.replicate(output, check_empty=check_empty)
    return
def descend_tree_recursive(database, file_expt, tree_desc, top_path, options,
                           list_level=None, alt=False):
    """Walk one level of the on-disk directory tree described by ``tree_desc``.

    At the leaf level, records every ``*.nc`` file under ``top_path`` into the
    database session and returns the file list. When ``list_level`` names the
    current level, returns the subdirectory names instead. Otherwise recurses
    into each accepted subdirectory and returns the flattened results.
    """
    if not isinstance(tree_desc, list):
        return
    if len(tree_desc) == 1:
        # Leaf level: harvest the netCDF files directly under top_path.
        found_files = glob.glob(top_path + '/*.nc')
        for one_file in found_files:
            record = copy.deepcopy(file_expt)
            # file_expt_copy.path='|'.join([file,retrieval_utils.md5_for_file(open(file,'r'))])
            record.path = one_file + '|'
            if alt:
                # Alternative layout: fold the version suffix out of the model name.
                record.model_version = record.model.split('-')[1]
                record.model = '-'.join([record.institute, record.model.split('-')[0]])
            database.nc_Database.session.add(record)
            database.nc_Database.session.commit()
        return found_files

    level_name = tree_desc[0]
    remaining_desc = tree_desc[1:]
    requested_key = level_name + '_list'
    kept_subdirs = []
    # Loop through subdirectories:
    for entry in get_immediate_subdirectories(top_path):
        if requested_key in database.header_simple.keys():
            # Keep only the subdirectories that were explicitly requested.
            if entry in database.header_simple[requested_key]:
                kept_subdirs.append(entry)
        elif not (level_name == 'version'
                  and (entry == 'latest' or not RepresentsInt(entry[1:]))):
            # Keep all other subdirs as long as they are
            # 1) not the 'latest' symlink, and
            # 2) of the form v{int}.
            kept_subdirs.append(entry)

    if list_level is not None and level_name == list_level:
        return kept_subdirs

    collected = []
    for entry in kept_subdirs:
        record = copy.deepcopy(file_expt)
        setattr(record, level_name, entry)
        # Include only subdirectories passing the include/exclude filters:
        if nc_Database.is_level_name_included_and_not_excluded(level_name, options, entry):
            collected.append(descend_tree_recursive(database, record,
                                                    remaining_desc,
                                                    top_path + '/' + entry,
                                                    options,
                                                    list_level=list_level, alt=alt))
    return [item for sublist in collected for item in sublist]
def retrieve_without_time(self, retrieval_function, output, semaphores=None,
                          username=None, user_pass=None):
    """Queue retrieval of every file in ``self.paths_list`` without any
    time subsetting.

    Parameters
    ----------
    retrieval_function : object placed first in the queued work tuple.
    output : destination file path forwarded to the retrieval worker.
    semaphores : accepted for interface symmetry; unused here.
    username, user_pass : credentials forwarded to the worker.
    """
    # This function simply retrieves all the files:
    file_path = output
    # Hoisted: the original rebuilt list(self.paths_list) and recomputed
    # .index(path_to_retrieve) three times for every path.
    paths = list(self.paths_list)
    for path_to_retrieve in self.paths_list:
        path_idx = paths.index(path_to_retrieve)
        file_type = self.file_type_list[path_idx]
        version = 'v' + str(self.version_list[path_idx])
        checksum = self.checksums_list[path_idx]
        # Get the file tree:
        args = ({'path': path_to_retrieve + '|' + checksum,
                 'var': self.tree[-1],
                 'file_path': file_path,
                 'version': version,
                 'file_type': file_type,
                 'username': username,
                 'user_pass': user_pass},
                copy.deepcopy(self.tree))
        # Retrieve only if it is from the requested data node:
        data_node = retrieval_utils.get_data_node(path_to_retrieve, file_type)
        if nc_Database.is_level_name_included_and_not_excluded('data_node', self, data_node):
            if data_node in self.queues.keys():
                # Single-argument call form prints identically under
                # Python 2 (parenthesized expression) and Python 3.
                print('Recovering ' + '/'.join(self.tree))
                self.queues[data_node].put((retrieval_function, ) + copy.deepcopy(args))
    return
def retrieve_variables(self, retrieval_function, var_to_retrieve, time_restriction, output, semaphores=None, username=None, user_pass=None):
    """Retrieve one variable, following soft links and a time restriction.

    Parameters:
        retrieval_function: name of the function (looked up on
            retrieval_utils) that performs the actual transfer.
        var_to_retrieve: variable name within self.data_root.
        time_restriction: index array/slice selecting the time steps to fetch.
        output: either an open netCDF4 Dataset/Group (variable is replicated
            into it) or a destination file path (work is queued per data node).
        semaphores, username, user_pass: forwarded to the remote workers.

    NOTE(review): relies on self.data_root, self.paths_list, self.paths_id_list,
    self.checksums_list, self.version_list, self.file_type_list, self.tree and
    self.queues being prepared by the caller -- not visible in this block.
    """
    #Replicate variable to output:
    if (isinstance(output, netCDF4.Dataset) or isinstance(output, netCDF4.Group)):
        output = netcdf_utils.replicate_netcdf_var(output,
                                                   self.data_root,
                                                   var_to_retrieve,
                                                   chunksize=-1,
                                                   zlib=True)
        #file_path=output.filepath()
        file_path = None
        if not 'soft_links' in self.data_root.groups.keys():
            #Variable is stored here and simply retrieve it:
            output.variables[var_to_retrieve][:] = self.data_root.variables[var_to_retrieve][time_restriction]
            return
    else:
        # output is a destination path; retrieval is queued below.
        file_path = output

    # Gather the non-time dimensions (values and lengths); the 'time' entry
    # is filled in per chunk inside the loop further down.
    dimensions = dict()
    unsort_dimensions = dict()
    dims_length = []
    for dim in self.data_root.variables[var_to_retrieve].dimensions:
        if dim != 'time':
            if dim in self.data_root.variables.keys():
                dimensions[dim] = self.data_root.variables[dim][:]
            else:
                # No coordinate variable: fall back to plain index positions.
                dimensions[dim] = np.arange(len(self.data_root.dimensions[dim]))
            unsort_dimensions[dim] = None
            dims_length.append(len(dimensions[dim]))

    # Determine the paths_ids for soft links:
    paths_link = self.data_root.groups['soft_links'].variables[var_to_retrieve][time_restriction, 0]
    indices_link = self.data_root.groups['soft_links'].variables[var_to_retrieve][time_restriction, 1]
    #Convert paths_link to id in path dimension:
    paths_link = np.array([list(self.paths_id_list).index(path_id) for path_id in paths_link])
    #Sort the paths so that we query each only once:
    unique_paths_list_id, sorting_paths = np.unique(paths_link, return_inverse=True)

    #Maximum number of time step per request:
    max_request = 450  # maximum request in Mb
    # 32 bytes per element assumed in the sizing below -- TODO confirm.
    max_time_steps = max(int(np.floor(max_request * 1024 * 1024 / (32 * np.prod(dims_length)))), 1)

    for unique_path_id, path_id in enumerate(unique_paths_list_id):
        path_to_retrieve = self.paths_list[path_id]
        #Next, we check if the file is available. If it is not we replace it
        #with another file with the same checksum, if there is one!
        file_type = self.file_type_list[list(self.paths_list).index(path_to_retrieve)]
        remote_data = remote_netcdf.remote_netCDF(path_to_retrieve.replace('fileServer', 'dodsC'), file_type, semaphores)
        if not file_type in ['FTPServer']:
            path_to_retrieve = remote_data.check_if_available_and_find_alternative([path.replace('fileServer', 'dodsC') for path in self.paths_list], self.checksums_list).replace('dodsC', 'fileServer')
        #Get the file_type, checksum and version of the file to retrieve:
        file_type = self.file_type_list[list(self.paths_list).index(path_to_retrieve)]
        version = 'v' + str(self.version_list[list(self.paths_list).index(path_to_retrieve)])
        checksum = self.checksums_list[list(self.paths_list).index(path_to_retrieve)]
        #Append the checksum:
        path_to_retrieve += '|' + checksum

        #time_indices=sorted_indices_link[sorted_paths_link==path_id]
        time_indices = indices_link[sorting_paths == unique_path_id]
        # Split the request into chunks of at most max_time_steps time steps:
        num_time_chunk = int(np.ceil(len(time_indices) / float(max_time_steps)))
        for time_chunk in range(num_time_chunk):
            time_slice = slice(time_chunk * max_time_steps, (time_chunk + 1) * max_time_steps, 1)
            dimensions['time'], unsort_dimensions['time'] = indices_utils.prepare_indices(time_indices[time_slice])
            #Get the file tree:
            args = ({'path': path_to_retrieve,
                     'var': var_to_retrieve,
                     'indices': dimensions,
                     'unsort_indices': unsort_dimensions,
                     'sort_table': np.arange(len(sorting_paths))[sorting_paths == unique_path_id][time_slice],
                     'file_path': file_path,
                     'version': version,
                     'file_type': file_type,
                     'username': username,
                     'user_pass': user_pass}, copy.deepcopy(self.tree))
            #'sort_table':np.argsort(sorting_paths)[sorted_paths_link==path_id][time_slice],
            #Retrieve only if it is from the requested data node:
            data_node = retrieval_utils.get_data_node(path_to_retrieve, file_type)
            if nc_Database.is_level_name_included_and_not_excluded('data_node', self, data_node):
                if data_node in self.queues.keys():
                    if ((isinstance(output, netCDF4.Dataset) or isinstance(output, netCDF4.Group)) or time_chunk == 0):
                        #If it is download: retrieve
                        #If it is download_raw: retrieve only first time_chunk
                        if var_to_retrieve == self.tree[-1]:
                            #print 'Recovering '+var_to_retrieve+' in '+path_to_retrieve
                            print 'Recovering ' + '/'.join(self.tree)
                            self.queues[data_node].put((retrieval_function, ) + copy.deepcopy(args))
                        else:
                            # Non-target variable (e.g. a coordinate): retrieve
                            # synchronously into the open output dataset.
                            if (isinstance(output, netCDF4.Dataset) or isinstance(output, netCDF4.Group)):
                                #netcdf_utils.assign_tree(output,*getattr(netcdf_utils,retrieval_function)(args[0],args[1]))
                                netcdf_utils.assign_tree(output, *getattr(retrieval_utils, retrieval_function)(args[0], args[1]))
    return
def descend_tree_recursive(database, file_expt, tree_desc, top_path, options, ftp,
                           list_level=None, alt=False):
    """Recursively walk an FTP directory tree mirroring ``tree_desc``.

    At the leaf level, records every ``*.nc`` file under ``top_path`` into the
    database session and returns the file names. When ``list_level`` names the
    current level, returns the directory names instead. Otherwise recurses into
    each accepted directory and returns the flattened results.
    """
    if not isinstance(tree_desc, list):
        return

    # Position the FTP session at top_path (strip the scheme/host prefix):
    try:
        ftp.cwd("/" + "/".join(top_path.split("/")[3:]))
    except ftplib.error_perm:
        return []

    if len(tree_desc) == 1:
        # If we're at the end of the tree, we should expect files:
        listing = ftp.nlst()
        # ftp.voidcmd("TYPE I")
        nc_names = [name for name in listing
                    if name.endswith(".nc") and len(name) > 3]
        # if (len(file_name)>3 and file_name[-3:]=='.nc' and ftp.size(file_name)>0)]
        # ftp.voidcmd("TYPE A")
        for name in nc_names:
            record = copy.deepcopy(file_expt)
            # file_expt_copy.path='|'.join([file,retrieval_utils.md5_for_file(open(file,'r'))])
            record.path = top_path + "/" + name + "|"
            if alt:
                # Alternative layout: split the version suffix off the model name.
                record.model_version = record.model.split("-")[1]
                record.model = "-".join([record.institute, record.model.split("-")[0]])
            database.nc_Database.session.add(record)
            database.nc_Database.session.commit()
        return nc_names

    # We're not at the end of the tree, we should expect directories:
    level_name = tree_desc[0]
    remaining_desc = tree_desc[1:]
    requested_key = level_name + "_list"
    kept = []
    for name in ftp.nlst():
        if requested_key in database.header_simple.keys():
            # Keep only the directories that were explicitly requested.
            if name in database.header_simple[requested_key]:
                kept.append(name)
        elif not (level_name == "version"
                  and (name == "latest" or not RepresentsInt(name[1:]))):
            # Keep all other dirs as long as they are
            # 1) not the 'latest' symlink, and
            # 2) of the form v{int}.
            kept.append(name)

    if list_level is not None and level_name == list_level:
        return kept

    results = []
    for name in kept:
        record = copy.deepcopy(file_expt)
        setattr(record, level_name, name)
        # Include only directories passing the include/exclude filters:
        if nc_Database.is_level_name_included_and_not_excluded(level_name, options, name):
            results.append(
                descend_tree_recursive(
                    database,
                    record,
                    remaining_desc,
                    top_path + "/" + name,
                    options,
                    ftp,
                    list_level=list_level,
                    alt=alt,
                )
            )
    return [item for sublist in results for item in sublist]
def descend_tree_recursive(database, file_expt, tree_desc, top_path, options, ftp,
                           list_level=None, alt=False):
    """Recursively descend an FTP directory tree mirroring ``tree_desc``,
    adding any *.nc leaf files to the database session.

    Returns the leaf file names, the directory names when ``list_level``
    matches the current level, or the flattened recursive results.

    Idiom fixes over the original: ``is not None`` instead of ``!= None``,
    ``not in`` instead of ``not x in``, and the repeated
    ``local_tree_desc + '_list'`` key construction hoisted out of the loop.
    """
    if not isinstance(tree_desc, list):
        return
    # Make sure we're at the top_path (strip the scheme/host components):
    try:
        ftp.cwd('/' + '/'.join(top_path.split('/')[3:]))
    except ftplib.error_perm:
        return []
    if len(tree_desc) == 1:
        # If we're at the end of the tree, we should expect files:
        file_list_raw = ftp.nlst()
        #ftp.voidcmd("TYPE I")
        file_list = [file_name for file_name in file_list_raw
                     if len(file_name) > 3 and file_name[-3:] == '.nc']
        #if (len(file_name)>3 and file_name[-3:]=='.nc' and ftp.size(file_name)>0)]
        #ftp.voidcmd("TYPE A")
        for file_name in file_list:
            file_expt_copy = copy.deepcopy(file_expt)
            #file_expt_copy.path='|'.join([file,retrieval_utils.md5_for_file(open(file,'r'))])
            file_expt_copy.path = top_path + '/' + file_name + '|'
            if alt:
                # Alternative layout: fold the version suffix out of the model name.
                file_expt_copy.model_version = file_expt_copy.model.split('-')[1]
                file_expt_copy.model = '-'.join([file_expt_copy.institute,
                                                 file_expt_copy.model.split('-')[0]])
            database.nc_Database.session.add(file_expt_copy)
            database.nc_Database.session.commit()
        return file_list
    # We're not at the end of the tree, we should expect directories:
    local_tree_desc = tree_desc[0]
    next_tree_desc = tree_desc[1:]
    list_key = local_tree_desc + '_list'
    subdir_list = []
    # Loop through subdirectories:
    for subdir in ftp.nlst():
        if list_key in database.header_simple.keys():
            # We keep only the subdirectories that were requested.
            if subdir in database.header_simple[list_key]:
                subdir_list.append(subdir)
        elif not (local_tree_desc == 'version'
                  and (subdir == 'latest' or not RepresentsInt(subdir[1:]))):
            # Keep all other subdirs as long as they are
            # 1) not latest version
            # 2) of the form v{int}
            subdir_list.append(subdir)
    if list_level is not None and local_tree_desc == list_level:
        return subdir_list
    only_list = []
    for subdir in subdir_list:
        file_expt_copy = copy.deepcopy(file_expt)
        setattr(file_expt_copy, local_tree_desc, subdir)
        # Include only subdirectories that were specified if this level was specified:
        if nc_Database.is_level_name_included_and_not_excluded(local_tree_desc, options, subdir):
            only_list.append(descend_tree_recursive(database, file_expt_copy,
                                                    next_tree_desc,
                                                    top_path + '/' + subdir,
                                                    options, ftp,
                                                    list_level=list_level, alt=alt))
    return [item for sublist in only_list for item in sublist]