def create_variable(self, output, var, years, months):
    #Recover the time axis for all files:
    self.obtain_table()
    queryable_file_types_available = list(set(self.table['file_type'])
                                          .intersection(queryable_file_types))
    if len(self.table['paths']) > 0:
        if len(queryable_file_types_available) > 0:
            #Open the first queryable file and use its metadata to populate the container file:
            first_id = list(self.table['file_type']).index(queryable_file_types_available[0])
            remote_data = remote_netcdf.remote_netCDF(self.table['paths'][first_id],
                                                      self.table['file_type'][first_id],
                                                      self.semaphores)
            remote_data.open_with_error()
            netcdf_utils.replicate_netcdf_file(output, remote_data.Dataset)
        else:
            remote_data = remote_netcdf.remote_netCDF(self.table['paths'][0],
                                                      self.table['file_type'][0],
                                                      self.semaphores)
        #Convert the time axes to numbers and find the unique time axis:
        self.unique_time_axis(remote_data.Dataset, years, months)
        self.reduce_paths_ordering()
        #Create the time axis in the output:
        netcdf_utils.create_time_axis_date(output, remote_data.Dataset,
                                           self.time_axis_unique_date)
        self.create(output)
        self.record_indices(output, remote_data.Dataset, var)
        remote_data.close()
        output.sync()
    return
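#A minimal standalone sketch (not part of the original module) of the
#deduplication idea behind unique_time_axis: reduce every file's time axis to
#comparable numbers, keep one sorted copy of each value, and convert back to
#dates. The helper name is hypothetical and it assumes all files share the
#same time units and calendar.
def _sketch_unique_time_axis(time_values, units, calendar='standard'):
    import numpy as np
    import netCDF4
    #Deduplicate the numeric time values, then convert to date objects:
    unique_numbers = np.unique(np.asarray(time_values))
    return netCDF4.num2date(unique_numbers, units, calendar=calendar)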
def record_fx(self, output, username=None, user_pass=None):
    #Create soft links:
    self.create(output)
    output.groups['soft_links'].createVariable(self.var, np.float32, (), zlib=True)
    #Find the most recent version:
    most_recent_version = 'v' + str(np.max([int(item['version'][1:])
                                            for item in self.paths_list]))
    usable_paths_list = [item for item in self.paths_list
                         if item['version'] == most_recent_version]
    queryable_paths_list = [item for item in usable_paths_list
                            if item['file_type'] in queryable_file_types]

    if len(queryable_paths_list) == 0:
        temp_file_handle, temp_file_name = tempfile.mkstemp()
    try:
        if len(queryable_paths_list) == 0:
            path = usable_paths_list[0]
            #No queryable path: download the file to a temporary location.
            retrieval_utils.download_secure(path['path'].split('|')[0],
                                            temp_file_name,
                                            path['file_type'],
                                            username=username, user_pass=user_pass)
            remote_data = remote_netcdf.remote_netCDF(temp_file_name,
                                                      path['file_type'],
                                                      self.semaphores)
        else:
            #Check if the data is available:
            path = queryable_paths_list[0]
            remote_data = remote_netcdf.remote_netCDF(path['path'].split('|')[0],
                                                      path['file_type'],
                                                      self.semaphores)
            alt_path_name = remote_data.check_if_available_and_find_alternative(
                                [item['path'].split('|')[0] for item in queryable_paths_list],
                                [item['path'].split('|')[1] for item in queryable_paths_list])
            #Use the alternative path:
            path = queryable_paths_list[[item['path'].split('|')[0]
                                         for item in queryable_paths_list].index(alt_path_name)]
            remote_data = remote_netcdf.remote_netCDF(path['path'].split('|')[0],
                                                      path['file_type'],
                                                      self.semaphores)
        remote_data.retrieve_variables(output, zlib=True)

        for att in path.keys():
            if att != 'path':
                output.setncattr(att, path[att])
        output.setncattr('path', path['path'].split('|')[0])
        output.setncattr('checksum', path['path'].split('|')[1])
        output.sync()
    finally:
        #Clean up the temporary file, even if the retrieval failed:
        if len(queryable_paths_list) == 0:
            os.close(temp_file_handle)
            os.remove(temp_file_name)
    return
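#A standalone sketch (hypothetical helper, not in the original module) of the
#version-selection trick used above: ESGF-style version strings such as
#'v20120101' are compared by their integer suffix.
def _sketch_most_recent_version(versions):
    #E.g. ['v20110101', 'v20120101'] -> 'v20120101'
    return 'v' + str(max(int(version[1:]) for version in versions))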
def _recover_time(self, path):
    file_type = path['file_type']
    checksum = path['checksum']
    path_name = str(path['path']).split('|')[0]
    remote_data = remote_netcdf.remote_netCDF(path_name, file_type, self.semaphores)
    time_axis = remote_data.get_time(time_frequency=self.time_frequency,
                                     is_instant=self.is_instant)
    #One table row per time step, mapping it back to its source file:
    table_desc = [('paths', 'a255'),
                  ('file_type', 'a255'),
                  ('checksum', 'a255'),
                  ('indices', 'int32')]
    table = np.empty(time_axis.shape, dtype=table_desc)
    if len(time_axis) > 0:
        table['paths'] = np.array([str(path_name) for item in time_axis])
        table['file_type'] = np.array([str(file_type) for item in time_axis])
        table['checksum'] = np.array([str(checksum) for item in time_axis])
        table['indices'] = range(0, len(time_axis))
    return time_axis, table
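#A minimal sketch (illustrative only, with a hypothetical name) of how several
#per-file (time_axis, table) pairs produced by _recover_time could be merged
#into one table ordered by time; 'np' is numpy, as used throughout this module.
def _sketch_concatenate_tables(time_axes, tables):
    import numpy as np
    full_time = np.concatenate(time_axes)
    full_table = np.concatenate(tables)
    #Order every (path, file_type, checksum, index) row by its time value:
    order = np.argsort(full_time)
    return full_time[order], full_table[order]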
def retrieve_path_data(in_dict, pointer_var):
    path = in_dict['path'].replace('fileServer', 'dodsC').split('|')[0]
    var = in_dict['var']
    indices = copy.copy(in_dict['indices'])
    unsort_indices = copy.copy(in_dict['unsort_indices'])
    sort_table = in_dict['sort_table']

    remote_data = remote_netcdf.remote_netCDF(path, 'HTTPServer', [])
    remote_data.open_with_error()
    dimensions = remote_data.retrieve_dimension_list(var)
    for dim in dimensions:
        if dim != 'time':
            remote_dim, attributes = remote_data.retrieve_dimension(dim)
            indices[dim], unsort_indices[dim] = indices_utils.prepare_indices(
                indices_utils.get_indices_from_dim(remote_dim, indices[dim]))
    retrieved_data = remote_data.grab_indices(var, indices, unsort_indices)
    remote_data.close()
    return (retrieved_data, sort_table, pointer_var + [var])
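#An illustrative sketch (hypothetical helper) of the sort/unsort pattern that
#indices_utils.prepare_indices appears to implement: OPeNDAP-style reads
#typically want monotonically increasing indices, so the sorted indices are
#requested and the caller's ordering is restored afterwards.
def _sketch_prepare_indices(indices):
    import numpy as np
    indices = np.asarray(indices)
    sort_order = np.argsort(indices)
    sorted_indices = indices[sort_order]
    #unsort satisfies: data[sorted_indices][unsort] == data[indices]
    unsort = np.argsort(sort_order)
    return sorted_indices, unsort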
def retrieve_variables(self, retrieval_function, var_to_retrieve, time_restriction,
                       output, semaphores=None, username=None, user_pass=None):
    if (isinstance(output, netCDF4.Dataset) or isinstance(output, netCDF4.Group)):
        #Replicate the variable to the output:
        output = netcdf_utils.replicate_netcdf_var(output, self.data_root,
                                                   var_to_retrieve,
                                                   chunksize=-1, zlib=True)
        file_path = None
        if 'soft_links' not in self.data_root.groups.keys():
            #The variable is stored here; simply retrieve it:
            output.variables[var_to_retrieve][:] = self.data_root.variables[
                                                       var_to_retrieve][time_restriction]
            return
    else:
        #output is a destination file path:
        file_path = output

    dimensions = dict()
    unsort_dimensions = dict()
    dims_length = []
    for dim in self.data_root.variables[var_to_retrieve].dimensions:
        if dim != 'time':
            if dim in self.data_root.variables.keys():
                dimensions[dim] = self.data_root.variables[dim][:]
            else:
                dimensions[dim] = np.arange(len(self.data_root.dimensions[dim]))
            unsort_dimensions[dim] = None
            dims_length.append(len(dimensions[dim]))

    #Determine the path ids for the soft links:
    paths_link = self.data_root.groups['soft_links'].variables[
                     var_to_retrieve][time_restriction, 0]
    indices_link = self.data_root.groups['soft_links'].variables[
                       var_to_retrieve][time_restriction, 1]
    #Convert paths_link to an id along the path dimension:
    paths_link = np.array([list(self.paths_id_list).index(path_id)
                           for path_id in paths_link])
    #Sort the paths so that each is queried only once:
    unique_paths_list_id, sorting_paths = np.unique(paths_link, return_inverse=True)

    #Maximum request size in MB determines the number of time steps per request:
    max_request = 450
    max_time_steps = max(int(np.floor(max_request * 1024 * 1024 /
                                      (32 * np.prod(dims_length)))), 1)

    for unique_path_id, path_id in enumerate(unique_paths_list_id):
        path_to_retrieve = self.paths_list[path_id]
        #Next, check if the file is available. If it is not, replace it
        #with another file with the same checksum, if there is one:
        file_type = self.file_type_list[list(self.paths_list).index(path_to_retrieve)]
        remote_data = remote_netcdf.remote_netCDF(
                          path_to_retrieve.replace('fileServer', 'dodsC'),
                          file_type, semaphores)
        if file_type not in ['FTPServer']:
            path_to_retrieve = remote_data.check_if_available_and_find_alternative(
                                   [path.replace('fileServer', 'dodsC')
                                    for path in self.paths_list],
                                   self.checksums_list).replace('dodsC', 'fileServer')
        #Get the file_type, checksum and version of the file to retrieve:
        file_type = self.file_type_list[list(self.paths_list).index(path_to_retrieve)]
        version = 'v' + str(self.version_list[list(self.paths_list).index(path_to_retrieve)])
        checksum = self.checksums_list[list(self.paths_list).index(path_to_retrieve)]
        #Append the checksum:
        path_to_retrieve += '|' + checksum

        time_indices = indices_link[sorting_paths == unique_path_id]
        num_time_chunk = int(np.ceil(len(time_indices) / float(max_time_steps)))
        for time_chunk in range(num_time_chunk):
            time_slice = slice(time_chunk * max_time_steps,
                               (time_chunk + 1) * max_time_steps, 1)
            dimensions['time'], unsort_dimensions['time'] = indices_utils.prepare_indices(
                                                                time_indices[time_slice])
            #Assemble the retrieval arguments:
            args = ({'path': path_to_retrieve,
                     'var': var_to_retrieve,
                     'indices': dimensions,
                     'unsort_indices': unsort_dimensions,
                     'sort_table': np.arange(len(sorting_paths))[
                                       sorting_paths == unique_path_id][time_slice],
                     'file_path': file_path,
                     'version': version,
                     'file_type': file_type,
                     'username': username,
                     'user_pass': user_pass},
                    copy.deepcopy(self.tree))

            #Retrieve only if it is from the requested data node:
            data_node = retrieval_utils.get_data_node(path_to_retrieve, file_type)
            if nc_Database.is_level_name_included_and_not_excluded(
                   'data_node', self, data_node):
                if data_node in self.queues.keys():
                    #If it is download: retrieve every time_chunk.
                    #If it is download_raw: retrieve only the first time_chunk.
                    if ((isinstance(output, netCDF4.Dataset) or
                         isinstance(output, netCDF4.Group)) or time_chunk == 0):
                        if var_to_retrieve == self.tree[-1]:
                            print('Recovering ' + '/'.join(self.tree))
                        self.queues[data_node].put((retrieval_function,) +
                                                   copy.deepcopy(args))
                else:
                    if (isinstance(output, netCDF4.Dataset) or
                            isinstance(output, netCDF4.Group)):
                        netcdf_utils.assign_tree(
                            output,
                            *getattr(retrieval_utils, retrieval_function)(args[0], args[1]))
    return
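#A standalone sketch (illustrative, not the module's implementation) of the
#chunking rule used above: cap each request at max_request MB, assuming
#32 bytes per element across the non-time dimensions.
def _sketch_time_chunks(num_times, dims_length, max_request=450):
    import numpy as np
    max_time_steps = max(int(np.floor(max_request * 1024 * 1024 /
                                      (32 * np.prod(dims_length)))), 1)
    num_chunks = int(np.ceil(num_times / float(max_time_steps)))
    #E.g. num_times=1000 over a 180x360 grid yields ~7283 steps per chunk,
    #so a single chunk; a 1800x3600 grid yields 72 steps and 14 chunks.
    return [slice(chunk * max_time_steps, (chunk + 1) * max_time_steps, 1)
            for chunk in range(num_chunks)]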
def find_time_file(pointers, file_expt, file_available=False,
                   file_queryable=False, semaphores=None):
    #If the path is a remote file, we must use the time stamp in the filename:
    filename = os.path.basename(file_expt.path)
    #Check if the file has a fixed time_frequency or is a climatology:
    if file_expt.time_frequency in ['fx']:
        pointers.session.add(file_expt)
        pointers.session.commit()
        return

    time_stamp = filename.replace('.nc', '').split('_')[-1].split('|')[0]
    if not file_expt.experiment in pointers.header['experiment_list']:
        return
    years_requested = [int(year) for year in
                       pointers.header['experiment_list'][file_expt.experiment].split(',')]
    years_list_requested = list(range(*years_requested))
    years_list_requested.append(years_requested[1])
    #Flag to check if the time axis is requested as relative:
    picontrol_min_time = (years_list_requested[0] <= 10)

    #Recover the date range from the filename:
    years_range = [int(date[:4]) for date in time_stamp.split('-')]
    #Check for yearly data:
    if len(time_stamp.split('-')[0]) > 4:
        months_range = [int(date[4:6]) for date in time_stamp.split('-')]
    else:
        months_range = [1, 12]
    years_list = list(range(*years_range))
    years_list.append(years_range[1])
    if not picontrol_min_time:
        years_list = [year for year in years_list
                      if year in years_list_requested]

    #Record in the database:
    for year_id, year in enumerate(years_list):
        if year_id == 0:
            #Check availability / queryability on the first year only:
            if file_expt.file_type in ['local_file']:
                file_available = True
                file_queryable = True
            if not file_available:
                file_available = retrieval_utils.check_file_availability(
                                     file_expt.path.split('|')[0])
            if file_available and not file_queryable:
                remote_data = remote_netcdf.remote_netCDF(
                                  file_expt.path.split('|')[0].replace('fileServer', 'dodsC'),
                                  file_expt.file_type, semaphores)
                file_queryable = remote_data.is_available()
        for month in range(1, 13):
            if (not ((year == years_range[0] and month < months_range[0]) or
                     (year == years_range[1] and month > months_range[1]))):
                attribute_time(pointers, file_expt, file_available,
                               file_queryable, year, month)
    return
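#A minimal sketch (hypothetical helper) of the CMIP-style time stamp parsing
#used above: a filename such as 'tas_Amon_MODEL_exp_r1i1p1_185001-200512.nc'
#ends in a start-end stamp carrying YYYY[MM] dates.
def _sketch_parse_time_stamp(filename):
    import os
    time_stamp = os.path.basename(filename).replace('.nc', '').split('_')[-1]
    years_range = [int(date[:4]) for date in time_stamp.split('-')]
    if len(time_stamp.split('-')[0]) > 4:
        months_range = [int(date[4:6]) for date in time_stamp.split('-')]
    else:
        #Yearly stamps carry no month information; assume full years:
        months_range = [1, 12]
    #For the example above: ([1850, 2005], [1, 12])
    return years_range, months_range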