Example #1
    def create_variable(self,output,var,years,months):
        #Recover time axis for all files:
        self.obtain_table()

        queryable_file_types_available=list(set(self.table['file_type']).intersection(queryable_file_types))
        if len(self.table['paths'])>0:
            if len(queryable_file_types_available)>0:
                #Open the first file and use its metadata to populate container file:
                first_id=list(self.table['file_type']).index(queryable_file_types_available[0])
                remote_data=remote_netcdf.remote_netCDF(self.table['paths'][first_id],self.table['file_type'][first_id],self.semaphores)
                remote_data.open_with_error()
                netcdf_utils.replicate_netcdf_file(output,remote_data.Dataset)
            else:
                remote_data=remote_netcdf.remote_netCDF(self.table['paths'][0],self.table['file_type'][0],self.semaphores)

            #Convert time axis to numbers and find the unique time axis:
            self.unique_time_axis(remote_data.Dataset,years,months)

            self.reduce_paths_ordering()
            #Create time axis in output:
            netcdf_utils.create_time_axis_date(output,remote_data.Dataset,self.time_axis_unique_date)

            self.create(output)
            self.record_indices(output,remote_data.Dataset,var)
            remote_data.close()

            output.sync()
        return
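The key step in this method is the set intersection that keeps only remotely queryable file types, then seeds the output metadata from the first matching file. A minimal, self-contained sketch of that selection logic, assuming a hypothetical queryable_file_types list and table contents:

queryable_file_types = ['OPENDAP', 'local_file']
table = {'paths': ['f1.nc', 'f2.nc', 'f3.nc'],
         'file_type': ['HTTPServer', 'OPENDAP', 'OPENDAP']}

queryable_file_types_available = list(
    set(table['file_type']).intersection(queryable_file_types))
if len(queryable_file_types_available) > 0:
    #Index of the first file whose type is queryable:
    first_id = list(table['file_type']).index(queryable_file_types_available[0])
    print(table['paths'][first_id])  # prints 'f2.nc'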
Example #2
    def record_fx(self,output,username=None,user_pass=None):
        #Create soft links
        self.create(output)
        output.groups['soft_links'].createVariable(self.var,np.float32,(),zlib=True)

        #Find the most recent version:
        most_recent_version='v'+str(np.max([int(item['version'][1:]) for item in self.paths_list]))
        usable_paths_list=[ item for item in self.paths_list if item['version']==most_recent_version]

        queryable_paths_list=[item for item in usable_paths_list if item['file_type'] in queryable_file_types]
        if len(queryable_paths_list)==0:
            temp_file_handle, temp_file_name=tempfile.mkstemp()
            #Close the low-level handle; only the temporary file name is needed:
            os.close(temp_file_handle)

        try:
            if len(queryable_paths_list)==0:
                path=usable_paths_list[0]
                #Download the file to a temporary location:
                retrieval_utils.download_secure(path['path'].split('|')[0],
                                temp_file_name,
                                path['file_type'],
                                username=username,user_pass=user_pass)
                remote_data=remote_netcdf.remote_netCDF(temp_file_name,path['file_type'],self.semaphores)
            else:
                #Check if the data is available:
                path = queryable_paths_list[0]

                remote_data=remote_netcdf.remote_netCDF(path['path'].split('|')[0],path['file_type'],self.semaphores)
                alt_path_name=remote_data.check_if_available_and_find_alternative([item['path'].split('|')[0] for item in queryable_paths_list],
                                                                         [item['path'].split('|')[1] for item in queryable_paths_list])

                #Use the alternative path:
                path=queryable_paths_list[[item['path'].split('|')[0] for item in queryable_paths_list].index(alt_path_name)]
                remote_data=remote_netcdf.remote_netCDF(path['path'].split('|')[0],path['file_type'],self.semaphores)
            remote_data.retrieve_variables(output,zlib=True)

            for att in path.keys():
                if att!='path':
                    output.setncattr(att,path[att])
            output.setncattr('path',path['path'].split('|')[0])
            output.setncattr('checksum',path['path'].split('|')[1])
            output.sync()
        finally:
            if len(queryable_paths_list)==0:
                os.remove(temp_file_name)
        return
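The version bookkeeping above strips the leading 'v' from each version string, compares the remainders numerically, and keeps only files from the most recent version. A standalone sketch with hypothetical paths_list entries:

import numpy as np

paths_list = [{'version': 'v20110101', 'path': 'a.nc|abc'},
              {'version': 'v20120515', 'path': 'b.nc|def'}]

most_recent_version = 'v' + str(np.max([int(item['version'][1:])
                                        for item in paths_list]))
usable_paths_list = [item for item in paths_list
                     if item['version'] == most_recent_version]
print(most_recent_version)  # prints 'v20120515'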
Example #3
    def _recover_time(self,path):
        file_type=path['file_type']
        checksum=path['checksum']
        path_name=str(path['path']).split('|')[0]
        remote_data=remote_netcdf.remote_netCDF(path_name,file_type,self.semaphores)
        time_axis=remote_data.get_time(time_frequency=self.time_frequency,
                                       is_instant=self.is_instant)
        table_desc=[
                   ('paths','a255'),
                   ('file_type','a255'),
                   ('checksum','a255'),
                   ('indices','int32')
                   ]
        table=np.empty(time_axis.shape, dtype=table_desc)
        if len(time_axis)>0:
            table['paths']=np.array([str(path_name) for item in time_axis])
            table['file_type']=np.array([str(file_type) for item in time_axis])
            table['checksum']=np.array([str(checksum) for item in time_axis])
            table['indices']=range(0,len(time_axis))
        return time_axis,table
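The table built here is a NumPy structured array, so the per-file tables can later be concatenated and sorted alongside the merged time axis. A short sketch of the same construction, with made-up values:

import numpy as np

table_desc = [('paths', 'a255'), ('file_type', 'a255'),
              ('checksum', 'a255'), ('indices', 'int32')]
time_axis = np.array([0.0, 30.5, 59.0])

table = np.empty(time_axis.shape, dtype=table_desc)
table['paths'] = 'somefile.nc'  #scalar assignments broadcast to every row
table['file_type'] = 'OPENDAP'
table['checksum'] = 'deadbeef'
table['indices'] = range(0, len(time_axis))

#Tables from several files can then be stitched together:
full_table = np.concatenate([table, table])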
Example #4
def retrieve_path_data(in_dict,pointer_var):

    path=in_dict['path'].replace('fileServer','dodsC').split('|')[0]
    var=in_dict['var']
    indices=copy.copy(in_dict['indices'])
    unsort_indices=copy.copy(in_dict['unsort_indices'])
    sort_table=in_dict['sort_table']

    remote_data=remote_netcdf.remote_netCDF(path,'HTTPServer',[])
    remote_data.open_with_error()
    dimensions=remote_data.retrieve_dimension_list(var)
    for dim in dimensions:
        if dim != 'time':
            remote_dim, attributes=remote_data.retrieve_dimension(dim)
            indices[dim], unsort_indices[dim] = indices_utils.prepare_indices(
                                                            indices_utils.get_indices_from_dim(remote_dim,indices[dim]))
    
    retrieved_data=remote_data.grab_indices(var,indices,unsort_indices)
    remote_data.close()
    return (retrieved_data, sort_table,pointer_var+[var])
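The replace('fileServer','dodsC') call converts a THREDDS HTTP download URL into the matching OPeNDAP endpoint, which is what makes index-based subsetting possible, while split('|')[0] strips the checksum appended to the path. For example, with a hypothetical URL:

path_with_checksum = 'http://esgf.example.org/thredds/fileServer/cmip5/tas.nc|abc123'
path = path_with_checksum.replace('fileServer', 'dodsC').split('|')[0]
#path is now 'http://esgf.example.org/thredds/dodsC/cmip5/tas.nc'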
Example #5
    def retrieve_variables(self,
                           retrieval_function,
                           var_to_retrieve,
                           time_restriction,
                           output,
                           semaphores=None,
                           username=None,
                           user_pass=None):
        #Replicate variable to output:
        if (isinstance(output, netCDF4.Dataset)
                or isinstance(output, netCDF4.Group)):
            output = netcdf_utils.replicate_netcdf_var(output,
                                                       self.data_root,
                                                       var_to_retrieve,
                                                       chunksize=-1,
                                                       zlib=True)
            file_path = None
            if 'soft_links' not in self.data_root.groups.keys():
                #Variable is stored here and simply retrieve it:
                output.variables[
                    var_to_retrieve][:] = self.data_root.variables[
                        var_to_retrieve][time_restriction]
                return
        else:
            file_path = output

        dimensions = dict()
        unsort_dimensions = dict()
        dims_length = []
        for dim in self.data_root.variables[var_to_retrieve].dimensions:
            if dim != 'time':
                if dim in self.data_root.variables.keys():
                    dimensions[dim] = self.data_root.variables[dim][:]
                else:
                    dimensions[dim] = np.arange(
                        len(self.data_root.dimensions[dim]))
                unsort_dimensions[dim] = None
                dims_length.append(len(dimensions[dim]))

        # Determine the paths_ids for soft links:
        paths_link = self.data_root.groups['soft_links'].variables[
            var_to_retrieve][time_restriction, 0]
        indices_link = self.data_root.groups['soft_links'].variables[
            var_to_retrieve][time_restriction, 1]

        #Convert paths_link to id in path dimension:
        paths_link = np.array([
            list(self.paths_id_list).index(path_id) for path_id in paths_link
        ])

        #Sort the paths so that we query each only once:
        unique_paths_list_id, sorting_paths = np.unique(paths_link,
                                                        return_inverse=True)

        #Maximum number of time steps per request:
        max_request = 450  #maximum request size in Mb
        max_time_steps = max(
            int(
                np.floor(max_request * 1024 * 1024 /
                         (32 * np.prod(dims_length)))), 1)
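        #Worked example, assuming the factor of 32 counts bits per float32
        #value: for a 100x100 spatial field one time step costs 32*100*100
        #bits, so 450*1024*1024/(32*10000) allows roughly 1474 time steps
        #per request.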
        for unique_path_id, path_id in enumerate(unique_paths_list_id):
            path_to_retrieve = self.paths_list[path_id]

            #Next, we check if the file is available. If it is not we replace it
            #with another file with the same checksum, if there is one!
            file_type = self.file_type_list[list(
                self.paths_list).index(path_to_retrieve)]
            remote_data = remote_netcdf.remote_netCDF(
                path_to_retrieve.replace('fileServer', 'dodsC'), file_type,
                semaphores)
            if file_type not in ['FTPServer']:
                path_to_retrieve = remote_data.check_if_available_and_find_alternative(
                    [
                        path.replace('fileServer', 'dodsC')
                        for path in self.paths_list
                    ], self.checksums_list).replace('dodsC', 'fileServer')
            #Get the file_type, checksum and version of the file to retrieve:
            file_type = self.file_type_list[list(
                self.paths_list).index(path_to_retrieve)]
            version = 'v' + str(self.version_list[list(
                self.paths_list).index(path_to_retrieve)])
            checksum = self.checksums_list[list(
                self.paths_list).index(path_to_retrieve)]

            #Append the checksum:
            path_to_retrieve += '|' + checksum

            time_indices = indices_link[sorting_paths == unique_path_id]
            num_time_chunk = int(
                np.ceil(len(time_indices) / float(max_time_steps)))
            for time_chunk in range(num_time_chunk):
                time_slice = slice(time_chunk * max_time_steps,
                                   (time_chunk + 1) * max_time_steps, 1)
                dimensions['time'], unsort_dimensions[
                    'time'] = indices_utils.prepare_indices(
                        time_indices[time_slice])

                #Get the file tree:
                args = ({'path': path_to_retrieve,
                         'var': var_to_retrieve,
                         'indices': dimensions,
                         'unsort_indices': unsort_dimensions,
                         'sort_table': np.arange(len(sorting_paths))[
                             sorting_paths == unique_path_id][time_slice],
                         'file_path': file_path,
                         'version': version,
                         'file_type': file_type,
                         'username': username,
                         'user_pass': user_pass},
                        copy.deepcopy(self.tree))

                #Retrieve only if it is from the requested data node:
                data_node = retrieval_utils.get_data_node(
                    path_to_retrieve, file_type)
                if nc_Database.is_level_name_included_and_not_excluded(
                        'data_node', self, data_node):
                    if data_node in self.queues.keys():
                        if ((isinstance(output, netCDF4.Dataset)
                             or isinstance(output, netCDF4.Group))
                                or time_chunk == 0):
                            #If it is download: retrieve
                            #If it is download_raw: retrieve only first time_chunk
                            if var_to_retrieve == self.tree[-1]:
                                print('Recovering ' + '/'.join(self.tree))
                            self.queues[data_node].put((retrieval_function, ) +
                                                       copy.deepcopy(args))
                    else:
                        if (isinstance(output, netCDF4.Dataset)
                                or isinstance(output, netCDF4.Group)):
                            netcdf_utils.assign_tree(
                                output,
                                *getattr(retrieval_utils,
                                         retrieval_function)(args[0], args[1]))
        return
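The grouping trick above is worth isolating: np.unique(..., return_inverse=True) returns the distinct path ids together with, for each time step, the position of its path in that distinct list, so every file is opened only once. A standalone illustration with hypothetical ids:

import numpy as np

#One path id and one within-file index per requested time step:
paths_link = np.array([7, 3, 7, 7, 3])
indices_link = np.array([0, 0, 1, 2, 1])

unique_paths_list_id, sorting_paths = np.unique(paths_link,
                                                return_inverse=True)
#unique_paths_list_id -> array([3, 7])
#sorting_paths        -> array([1, 0, 1, 1, 0])

for unique_path_id, path_id in enumerate(unique_paths_list_id):
    #All time indices that live in this one file:
    time_indices = indices_link[sorting_paths == unique_path_id]
    print('%s: %s' % (path_id, time_indices))
#prints: 3: [0 1]
#        7: [0 1 2]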
Example #6
def find_time_file(pointers,
                   file_expt,
                   file_available=False,
                   file_queryable=False,
                   semaphores=None):
    #If the path is a remote file, we must rely on the time stamp in its filename:
    filename = os.path.basename(file_expt.path)

    #Check if the file has a fixed ('fx') time_frequency:
    if file_expt.time_frequency in ['fx']:
        pointers.session.add(file_expt)
        pointers.session.commit()
        return
    else:
        time_stamp = filename.replace('.nc', '').split('_')[-1].split('|')[0]

    if file_expt.experiment not in pointers.header['experiment_list']:
        return

    years_requested = [
        int(year) for year in pointers.header['experiment_list'][
            file_expt.experiment].split(',')
    ]
    years_list_requested = list(range(*years_requested))
    #range() excludes the end point; include the final requested year:
    years_list_requested.append(years_requested[1])

    #Flag to check if the time axis is requested as relative:
    picontrol_min_time = (years_list_requested[0] <= 10)

    #Recover date range from filename:
    years_range = [int(date[:4]) for date in time_stamp.split('-')]
    #Time stamps longer than YYYY carry month information:
    if len(time_stamp.split('-')[0]) > 4:
        months_range = [int(date[4:6]) for date in time_stamp.split('-')]
    else:
        months_range = range(1, 13)
    years_list = list(range(*years_range))
    years_list.append(years_range[1])

    if not picontrol_min_time:
        years_list = [
            year for year in years_list if year in years_list_requested
        ]

    #Record in the database:
    for year_id, year in enumerate(years_list):
        if year_id == 0:
            #Check availability / queryability:
            if file_expt.file_type in ['local_file']:
                file_available = True
                file_queryable = True

            if not file_available:
                file_available = retrieval_utils.check_file_availability(
                    file_expt.path.split('|')[0])

            if file_available and not file_queryable:
                remote_data = remote_netcdf.remote_netCDF(
                    file_expt.path.split('|')[0].replace(
                        'fileServer', 'dodsC'),
                    file_expt.file_type, semaphores)
                file_queryable = remote_data.is_available()

        for month in range(1, 13):
            if (not ((year == years_range[0] and month < months_range[0]) or
                     (year == years_range[1] and month > months_range[1]))):
                attribute_time(pointers, file_expt, file_available,
                               file_queryable, year, month)
    return
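The date logic above hinges on the CMIP-style time stamp at the end of the filename: YYYYMM-YYYYMM for monthly data, YYYY-YYYY for yearly data. A standalone sketch of that parsing, using a hypothetical filename:

import os

filename = os.path.basename('tas_Amon_model_rcp85_r1i1p1_200601-210012.nc')
time_stamp = filename.replace('.nc', '').split('_')[-1].split('|')[0]

years_range = [int(date[:4]) for date in time_stamp.split('-')]
if len(time_stamp.split('-')[0]) > 4:
    #Time stamps longer than YYYY carry month information:
    months_range = [int(date[4:6]) for date in time_stamp.split('-')]
else:
    months_range = range(1, 13)

print(years_range)   #prints [2006, 2100]
print(months_range)  #prints [1, 12]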