Example 1
def populate_database_recursive(nc_Database,data,options,find_function,semaphores=None):
    if 'soft_links' in data.groups.keys():
        soft_links=data.groups['soft_links']
        paths=soft_links.variables['path'][:]
        for path_id, path in enumerate(paths):
            #id_list=['file_type','search']
            id_list=['file_type']
            for id in id_list:
                setattr(nc_Database.file_expt,id,soft_links.variables[id][path_id])

            #Check if data_node was included:
            data_node=retrieval_utils.get_data_node(soft_links.variables['path'][path_id],
                                                    soft_links.variables['file_type'][path_id])

            if is_level_name_included_and_not_excluded('data_node',options,data_node):
                setattr(nc_Database.file_expt,'path','|'.join([soft_links.variables['path'][path_id],
                                                       soft_links.variables['checksum'][path_id]]))
                setattr(nc_Database.file_expt,'version','v'+str(soft_links.variables['version'][path_id]))
                setattr(nc_Database.file_expt,'data_node',data_node)
                find_function(nc_Database,copy.deepcopy(nc_Database.file_expt),semaphores=semaphores)
    elif len(data.groups.keys())>0:
        for group in data.groups.keys():
            level_name=data.groups[group].getncattr('level_name')
            if is_level_name_included_and_not_excluded(level_name,options,group):
                setattr(nc_Database.file_expt,data.groups[group].getncattr('level_name'),group)
                populate_database_recursive(nc_Database,data.groups[group],options,find_function,semaphores=semaphores)
    elif 'path' in data.ncattrs():
        #for fx variables:
        #id_list=['file_type','search']
        id_list=['file_type']
        for id in id_list:
            setattr(nc_Database.file_expt,id,data.getncattr(id))

        #Check if data_node was included:
        data_node=retrieval_utils.get_data_node(data.getncattr('path'),
                                                data.getncattr('file_type'))
        if is_level_name_included_and_not_excluded('data_node',options,data_node):
            checksum=''
            if 'checksum' in data.ncattrs():
                checksum=data.getncattr('checksum')
            setattr(nc_Database.file_expt,'path','|'.join([data.getncattr('path'),
                                                   checksum]))
            setattr(nc_Database.file_expt,'version',str(data.getncattr('version')))

            setattr(nc_Database.file_expt,'data_node',
                        retrieval_utils.get_data_node(nc_Database.file_expt.path,
                                                      nc_Database.file_expt.file_type))
            find_function(nc_Database,copy.deepcopy(nc_Database.file_expt))
    else:
        #for retrieved datasets:
        #id_list=['file_type','search','path','version']
        id_list=['file_type','path','version']
        for id in id_list:
            setattr(nc_Database.file_expt,id,'')
        if len(data.variables.keys())>0:
            setattr(nc_Database.file_expt,'data_node',
                        retrieval_utils.get_data_node(nc_Database.file_expt.path,
                                                      nc_Database.file_expt.file_type))
            find_function(nc_Database,copy.deepcopy(nc_Database.file_expt))
    return
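The recursion above dispatches on the structure of the netCDF4 file: a soft_links group, further nested groups, a node carrying a path attribute, or a retrieved dataset. A minimal, self-contained sketch of the same group-descent pattern (the file name tree.nc is hypothetical):

import netCDF4

def walk_groups(grp, depth=0):
    #Visit every nested group, mirroring the descent in populate_database_recursive:
    for name, subgrp in grp.groups.items():
        print('  ' * depth + name)
        walk_groups(subgrp, depth + 1)

with netCDF4.Dataset('tree.nc') as data:  #hypothetical file
    walk_groups(data)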
Example 2
def record_url(remote_file_desc, nc_Database):
    nc_Database.file_expt.path = remote_file_desc['url']
    nc_Database.file_expt.data_node = retrieval_utils.get_data_node(
        remote_file_desc['url'], remote_file_desc['file_type'])
    if (remote_file_desc['file_type'] in nc_Database.drs.remote_file_types
            and remote_file_desc['checksum']):
        nc_Database.file_expt.path += '|' + remote_file_desc['checksum']
    else:
        nc_Database.file_expt.path += '|'

    for val in nc_Database.drs.remote_fields:
        setattr(nc_Database.file_expt, val, remote_file_desc[val])

    #Convert unicode to string:
    for val in dir(nc_Database.file_expt):
        if val[0] != '_' and val != 'case_id':
            setattr(nc_Database.file_expt, val,
                    str(getattr(nc_Database.file_expt, val)))

    list_of_knowns = [
        getattr(nc_Database.file_expt, field)
        for field in nc_Database.drs.known_fields
    ]
    list_of_retrieved = [
        remote_file_desc[field] for field in nc_Database.drs.known_fields
    ]
    if remote_file_desc['version']:
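        #Note: version[1:] == 'atest' exactly when the version string is a
        #placeholder like 'latest'; such unresolved versions are skipped.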
        if (remote_file_desc['version'][1:] != 'atest'
                and len([i for i, j in zip(list_of_knowns, list_of_retrieved)
                         if i == j]) == len(list_of_knowns)):
            nc_Database.session.add(copy.deepcopy(nc_Database.file_expt))
            nc_Database.session.commit()
    return nc_Database
Example 3
    def order_paths_by_preference(self):
        #FIND ORDERING:
        paths_desc=[]
        for id in self.sorts_list:
            paths_desc.append((id,np.int32))
        for id in self.id_list:
            paths_desc.append((id,'a255'))
        paths_ordering=np.empty((len(self.paths_list),), dtype=paths_desc)
        for file_id, file in enumerate(self.paths_list):
            paths_ordering['path'][file_id]=file['path'].split('|')[0]
            #Convert path name to 'unique' integer using hash.
            #The integer will not really be unique but collisions
            #should be extremely rare for similar strings with only small variations.
            paths_ordering['path_id'][file_id]=hash(paths_ordering['path'][file_id])
            paths_ordering['checksum'][file_id]=file['path'].split('|')[1]
            #int() is portable here; np.long exists only under Python 2:
            paths_ordering['version'][file_id]=int(file['version'][1:])

            paths_ordering['file_type'][file_id]=file['file_type']
            paths_ordering['data_node'][file_id]=retrieval_utils.get_data_node(file['path'],paths_ordering['file_type'][file_id])

        #Sort paths from most desired to least desired:
        #First order desiredness for least to most:
        data_node_order=copy.copy(self.data_node_list)[::-1]#list(np.unique(paths_ordering['data_node']))
        file_type_order=copy.copy(self.file_type_list)[::-1]#list(np.unique(paths_ordering['file_type']))
        for file_id, file in enumerate(self.paths_list):
            paths_ordering['data_node_id'][file_id]=data_node_order.index(paths_ordering['data_node'][file_id])
            paths_ordering['file_type_id'][file_id]=file_type_order.index(paths_ordering['file_type'][file_id])
        #'version' is implicitly from least to most

        #sort and reverse order to get from most to least:
        return np.sort(paths_ordering,order=self.sorts_list)[::-1]
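order_paths_by_preference packs several ranking keys into one NumPy structured array so that a single np.sort(..., order=...) call ranks the paths; reversing the result turns the ascending sort into most-to-least desired. A minimal sketch of that multi-key sort, with hypothetical field names and values:

import numpy as np

#Hypothetical dtype and rows; higher values mean more desirable here:
paths_desc = [('version', np.int32), ('data_node_id', np.int32), ('path', 'U255')]
paths_ordering = np.array([(20110601, 0, 'a.nc'),
                           (20120101, 1, 'b.nc'),
                           (20120101, 0, 'c.nc')], dtype=paths_desc)

#Sort ascending on both keys, then reverse to get most-to-least desired:
ranked = np.sort(paths_ordering, order=['version', 'data_node_id'])[::-1]
print(ranked['path'])  #-> ['b.nc' 'c.nc' 'a.nc']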
Example 4
    def __init__(self, netcdf_file_name, file_type, semaphores):
        self.file_name = netcdf_file_name
        self.semaphores = semaphores
        self.file_type = file_type
        self.remote_data_node = retrieval_utils.get_data_node(self.file_name,
                                                              self.file_type)
        if isinstance(semaphores, dict):
            #Record whether a semaphore was provided for this data node:
            self.in_semaphores = (self.remote_data_node in self.semaphores.keys())
        else:
            self.in_semaphores = False
        self.Dataset = None
        return
Example 5
    def __init__(self, search_path, options):
        self.file_type = "FTPServer"
        self.options = options
        self.search_path = search_path.rstrip("/")
        self.data_node = retrieval_utils.get_data_node(self.search_path, self.file_type)
        # data_node looks like 'ftp://hostname', so split("/")[2] is the hostname.
        if (self.options.username is not None
                and hasattr(self.options, "password")
                and self.options.password is not None):
            # Use credentials:
            self.ftp = ftplib.FTP(self.data_node.split("/")[2],
                                  self.options.username, self.options.password)
        else:
            # Do not use credentials and hope for anonymous access:
            self.ftp = ftplib.FTP(self.data_node.split("/")[2])
        return
Example 6
def define_queues(options, data_node_list):
    #from multiprocessing import Manager
    #manager=Manager()
    queues = {
        data_node: multiprocessing.Queue()
        for data_node in data_node_list
    }
    #sem=manager.Semaphore()
    #semaphores={data_node : manager.Semaphore() for data_node in data_node_list}
    #semaphores={data_node : sem for data_node in data_node_list}
    queues['end'] = multiprocessing.Queue()
    if hasattr(options, 'source_dir') and options.source_dir is not None:
        queues[retrieval_utils.get_data_node(
            options.source_dir, 'local_file')] = multiprocessing.Queue()
    return queues
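define_queues returns one multiprocessing.Queue per data node, plus an 'end' queue. A hypothetical consumer, matching the (retrieval_function, args...) tuples that the retrieval methods in the later examples put on these queues:

import multiprocessing

def worker(queue):
    #Process (function, arg1, arg2, ...) tuples until a None sentinel arrives:
    while True:
        item = queue.get()
        if item is None:
            break
        retrieval_function = item[0]
        retrieval_function(*item[1:])

if __name__ == '__main__':
    queues = {node: multiprocessing.Queue()
              for node in ['http://node1', 'http://node2']}  #hypothetical nodes
    workers = [multiprocessing.Process(target=worker, args=(q,))
               for q in queues.values()]
    for w in workers:
        w.start()
    for q in queues.values():
        q.put(None)  #signal shutdown
    for w in workers:
        w.join()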
Example 7
    def descend_tree(self,database,list_level=None):
        only_list=[]
        if self.file_type in database.header['file_type_list']:
            description={'file_type':self.file_type,
                         'data_node':retrieval_utils.get_data_node(self.search_path,self.file_type),
                         'time':'0'}
            file_expt_copy=copy.deepcopy(database.nc_Database.file_expt)
            for att in description.keys():
                setattr(file_expt_copy,att,description[att])

            only_list.append(descend_tree_recursive(database,file_expt_copy,
                                    [item for item in database.drs.base_drs if item not in description.keys()],
                                    self.search_path,
                                    self.options,list_level=list_level))

            if 'alt_base_drs' in dir(database.drs):
                only_list.append(descend_tree_recursive(database,file_expt_copy,
                                        [item for item in database.drs.alt_base_drs if item not in description.keys()],
                                        self.search_path,
                                        self.options,list_level=list_level,alt=True))
        return [item for sublist in only_list for item in sublist]
Example 8
    def retrieve_without_time(self,
                              retrieval_function,
                              output,
                              semaphores=None,
                              username=None,
                              user_pass=None):
        #This function simply retrieves all the files:
        file_path = output
        for path_to_retrieve in self.paths_list:
            #Look up this path's position once instead of three times:
            path_index = list(self.paths_list).index(path_to_retrieve)
            file_type = self.file_type_list[path_index]
            version = 'v' + str(self.version_list[path_index])
            checksum = self.checksums_list[path_index]
            #Get the file tree:
            args = ({
                'path': path_to_retrieve + '|' + checksum,
                'var': self.tree[-1],
                'file_path': file_path,
                'version': version,
                'file_type': file_type,
                'username': username,
                'user_pass': user_pass
            }, copy.deepcopy(self.tree))
            #'sort_table':np.argsort(sorting_paths)[sorted_paths_link==path_id][time_slice],

            #Retrieve only if it is from the requested data node:
            data_node = retrieval_utils.get_data_node(path_to_retrieve,
                                                      file_type)
            if nc_Database.is_level_name_included_and_not_excluded(
                    'data_node', self, data_node):
                if data_node in self.queues.keys():
                    #print 'Recovering '+var_to_retrieve+' in '+path_to_retrieve
                    print('Recovering ' + '/'.join(self.tree))
                    self.queues[data_node].put((retrieval_function, ) +
                                               copy.deepcopy(args))
        return
Example 9
    def retrieve_variables(self,
                           retrieval_function,
                           var_to_retrieve,
                           time_restriction,
                           output,
                           semaphores=None,
                           username=None,
                           user_pass=None):
        #Replicate variable to output:
        if isinstance(output, (netCDF4.Dataset, netCDF4.Group)):
            output = netcdf_utils.replicate_netcdf_var(output,
                                                       self.data_root,
                                                       var_to_retrieve,
                                                       chunksize=-1,
                                                       zlib=True)
            #file_path=output.filepath()
            file_path = None
            if 'soft_links' not in self.data_root.groups.keys():
                #The variable is stored here; simply retrieve it:
                output.variables[var_to_retrieve][:] = (
                    self.data_root.variables[var_to_retrieve][time_restriction])
                return
        else:
            file_path = output

        dimensions = dict()
        unsort_dimensions = dict()
        dims_length = []
        for dim in self.data_root.variables[var_to_retrieve].dimensions:
            if dim != 'time':
                if dim in self.data_root.variables.keys():
                    dimensions[dim] = self.data_root.variables[dim][:]
                else:
                    dimensions[dim] = np.arange(
                        len(self.data_root.dimensions[dim]))
                unsort_dimensions[dim] = None
                dims_length.append(len(dimensions[dim]))

        # Determine the paths_ids for soft links:
        paths_link = self.data_root.groups['soft_links'].variables[
            var_to_retrieve][time_restriction, 0]
        indices_link = self.data_root.groups['soft_links'].variables[
            var_to_retrieve][time_restriction, 1]

        #Convert paths_link to id in path dimension:
        paths_link = np.array([
            list(self.paths_id_list).index(path_id) for path_id in paths_link
        ])

        #Sort the paths so that we query each only once:
        unique_paths_list_id, sorting_paths = np.unique(paths_link,
                                                        return_inverse=True)

        #Maximum number of time step per request:
        max_request = 450  #maximum request in Mb
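        #The factor of 32 in the next line is an assumed per-element size in
        #bytes, so one time step is estimated at 32*prod(dims_length) bytes.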
        max_time_steps = max(
            int(
                np.floor(max_request * 1024 * 1024 /
                         (32 * np.prod(dims_length)))), 1)
        for unique_path_id, path_id in enumerate(unique_paths_list_id):
            path_to_retrieve = self.paths_list[path_id]

            #Next, we check if the file is available. If it is not we replace it
            #with another file with the same checksum, if there is one!
            file_type = self.file_type_list[list(
                self.paths_list).index(path_to_retrieve)]
            remote_data = remote_netcdf.remote_netCDF(
                path_to_retrieve.replace('fileServer', 'dodsC'), file_type,
                semaphores)
            if file_type not in ['FTPServer']:
                path_to_retrieve = remote_data.check_if_available_and_find_alternative(
                    [
                        path.replace('fileServer', 'dodsC')
                        for path in self.paths_list
                    ], self.checksums_list).replace('dodsC', 'fileServer')
            #Get the file_type, checksum and version of the file to retrieve:
            file_type = self.file_type_list[list(
                self.paths_list).index(path_to_retrieve)]
            version = 'v' + str(self.version_list[list(
                self.paths_list).index(path_to_retrieve)])
            checksum = self.checksums_list[list(
                self.paths_list).index(path_to_retrieve)]

            #Append the checksum:
            path_to_retrieve += '|' + checksum

            #time_indices=sorted_indices_link[sorted_paths_link==path_id]
            time_indices = indices_link[sorting_paths == unique_path_id]
            num_time_chunk = int(
                np.ceil(len(time_indices) / float(max_time_steps)))
            for time_chunk in range(num_time_chunk):
                time_slice = slice(time_chunk * max_time_steps,
                                   (time_chunk + 1) * max_time_steps, 1)
                dimensions['time'], unsort_dimensions['time'] = (
                    indices_utils.prepare_indices(time_indices[time_slice]))

                #Get the file tree:
                args = ({
                    'path': path_to_retrieve,
                    'var': var_to_retrieve,
                    'indices': dimensions,
                    'unsort_indices': unsort_dimensions,
                    'sort_table': np.arange(len(sorting_paths))[
                        sorting_paths == unique_path_id][time_slice],
                    'file_path': file_path,
                    'version': version,
                    'file_type': file_type,
                    'username': username,
                    'user_pass': user_pass
                }, copy.deepcopy(self.tree))
                #'sort_table':np.argsort(sorting_paths)[sorted_paths_link==path_id][time_slice],

                #Retrieve only if it is from the requested data node:
                data_node = retrieval_utils.get_data_node(
                    path_to_retrieve, file_type)
                if nc_Database.is_level_name_included_and_not_excluded(
                        'data_node', self, data_node):
                    if data_node in self.queues.keys():
                        if (isinstance(output,
                                       (netCDF4.Dataset, netCDF4.Group))
                                or time_chunk == 0):
                            #If it is download: retrieve.
                            #If it is download_raw: retrieve only the first time_chunk.
                            if var_to_retrieve == self.tree[-1]:
                                #print 'Recovering '+var_to_retrieve+' in '+path_to_retrieve
                                print('Recovering ' + '/'.join(self.tree))
                            self.queues[data_node].put((retrieval_function, ) +
                                                       copy.deepcopy(args))
                    else:
                        if isinstance(output, (netCDF4.Dataset, netCDF4.Group)):
                            #netcdf_utils.assign_tree(output,*getattr(netcdf_utils,retrieval_function)(args[0],args[1]))
                            netcdf_utils.assign_tree(
                                output,
                                *getattr(retrieval_utils,
                                         retrieval_function)(args[0], args[1]))
        return
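For reference, the chunking formula near the top of retrieve_variables caps each request at max_request megabytes. A worked example, assuming a hypothetical 144 x 96 horizontal grid:

import numpy as np

max_request = 450  #Mb, as in the source
dims_length = [144, 96]  #hypothetical non-time dimension lengths
max_time_steps = max(
    int(np.floor(max_request * 1024 * 1024 / (32 * np.prod(dims_length)))), 1)
print(max_time_steps)  #-> 1066 time steps per request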