Example #1
0
    def test_command(self):
        """Exercise the crawl/ingest command line interface end to end."""
        # no arguments at all must abort
        with self.assertRaises(SystemExit):
            self.cmd.run([])

        # --crawl without --output must abort as well
        with self.assertRaises(SystemExit):
            self.cmd.run(['--crawl=%s/cmip5' % self.tmpdir])

        # crawling with an output target succeeds and records the crawl
        output = '/tmp/crawl_output.txt'
        crawl_args = ['--crawl=%s/cmip5' % self.tmpdir, '--output=%s' % output]
        self.cmd.run(crawl_args)
        self.assertTrue(os.path.isfile(output))
        tar_name = output.split('/')[-1]
        crawl_obj = UserCrawl.objects.get(tar_file=tar_name)
        self.assertEqual(crawl_obj.status, 'crawling')

        # ingesting the crawl output flips the status and indexes 3 files
        self.assertEqual(len(list(SolrFindFiles.search())), 0)
        self.cmd.run(['--ingest=%s' % output])
        crawl_obj = UserCrawl.objects.get(tar_file=tar_name)
        self.assertEqual(crawl_obj.status, 'success')
        self.assertEqual(len(list(SolrFindFiles.search())), 3)

        # ingesting again against an explicitly given solr host and port
        self.cmd.run([
            '--ingest=%s' % output,
            '--solr-url=http://%s:%s' % (self.solr_host, self.solr_port)
        ])
        self.assertEqual(len(list(SolrFindFiles.search(latest_version=False))),
                         5)

        os.remove(output)
Example #2
0
 def test_solr_search(self):
     """Check SolrFindFiles queries: plain, facet, version and OR search."""
     finder = SolrFindFiles()
     # latest-version search yields the three current files
     self.assertEqual(len(list(finder.search())), 3)
     # a facet keyword narrows the result to the single historical file
     historical = finder.search(experiment='historical')
     self.assertEqual(list(historical), [os.path.join(self.tmpdir, self.files[0])])
     # including older versions yields all five files
     self.assertEqual(len(list(finder.search(latest_version=False))), 5)
     # a list-valued facet is an OR query
     or_result = finder.search(variable=['tauu', 'wetso2'])
     expected = set(os.path.join(self.tmpdir, e) for e in self.files[:2])
     self.assertEqual(expected, set(or_result))
Example #3
0
    def search_files(self,
                     decadals=None,
                     project="baseline1",
                     firstyear=None,
                     lastyear=None,
                     product="*",
                     time_frequency='6hr',
                     model="mpi-esm-lr",
                     ensembles=["*"],
                     experiment=None,
                     variable="ta",
                     institute="MPI-M",
                     realm="atmos",
                     driving_model=None,
                     rcm_ensemble=None,
                     domain=None,
                     find_variables=False):
        """
        Use solr_search to find the input files needed by the plugin.

        Populates ``self.inputfiles`` (flat list of all matching files) and
        ``self.inputfilesByDecade`` (mapping decade start year -> files, or
        None when no time selection is active).

        Set find_variables=True to get a list of available variables
        instead of searching files.

        :param decadals: iterable of decadal start years (converted to int)
        :param project: project facet; also used to derive the experiment
            name prefix (baseline0/baseline1/prototype/cmip5/historical)
        :param firstyear: lower bound (inclusive) when filtering by year
        :param lastyear: upper bound (inclusive) when filtering by year
        :param product: product facet; "*" matches anything
        :param time_frequency: time frequency facet, e.g. '6hr' or 'mon'
        :param model: model facet
        :param ensembles: comma-separated ensemble members, or the escaped
            wildcard string for all of them
            (NOTE(review): the declared default is a *list*, but the code
            below calls ``.split(',')`` which only works on a string —
            confirm what callers actually pass)
        :param experiment: experiment facet; when None or "*" a project
            specific prefix is used instead
        :param variable: a variable name or a list of variable names
        :param institute: institute facet
        :param realm: realm facet
        :param driving_model: driving model facet (regional models only)
        :param rcm_ensemble: RCM ensemble facet (regional models only)
        :param domain: domain facet (regional models only)
        :param find_variables: when True, return the available variable
            names for this search instead of populating the file lists
        """
        # convert some arguments

        #         if '*' in ensembles:
        #             Logger.Error('All ensembles not allowed\n'
        #                          'Please select one',-1)

        # NOTE(review): '\*' is the two-character string backslash-star; the
        # list default ["*"] never equals it and would crash on .split() —
        # this branch assumes ensembles arrives as a string
        if ensembles != '\*':
            ensembles = ensembles.split(',')
        else:
            ensembles = [ensembles]
        if decadals is not None:
            # NOTE(review): on Python 3 this is a one-shot iterator; it is
            # only consumed once below, but confirm if this code is ported
            years = map(int, decadals)
        else:
            years = None
        if firstyear is not None:
            firstyear = int(firstyear)
        if lastyear is not None:
            lastyear = int(lastyear)

        # use solr_search to get the input files --------------------------------------------------
        # construct a search string for experiments
        # each known project family uses its own experiment naming prefix
        experiment_prefix = experiment
        if experiment_prefix is None or experiment_prefix == "*":
            if project.lower() == 'baseline0':
                experiment_prefix = 'decadal'
            elif project.lower() == 'baseline1':
                experiment_prefix = 'decs4e'
            elif project.lower() == 'prototype':
                experiment_prefix = 'dffs4e'
            elif project.lower() == 'cmip5':
                experiment_prefix = 'decadal'
            elif project.lower() == 'historical':
                experiment_prefix = 'historical'
            else:
                experiment_prefix = '*'
        # if not experiment_prefix.endswith("*"):
        #    experiment_prefix += "*"

        # compose solr_search arguments
        ssargs = {}
        ssargs["project"] = project
        ssargs["institute"] = institute
        ssargs["realm"] = realm
        if type(variable) == list:
            variables = variable
        else:
            variables = [variable]
        ssargs["time_frequency"] = time_frequency
        # only constrain facets that actually exist for this project:
        # are there products in this project?
        product_facets = SolrFindFiles.facets(facets=["product"], **ssargs)
        if len(product_facets["product"]) > 0:
            ssargs["product"] = product
        # are there models in this project?
        model_facets = SolrFindFiles.facets(facets=["model"], **ssargs)
        if len(model_facets["model"]) > 0:
            ssargs["model"] = model
        # are there multiple experiments?
        experiment_facets = SolrFindFiles.facets(facets=["experiment"],
                                                 **ssargs)
        if len(experiment_facets["experiment"]) > 0:
            ssargs["experiment"] = experiment_prefix
        # additional parameters for regional models
        if rcm_ensemble is not None:
            rcm_ensemble_facets = SolrFindFiles.facets(facets=["rcm_ensemble"],
                                                       **ssargs)
            if len(rcm_ensemble_facets["rcm_ensemble"]) > 0:
                ssargs["rcm_ensemble"] = rcm_ensemble
        if driving_model is not None:
            driving_model_facets = SolrFindFiles.facets(
                facets=["driving_model"], **ssargs)
            if len(driving_model_facets["driving_model"]) > 0:
                ssargs["driving_model"] = driving_model
        if domain is not None:
            domain_facets = SolrFindFiles.facets(facets=["domain"], **ssargs)
            if len(domain_facets["domain"]) > 0:
                ssargs["domain"] = domain

        # search variables instead of files?
        if find_variables:
            variable_facets = SolrFindFiles.facets(facets=["variable"],
                                                   **ssargs)
            return variable_facets["variable"]

        # put all files into a list

        self.inputfiles = []
        if years is not None or firstyear is not None or lastyear is not None:
            self.inputfilesByDecade = {}
            # we have multiple experiments that contain the decade
            if "experiment" in ssargs and project != "observations" and project != "reanalysis" and years is not None:
                for year in years:
                    yearfiles = []
                    # search one experiment per decade, e.g. 'decs4e1960'
                    ssargs["experiment"] = "%s%d" % (experiment_prefix, year)
                    for ens in ensembles:
                        if ens != "*":
                            ssargs["ensemble"] = ens
                        for onefile in solr_search_multivar(variables, ssargs):
                            self.inputfiles.append(onefile)
                            yearfiles.append(onefile)
                    self.inputfilesByDecade[year] = yearfiles

            # we have only one experiment. fetch all files and filter them by year
            elif firstyear is not None or lastyear is not None:
                for ens in ensembles:
                    if ens != "*":
                        ssargs["ensemble"] = ens
                    for onefile in solr_search_multivar(variables, ssargs):
                        starttime, endtime = get_start_and_end_time_from_DRSFile(
                            onefile, include_str=False)
                        # keep a file when its covered year range intersects
                        # the requested [firstyear, lastyear] window (either
                        # bound may be open)
                        if firstyear is None and \
                                any([True for e in range(starttime.year, endtime.year + 1, 1) \
                                     if e <= lastyear]):
                            self.inputfiles.append(onefile)
                        elif lastyear is None and \
                                any([True for e in range(starttime.year, endtime.year + 1, 1) \
                                     if e >= firstyear]):
                            self.inputfiles.append(onefile)
                        elif any([True for e in range(firstyear, lastyear + 1, 1)\
                                  if e in range(starttime.year, endtime.year + 1, 1)]):
                            self.inputfiles.append(onefile)

            else:
                for ens in ensembles:
                    if ens != "*":
                        ssargs["ensemble"] = ens
                    for onefile in solr_search_multivar(variables, ssargs):
                        starttime, endtime = get_start_and_end_time_from_DRSFile(
                            onefile, include_str=False)
                        for year in years:
                            # NOTE(review): 'starttime.year > year' excludes
                            # files starting exactly in the decade start
                            # year — confirm '>' vs '>=' is intended here
                            if starttime.year > year and endtime.year <= year + 10:
                                self.inputfiles.append(onefile)
                                if year not in self.inputfilesByDecade:
                                    self.inputfilesByDecade[year] = [onefile]
                                else:
                                    self.inputfilesByDecade[year].append(
                                        onefile)
                                break
                                # we want all files, not only those for a special decade
        else:
            self.inputfilesByDecade = None
            for ens in ensembles:
                if ens != "*":
                    ssargs["ensemble"] = ens
                for onefile in solr_search_multivar(variables, ssargs):
                    self.inputfiles.append(onefile)

        # nothing found? cancel!
        if len(self.inputfiles) == 0:
            Logger.Error(
                "No input files found!\n"
                "Data-Browser command:\t"
                "freva --databrowser project='%s' product='%s' institute='%s' model='%s' experiment='%s'"
                " time_frequency='%s' realm='%s' variable='%s'" %
                (project, product, institute, model, experiment,
                 time_frequency, realm, variable), -1)

        # changed the time part if only a single lead year is of interest or remove files that do not belong to the
        # requested lead year

        # check for overlapping time-periods within the same folder
        self.inputfiles = self.remove_overlapping_time_periods_from_file_list(
            self.inputfiles)

        # check if all ensemble members have the same number of files
        self.inputfiles = self.check_ensemble_completeness(self.inputfiles)

        # repair some known special cases
        apply_workarounds_for_path(self.inputfiles)

        # merge multiple variables
        merged_by_var = self.merge_multiple_variables(self.inputfiles)
        if len(merged_by_var) == 0:
            Logger.Error(
                "no files found for different variables and same time steps!")
Example #4
0
    def getFiles(self, year, fileType, model, variable, time_frequency='mon', product='*', ensemblemembers='*',
                 institute='*', exp_prefix='d*', maxleadtime=10, minLeadtime=1):
        """
        Method to get model files with solr_search.

        :param year: decadal starting year
        :param fileType: baseline1, cmip5, historical or...
        :param model: model name i.e. MPI-ESM-LR
        :param variable: CMOR variable
        :param time_frequency: monthly, yearly, daily and so on
        :param product: product facet, '*' matches anything
        :param ensemblemembers: list of ensemble member names to keep, or '*'
        :param institute: institute facet, '*' matches anything
        :param exp_prefix: experiment name prefix, e.g. 'd*' or 'historical'
        :param maxleadtime: last lead year (relative to year) to select
        :param minLeadtime: first lead year; currently forced to 1 (see TODO)

        :return: list with all ensemblemembers members found
        """
        # TODO: BUGFIX for minLeadyear
        minLeadtime = 1  # hard override until the minLeadyear bug is fixed
        output = list()
        decStr = exp_prefix+str(year)
        project = fileType.lower()
        tmpList = list()
        # collect all netCDF files for the decadal experiment
        for fn in SolrFindFiles.search(experiment=decStr, latest_version=True, product=product, institute=institute,
                                       variable=variable, time_frequency=time_frequency, model=model, project=project):
            if str(fn).split('.')[-1] == 'nc':
                tmpList.append(str(fn))
        try:
            test = tmpList[0]  # any results at all?
        except:
            # nothing found: wait and retry once — the Solr index may lag
            import time
            time.sleep(5)  # delays for 5 seconds
            for fn in SolrFindFiles.search(experiment=decStr, latest_version=True, product=product, institute=institute,
                                           variable=variable, time_frequency=time_frequency, model=model,
                                           project=project):
                if str(fn).split('.')[-1] == 'nc':
                    tmpList.append(str(fn))
            try:
                test = tmpList[0]
            except:
                if exp_prefix.find('*') != -1:
                    # a wildcard prefix already searched everything -> give up
                    raise NoFilesFoundError,\
                        "Couldn't find files for %s in %s %s %s experiment: %s" % (variable, fileType, model,
                                                                                   product, year)
                # OK we can't find files, now try one last time using only the exp_prefix, i.e. "historical"
                decStr = exp_prefix
                for fn in SolrFindFiles.search(experiment=exp_prefix, latest_version=True, product=product,
                                               institute=institute, variable=variable, time_frequency=time_frequency,
                                               model=model, project=project):
                    if str(fn).split('.')[-1] == 'nc':
                        tmpList.append(str(fn))
                try:
                    test = tmpList[0]
                except:
                    # OK, there are no Files...
                    raise NoFilesFoundError,\
                        "Couldn't find files for %s in %s %s %s experiment: %s" % (variable, fileType, model,
                                                                                   product, year)

        # Check if we have time-splitted files
        time_values = SolrFindFiles.facets(facets='time', experiment=decStr, latest_version=True, product=product,
                                           institute=institute, variable=variable, time_frequency=time_frequency,
                                           model=model, project=project)
        if len(time_values['time']) > 1:
            # more than one time range found -> merge the pieces per dataset
            tmpList = self.mergeSplittedFiles(tmpList)

        # select only wanted ensemblemembers
        if type(ensemblemembers) == list and ensemblemembers[0] != '*':
            ensList = list()
            for ens in ensemblemembers:
                # case-insensitive substring match of the member name
                onlyfiles = [f for f in tmpList if f.lower().find(ens) != -1]
                if len(onlyfiles) > 0:
                    ensList = ensList + onlyfiles
                else:
                    raise EnsembleMemberError, "Ensemble member %s not found for  %s %s %s for starting year %s" % (ens,fileType, model, product, year)
            tmpList = ensList

        # cut every file down to the requested lead-time window
        for fn in tmpList:
            years = cdo.showyear(input=str(fn))[0]
            yearList = years.split(' ')

            # both the first and the last requested lead year must be present
            if str(year+minLeadtime) not in yearList or str(year+maxleadtime) not in yearList:
                raise NotEnoughYearsInFile, \
                    "1Not enough years in %s %s %s for starting year %s" % (fileType, model, product, year)

            selStr = ','.join(map(str, range(year+minLeadtime, year+1+maxleadtime)))
            fileName = str(fn).split('/')[-1]
            output.append(cdo.selyear(selStr, input=str(fn),
                                      output=self.tmpDir+fileName+self.getRandomStr()+'_'+str(year+minLeadtime)+'-'+str(year+maxleadtime),
                                      options='-f nc'))

            # double-check the produced file really covers the window
            if len(cdo.showyear(input=output[-1])[0].split(' ')) < maxleadtime-minLeadtime:
                raise NotEnoughYearsInFile,\
                    "2Not enough years in %s %s %s for starting year %s" % (fileType, model, product, year)

        if not output or not isinstance(output, list):
            raise NoFilesFoundError, \
                "Couldn't find files for %s in %s %s %s for starting year %s" % (variable, fileType, model,
                                                                                 product, year)

        # check for curvilinear grid
        if not hasattr(self,'curvilinearGrid') or self.curvilinearGrid == True:
            output = self.checkGrid(output, model)

        # user wants to select levels
        if self.level is not None:
            return self.selectLevel(output)
        else:
            return output
Example #5
0
    def getReanalysis(self, year, fileType, experiment, variable, filePath='', time_frequency='mon', maxLeadtime=10,
                      observation_ensemble='*', minLeadtime=1):
        """
        Wrapper method to find reanalysis file with solr_search.

        :param year: startyear
        :param fileType: reanalysis or observation
        :param experiment: i.e. NCEP, HadCrut or MERRA
        :param variable: CMOR Variable
        :param filePath: unused in this method (NOTE(review): confirm before removing)
        :param time_frequency: monthly, yearly, daily and so on
        :param maxLeadtime: last lead year (relative to year) to select
        :param observation_ensemble: ensemble facet for the Solr search
        :param minLeadtime: first lead year; currently forced to 1 (see TODO)
        :return: "decadal" file with observations
        """
        # TODO: BUGFIX for minLeadyear
        minLeadtime = 1  # hard override until the minLeadyear bug is fixed
        reanFiles = list()
        # HadCrut tas is handled by the dedicated observation reader
        if experiment == 'HadCrut' and variable == 'tas':
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)

        # to use your own reanalysis data
        if os.path.isfile(self.observation):
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime, minLeadtime=minLeadtime)

        # build the merged anomaly file only once; cached on self afterwards
        if not hasattr(self, 'mergedReanFile'):
            # Observation or reanalysis?
            facet = SolrFindFiles.facets(facets='data_type', experiment=experiment, variable=variable,
                                         time_frequency=time_frequency)
            try:
                if 'reanalysis' in facet['data_type']:
                    searchList = SolrFindFiles.search(project=['reanalysis', 'observations'],
                                                      experiment=experiment, variable=variable,
                                                      time_frequency=time_frequency, ensemble=observation_ensemble)
                else:
                    # observations are restricted to the gridded product
                    searchList = SolrFindFiles.search(project=['reanalysis', 'observations'],
                                                      experiment=experiment, variable=variable,
                                                      time_frequency=time_frequency, product='grid',
                                                      ensemble=observation_ensemble)
            except IndexError:
                raise NoFilesFoundError, "Couldn't find files for %s in %s" % (variable, experiment)
            for fn in searchList:
                yearTmp = cdo.showyear(input=str(fn))[0]
                fname = str(fn).split('/')[-1]
                reanFiles.append(str(fn))
                # if more than one year in File we break the loop and expect it to be a observationsfile
                if len(yearTmp.split(' ')) > 1:
                    break
            if len(reanFiles) == 0:
                raise NoFilesFoundError,\
                    "Couldn't find files for %s in %s " % (variable, experiment)
            # merge all pieces in time, then subtract the time mean to get anomalies
            mergedFile = cdo.mergetime(input=' '.join(reanFiles), output=self.tmpDir+'mergedREAN_YEARMEAN')
            tmpMean = cdo.timmean(input=mergedFile)
            self.mergedReanFile = cdo.sub(input=' '.join([mergedFile, tmpMean]), output=self.tmpDir+'reananomalies.nc')
            if self.level is not None:
                self.mergedReanFile = self._selectLevel(self.mergedReanFile)

        if not hasattr(self, 'mergedReanFile'):
            raise NoFilesFoundError, "Couldn't find files for %s in %s" % (variable, experiment)

        years = cdo.showyear(input=self.mergedReanFile)[0]
        # extract the requested lead-time window from the merged file
        if years.find(str(year+minLeadtime)) != -1 and years.find(str(year+maxLeadtime)) != -1:
            # create tmp decadal file
            fileStr = ','.join(map(str, range(year+minLeadtime, year+maxLeadtime+1)))
            tmp = cdo.selyear(fileStr, input=self.mergedReanFile,
                              output=self.tmpDir+'reanalysis_'+experiment+str(year+1)+'-'+str(year+maxLeadtime)+'.nc',
                              options='-f nc')
            return tmp
        else:
            raise NotEnoughYearsInFile,\
                "%s-%s are not part of %s reanalysis" % (year+minLeadtime, year+maxLeadtime, experiment)
Example #6
0
    def getFiles(self,
                 year,
                 fileType,
                 model,
                 variable,
                 time_frequency='mon',
                 product='*',
                 ensemblemembers='*',
                 institute='*',
                 exp_prefix='d*',
                 maxleadtime=10):
        '''
        Method to get model files with solr_search.

        :param year: decadal starting year
        :param fileType: baseline1, cmip5, historical or...
        :param model: model name i.e. MPI-ESM-LR
        :param variable: CMOR variable
        :param time_frequency: monthly, yearly, daily and so on
        :param product: product facet, '*' matches anything
        :param ensemblemembers: list of ensemble member names to keep, or '*'
        :param institute: institute facet, '*' matches anything
        :param exp_prefix: experiment name prefix, e.g. 'd*' or 'historical'
        :param maxleadtime: number of lead years (relative to year) to select

        :return: list with all ensemblemembers members found
        '''
        output = list()
        decStr = exp_prefix + str(year)
        project = fileType.lower()
        tmpList = list()
        # collect all netCDF files for the decadal experiment
        for fn in SolrFindFiles.search(experiment=decStr,
                                       latest_version=True,
                                       product=product,
                                       institute=institute,
                                       variable=variable,
                                       time_frequency=time_frequency,
                                       model=model,
                                       project=project):
            if (str(fn).split('.')[-1] == 'nc'):
                tmpList.append(str(fn))
        try:
            test = tmpList[0]  # any results at all?
        except:
            # nothing found: wait and retry once — the Solr index may lag
            import time
            time.sleep(5)  # delays for 5 seconds
            for fn in SolrFindFiles.search(experiment=decStr,
                                           latest_version=True,
                                           product=product,
                                           institute=institute,
                                           variable=variable,
                                           time_frequency=time_frequency,
                                           model=model,
                                           project=project):
                print str(fn)
                if (str(fn).split('.')[-1] == 'nc'):
                    tmpList.append(str(fn))
            try:
                test = tmpList[0]
            except:
                if exp_prefix.find('*') != -1:
                    # a wildcard prefix already searched everything -> give up
                    raise NoFilesFoundError, "Couldn't find files for %s in %s %s %s experiment: %s" % (
                        variable, fileType, model, product, year)
                #OK we can't find files, now try one last time using only the exp_prefix, i.e. "historical"
                for fn in SolrFindFiles.search(experiment=exp_prefix,
                                               latest_version=True,
                                               product=product,
                                               institute=institute,
                                               variable=variable,
                                               time_frequency=time_frequency,
                                               model=model,
                                               project=project):
                    if (str(fn).split('.')[-1] == 'nc'):
                        tmpList.append(str(fn))
                try:
                    test = tmpList[0]
                except:
                    #OK, there are no Files...
                    raise NoFilesFoundError, "Couldn't find files for %s in %s %s %s experiment: %s" % (
                        variable, fileType, model, product, year)

        #select only wanted ensemblemembers
        if type(ensemblemembers) == list and ensemblemembers[0] != '*':
            ensList = list()
            for ens in ensemblemembers:
                # NOTE(review): unlike the sibling getFiles variant this one
                # keeps only the FIRST match per member and does not raise
                # when a member is missing — confirm this is intended
                onlyfiles = [f for f in tmpList if f.find(ens) != -1]
                if len(onlyfiles) > 0:
                    ensList.append(onlyfiles[0])

            tmpList = ensList

        # cut every file down to the requested lead-time window
        for fn in tmpList:

            years = cdo.showyear(input=str(fn))[0]
            yearList = years.split(' ')
            #print years
            #print fn
            # both the first and the last requested lead year must be present
            if str(year +
                   1) not in yearList or str(year +
                                             maxleadtime) not in yearList:
                print year
                raise NotEnoughYearsInFile, "1Not enough years in %s %s %s for starting year %s" % (
                    fileType, model, product, year)

            # only run cdo.selyear when there are more years than needed
            if (len(years.split(' ')) > maxleadtime):
                selStr = ','.join(
                    map(str, range(year + 1, year + 1 + maxleadtime)))
                fileName = str(fn).split('/')[-1]
                output.append(
                    cdo.selyear(selStr,
                                input=str(fn),
                                output=self.tmpDir + fileName + '_' +
                                str(year + 1) + '-' + str(year + maxleadtime)))
            else:
                output.append(str(fn))

            # double-check the selected file really covers the window
            if len(cdo.showyear(input=output[-1])[0].split(' ')) < maxleadtime:
                raise NotEnoughYearsInFile, "2Not enough years in %s %s %s for starting year %s" % (
                    fileType, model, product, year)

        if (not output or not isinstance(output, list)):
            raise NoFilesFoundError, "Couldn't find files for %s in %s %s %s for starting year %s" % (
                variable, fileType, model, product, year)

        #check for curvilinear grid
        if (not hasattr(self, 'curvilinearGrid')
                or self.curvilinearGrid == True):
            output = self.checkGrid(output, model)

        #user wants to select levels
        if self.level is not None:
            return self.selectLevel(output)
        else:
            return output
Example #7
0
    def getReanalysis(self,
                      year,
                      fileType,
                      experiment,
                      variable,
                      filePath='',
                      time_frequency='mon',
                      maxLeadtime=10):
        '''
        Wrapper method to find reanalysis file with solr_search.

        :param year: startyear
        :param fileType: reanalysis or observation
        :param experiment: i.e. NCEP, HadCrut or MERRA
        :param variable: CMOR Variable
        :param filePath: unused in this method (NOTE(review): confirm before removing)
        :param time_frequency: monthly, yearly, daily and so on
        :param maxLeadtime: last lead year (relative to year) to select
        :return: "decadal" file with observations
        '''
        reanFiles = list()
        # HadCrut tas is handled by the dedicated observation reader
        if ((experiment == 'HadCrut') and (variable == 'tas')):
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)

        #to use your own reanalysis data
        if os.path.isfile(self.observation):
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)

        # build the merged anomaly file only once; cached on self afterwards
        if (not hasattr(self, 'mergedReanFile')):
            #Observation or reanalysis?
            facet = SolrFindFiles.facets(facets='data_type',
                                         experiment=experiment,
                                         variable=variable,
                                         time_frequency=time_frequency)
            try:
                if facet['data_type'][0] == 'reanalysis':
                    searchList = SolrFindFiles.search(
                        data_type=['reanalysis', 'observations'],
                        experiment=experiment,
                        variable=variable,
                        time_frequency=time_frequency)
                else:
                    # observations are restricted to the gridded structure
                    searchList = SolrFindFiles.search(
                        data_type=['reanalysis', 'observations'],
                        experiment=experiment,
                        variable=variable,
                        time_frequency=time_frequency,
                        data_structure='grid')
            except IndexError:
                raise NoFilesFoundError, "Couldn't find files for %s in %s" % (
                    variable, experiment)

            for fn in searchList:
                yearTmp = cdo.showyear(input=str(fn))[0]
                fname = str(fn).split('/')[-1]
                #reanFiles.append(cdo.yearmean(input=str(fn), output=self.tmpDir+fname+'_YEARMEAN'))
                reanFiles.append(str(fn))
                #if more than one year in File we break the loop and expect it to be a observationsfile
                if (len(yearTmp.split(' ')) > 1):
                    break
            if (len(reanFiles) == 0):
                raise NoFilesFoundError, "Couldn't find files for %s in %s" % (
                    variable, experiment)
            # merge all pieces in time, then subtract the time mean to get anomalies
            mergedFile = cdo.mergetime(input=' '.join(reanFiles),
                                       output=self.tmpDir +
                                       'mergedREAN_YEARMEAN')
            tmpMean = cdo.timmean(input=mergedFile)
            self.mergedReanFile = cdo.sub(
                input=' '.join([mergedFile, tmpMean]),
                output=self.tmpDir + 'reananomalies.nc')
            #print self.mergedReanFile
            if self.level is not None:
                self.mergedReanFile = self._selectLevel(self.mergedReanFile)

            #print self.mergedReanFile

        if (not hasattr(self, 'mergedReanFile')):
            raise NoFilesFoundError, "Couldn't find files for %s in %s" % (
                variable, experiment)

        years = cdo.showyear(input=self.mergedReanFile)[0]
        # extract the requested lead-time window from the merged file
        if ((years.find(str(year + 1)) != -1)
                and (years.find(str(year + maxLeadtime)) != -1)):
            #create tmp decadal file
            fileStr = ','.join(
                map(str, range(year + 1, year + maxLeadtime + 1)))
            tmp = cdo.selyear(fileStr,
                              input=self.mergedReanFile,
                              output=self.tmpDir + 'reanalysis_' + experiment +
                              str(year + 1) + '-' + str(year + maxLeadtime) +
                              '.nc')
            return tmp
        else:
            raise NotEnoughYearsInFile, "%s-%s are not part of %s reanalysis" % (
                year + 1, year + maxLeadtime, experiment)
Example #8
0
    def test_ingest(self):
        """Dump a fake CMIP5 tree to a dump file, load it into the Solr
        cores and verify that only the newest version of each dataset
        lands in the 'latest' core.
        """
        tmpdir = '/tmp/some_temp_solr_core'
        # 0o777 replaces the Python-2-only octal literal 0777 (same value)
        supermakedirs('/tmp/some_temp_solr_core', 0o777)
        orig_dir = DRSFile.DRS_STRUCTURE[CMIP5]['root_dir']
        # point the DRS root at the scratch dir for this test
        # NOTE(review): orig_dir is never restored here — confirm teardown
        DRSFile.DRS_STRUCTURE[CMIP5]['root_dir'] = tmpdir

        # 'ua' exists in three versions (v20110419/v20110719/v20110819);
        # the other two files are single-version
        files = [
            'cmip5/output1/MOHC/HadCM3/historical/mon/aerosol/aero/r2i1p1/v20110728/wetso2/wetso2_aero_HadCM3_historical_r2i1p1_190912-193411.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2008/mon/atmos/Amon/r9i3p1/v20120523/tauu/tauu_Amon_HadCM3_decadal2008_r9i3p1_200811-201812.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110719/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110819/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110419/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc'
        ]
        latest_versions = [files[0], files[1], files[3]]
        multiversion_latest = files[3]
        old_versions = [files[2], files[4]]

        # create empty placeholder files on disk
        for rel_path in files:
            abs_path = os.path.abspath(os.path.join(tmpdir, rel_path))
            try:
                os.makedirs(os.path.dirname(abs_path))
            except OSError:  # pragma nocover
                pass  # directory already exists
            with open(abs_path, 'w') as f_out:
                f_out.write(' ')

        dump_file = tmpdir + '/dump1.csv'
        SolrCore.dump_fs_to_file(tmpdir + '/cmip5',
                                 dump_file,
                                 check=True,
                                 abort_on_errors=True)
        # test instances, check they are as expected
        SolrCore.load_fs_from_file(dump_file,
                                   abort_on_errors=True,
                                   core_all_files=self.all_files,
                                   core_latest=self.latest)

        # query both cores directly
        ff_all = SolrFindFiles(core='files',
                               host=self.solr_host,
                               port=self.solr_port)
        ff_latest = SolrFindFiles(core='latest',
                                  host=self.solr_host,
                                  port=self.solr_port)
        all_entries = [i for i in ff_all._search()]
        latest_entries = [i for i in ff_latest._search()]
        # every file is in the general core; only the latest versions made
        # it into the 'latest' core, old versions must be absent there
        self.assertTrue(all([tmpdir + '/' + e in all_entries for e in files]))
        self.assertTrue(
            all([tmpdir + '/' + e in latest_entries for e in latest_versions]))
        self.assertTrue(
            all([tmpdir + '/' + e not in latest_entries
                 for e in old_versions]))

        # add an even newer 'ua' version to the dump and re-ingest
        new_version = tmpdir + '/' + 'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20120419/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc'
        with open(dump_file, 'r') as dump_in:
            content = dump_in.readlines()
        # insert right after the dump header lines, with a fake timestamp
        # NOTE(review): index 3 depends on the dump header layout — confirm
        content.insert(3, new_version + ',1564083682.09\n')
        # 'with' closes the file; the previous explicit f.close() inside the
        # with-block was redundant
        with open(dump_file, "w") as dump_out:
            dump_out.write("".join(content))

        SolrCore.load_fs_from_file(dump_file,
                                   abort_on_errors=True,
                                   core_all_files=self.all_files,
                                   core_latest=self.latest)

        self.assertTrue(
            set(ff_all._search()).symmetric_difference(set(all_entries)).pop()
            == new_version)
        self.assertTrue((set(ff_latest._search()) -
                         set(latest_entries)).pop() == new_version)
        self.assertTrue((set(latest_entries) -
                         set(ff_latest._search())).pop() == tmpdir + '/' +
                        multiversion_latest)

        # test get_solr_fields (facets)
        facets = self.all_files.get_solr_fields().keys()
        print self.all_files.get_solr_fields()
        facets_to_be = [
            'model', 'product', 'realm', 'version', 'data_type', 'institute',
            'file_name', 'creation_time', 'cmor_table', 'time_frequency',
            'experiment', 'timestamp', 'file', 'time', 'variable', '_version_',
            'file_no_version', 'project', 'ensemble'
        ]
        self.assertEqual(facets, facets_to_be)

        DRSFile.DRS_STRUCTURE[CMIP5]['root_dir'] = orig_dir
Example #9
0
    def _run(self):
        """Parse ``key=value`` query arguments and search Solr.

        Depending on the flags, prints either facet values (optionally with
        counts), the available attribute (facet) names, or the matching
        file paths, one per line on stdout.
        """
        args = self.args
        last_args = self.last_args

        # Are we searching for facets or files?
        facets = []
        if args.all_facets:
            facets = None  # None means: retrieve every facet
        if args.facet:
            if facets is None:
                # --all-facets already covers any single facet; appending to
                # None would crash
                pass
            else:
                facets.append(args.facet)

        latest = not args.multiversion
        batch_size = args.batch_size if args.batch_size else 10

        # construct search_dict by looping over last_args; repeated keys are
        # collected into a list (interpreted as an OR query by Solr)
        search_dict = {}
        for arg in last_args:
            if '=' not in arg:
                raise CommandError("Invalid format for query: %s" % arg)

            # split only on the first '=' so values may themselves contain '='
            key, value = arg.split('=', 1)

            if key not in search_dict:
                search_dict[key] = value
            else:
                if not isinstance(search_dict[key], list):
                    search_dict[key] = [search_dict[key]]
                search_dict[key].append(value)

        if 'version' in search_dict and latest:
            # it makes no sense to look for a specific version just among the latest
            # the speedup is marginal and it might not be what the user expects
            sys.stderr.write(
                'Turning latest off when searching for a specific version.\n')
            latest = False

        logging.debug("Searching dictionary: %s\n", search_dict)
        # flush stderr in case we have something pending
        sys.stderr.flush()

        if facets != [] and not args.attributes:
            if 'facet.limit' in search_dict:
                facet_limit = int(search_dict['facet.limit'])
            else:
                # default server-side limit; -1 asks Solr for all values
                facet_limit = 1000
                search_dict['facet.limit'] = -1

            for att, values in SolrFindFiles.facets(facets=facets,
                                                    latest_version=latest,
                                                    **search_dict).items():
                # values come in pairs: (value, count)
                value_count = len(values) // 2
                if args.relevant_only and value_count < 2:
                    continue

                if args.count_facet_values:
                    sys.stdout.write('%s: %s' % (att, ','.join([
                        '%s (%s)' % (v, c) for v, c in zip(*[iter(values)] * 2)
                    ])))
                else:
                    sys.stdout.write('%s: %s' % (att, ','.join(values[::2])))

                if value_count == facet_limit:
                    # the value list was truncated at the server-side limit
                    sys.stdout.write('...')

                sys.stdout.write('\n')
                sys.stdout.flush()
        elif args.attributes:
            # select all facets if none was given but this flag was set
            if not facets:
                facets = None
            results = SolrFindFiles.facets(facets=facets,
                                           latest_version=latest,
                                           **search_dict)
            if args.relevant_only:
                # more than one (value, count) pair means the facet is relevant
                atts = ', '.join(k for k in results if len(results[k]) > 2)
            else:
                # reuse the already fetched results instead of querying twice
                atts = ', '.join(results)
            sys.stdout.write(atts)
            sys.stdout.write('\n')
            sys.stdout.flush()
        else:
            # find the files and display them
            for f in SolrFindFiles.search(batch_size=batch_size,
                                          latest_version=latest,
                                          **search_dict):
                sys.stdout.write(str(f))
                sys.stdout.write('\n')
                sys.stdout.flush()