Beispiel #1
0
    def test_command(self):
        """Run the crawl command end to end: bad arguments, crawl, ingest.

        The crawl output file is removed in a ``finally`` clause so that a
        failing assertion does not leave a stale file behind for later runs.
        """
        # both invocations are expected to terminate with SystemExit
        with self.assertRaises(SystemExit):
            self.cmd.run([])

        with self.assertRaises(SystemExit):
            self.cmd.run(['--crawl=%s/cmip5' % self.tmpdir])

        output = '/tmp/crawl_output.txt'
        # the UserCrawl entry is looked up by the bare file name
        tar_name = os.path.basename(output)
        try:
            # test crawl dir
            self.cmd.run(
                ['--crawl=%s/cmip5' % self.tmpdir,
                 '--output=%s' % output])
            self.assertTrue(os.path.isfile(output))
            crawl_obj = UserCrawl.objects.get(tar_file=tar_name)
            self.assertEqual(crawl_obj.status, 'crawling')

            # test ingesting
            self.assertEqual(len(list(SolrFindFiles.search())), 0)
            self.cmd.run(['--ingest=%s' % output])
            crawl_obj = UserCrawl.objects.get(tar_file=tar_name)
            self.assertEqual(crawl_obj.status, 'success')
            self.assertEqual(len(list(SolrFindFiles.search())), 3)

            # test custom host and port
            self.cmd.run([
                '--ingest=%s' % output,
                '--solr-url=http://%s:%s' % (self.solr_host, self.solr_port)
            ])
            self.assertEqual(
                len(list(SolrFindFiles.search(latest_version=False))), 5)
        finally:
            # clean up even when one of the assertions above failed
            if os.path.isfile(output):
                os.remove(output)
Beispiel #2
0
 def test_solr_search(self):
     """Check unfiltered, filtered, all-version and OR searches."""
     finder = SolrFindFiles()

     # default search
     found = finder.search()
     self.assertEqual(len(list(found)), 3)

     # restricting to one experiment yields exactly the first test file
     historical = finder.search(experiment='historical')
     expected_hist = [os.path.join(self.tmpdir, self.files[0])]
     self.assertEqual(list(historical), expected_hist)

     # including older versions
     found = finder.search(latest_version=False)
     self.assertEqual(len(list(found)), 5)

     # a list of values for one facet is treated as an OR query
     either = finder.search(variable=['tauu', 'wetso2'])
     expected_either = {os.path.join(self.tmpdir, name)
                        for name in self.files[:2]}
     self.assertEqual(expected_either, set(either))
Beispiel #3
0
    def getFiles(self, year, fileType, model, variable, time_frequency='mon', product='*', ensemblemembers='*',
                 institute='*', exp_prefix='d*', maxleadtime=10, minLeadtime=1):
        """
        Method to get model files with solr_search.

        The search is tried with ``exp_prefix + str(year)`` as experiment, repeated once after
        a 5 second pause if nothing was found, and, when ``exp_prefix`` contains no wildcard,
        tried a last time with ``exp_prefix`` alone (i.e. "historical").

        :param year: decadal starting year
        :param fileType: baseline1, cmip5, historical or...; lower-cased and used as solr ``project``
        :param model: model name i.e. MPI-ESM-LR
        :param variable: CMOR variable
        :param time_frequency: monthly, yearly, daily and so on
        :param product: passed to the solr search as ``product``
        :param ensemblemembers: list of member names to keep; any non-list value, an empty list or a
                                list starting with '*' keeps all files
        :param institute: passed to the solr search as ``institute``
        :param exp_prefix: experiment prefix, combined with ``year`` to build the experiment name
        :param maxleadtime: last lead year (relative to ``year``) that has to be in the files
        :param minLeadtime: currently ignored, the value is forced to 1 (see TODO below)

        :return: list of files created by ``cdo.selyear`` (passed through ``checkGrid`` unless
                 ``self.curvilinearGrid`` is set and not True), or the result of ``self.selectLevel``
                 when ``self.level`` is set
        :raises NoFilesFoundError: if no '.nc' file could be found
        :raises EnsembleMemberError: if a requested ensemble member matches no file
        :raises NotEnoughYearsInFile: if a file does not cover the requested lead years
        """
        # TODO: BUGFIX for minLeadyear
        minLeadtime = 1
        project = fileType.lower()

        def findNcFiles(experiment):
            # paths of all files ending in '.nc' that solr returns for this experiment
            return [str(fn) for fn in SolrFindFiles.search(experiment=experiment, latest_version=True,
                                                           product=product, institute=institute,
                                                           variable=variable, time_frequency=time_frequency,
                                                           model=model, project=project)
                    if str(fn).split('.')[-1] == 'nc']

        def noFilesFound():
            return NoFilesFoundError("Couldn't find files for %s in %s %s %s experiment: %s" % (
                variable, fileType, model, product, year))

        decStr = exp_prefix+str(year)
        tmpList = findNcFiles(decStr)
        if not tmpList:
            import time
            time.sleep(5)  # delays for 5 seconds
            tmpList = findNcFiles(decStr)
        if not tmpList:
            if '*' in exp_prefix:
                raise noFilesFound()
            # OK we can't find files, now try one last time using only the exp_prefix, i.e. "historical"
            decStr = exp_prefix
            tmpList = findNcFiles(decStr)
            if not tmpList:
                # OK, there are no Files...
                raise noFilesFound()

        # Check if we have time-splitted files
        time_values = SolrFindFiles.facets(facets='time', experiment=decStr, latest_version=True, product=product,
                                           institute=institute, variable=variable, time_frequency=time_frequency,
                                           model=model, project=project)
        if len(time_values['time']) > 1:
            tmpList = self.mergeSplittedFiles(tmpList)

        # select only wanted ensemblemembers
        if isinstance(ensemblemembers, list) and ensemblemembers and ensemblemembers[0] != '*':
            ensList = []
            for ens in ensemblemembers:
                # compare case-insensitively on both sides
                wanted = ens.lower()
                onlyfiles = [f for f in tmpList if wanted in f.lower()]
                if not onlyfiles:
                    raise EnsembleMemberError(
                        "Ensemble member %s not found for  %s %s %s for starting year %s" % (
                            ens, fileType, model, product, year))
                ensList.extend(onlyfiles)
            tmpList = ensList

        firstYear = year+minLeadtime
        lastYear = year+maxleadtime
        # firstYear..lastYear inclusive
        expectedYears = lastYear-firstYear+1
        selStr = ','.join(map(str, range(firstYear, lastYear+1)))

        output = []
        for fn in tmpList:
            yearList = cdo.showyear(input=str(fn))[0].split(' ')

            if str(firstYear) not in yearList or str(lastYear) not in yearList:
                raise NotEnoughYearsInFile(
                    "1Not enough years in %s %s %s for starting year %s" % (fileType, model, product, year))

            fileName = str(fn).split('/')[-1]
            output.append(cdo.selyear(selStr, input=str(fn),
                                      output=self.tmpDir+fileName+self.getRandomStr()+'_'+str(firstYear)+'-'+str(lastYear),
                                      options='-f nc'))

            # every requested year has to be present in the selection
            if len(cdo.showyear(input=output[-1])[0].split(' ')) < expectedYears:
                raise NotEnoughYearsInFile(
                    "2Not enough years in %s %s %s for starting year %s" % (fileType, model, product, year))

        if not output:
            raise NoFilesFoundError(
                "Couldn't find files for %s in %s %s %s for starting year %s" % (variable, fileType, model,
                                                                                 product, year))

        # check for curvilinear grid (also done when the attribute was never set)
        if not hasattr(self, 'curvilinearGrid') or self.curvilinearGrid == True:
            output = self.checkGrid(output, model)

        # user wants to select levels
        if self.level is not None:
            return self.selectLevel(output)
        return output
Beispiel #4
0
    def getReanalysis(self, year, fileType, experiment, variable, filePath='', time_frequency='mon', maxLeadtime=10,
                      observation_ensemble='*', minLeadtime=1):
        """
        Wrapper method to find reanalysis file with solr_search.

        The merged anomaly file is built once and kept in ``self.mergedReanFile``; later calls
        only cut the requested years out of it.

        :param year: startyear
        :param fileType: reanalysis or observation
        :param experiment: i.e. NCEP, HadCrut or MERRA
        :param variable: CMOR Variable
        :param time_frequency: monthly, yearly, daily and so on
        :param maxLeadtime: last lead year (relative to ``year``) to select
        :param observation_ensemble: passed to the solr search as ``ensemble``
        :param minLeadtime: currently overwritten with 1 (see TODO below)
        :return: "decadal" file with observations
        """
        # TODO: BUGFIX for minLeadyear
        minLeadtime = 1

        if experiment == 'HadCrut' and variable == 'tas':
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)

        # to use your own reanalysis data
        if os.path.isfile(self.observation):
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime, minLeadtime=minLeadtime)

        if not hasattr(self, 'mergedReanFile'):
            # Observation or reanalysis?
            dataTypes = SolrFindFiles.facets(facets='data_type', experiment=experiment, variable=variable,
                                             time_frequency=time_frequency)
            query = dict(project=['reanalysis', 'observations'], experiment=experiment, variable=variable,
                         time_frequency=time_frequency, ensemble=observation_ensemble)
            try:
                if 'reanalysis' not in dataTypes['data_type']:
                    # anything but reanalysis is restricted to product 'grid'
                    query['product'] = 'grid'
                searchList = SolrFindFiles.search(**query)
            except IndexError:
                raise NoFilesFoundError("Couldn't find files for %s in %s" % (variable, experiment))

            reanFiles = []
            for fn in searchList:
                path = str(fn)
                yearsInFile = cdo.showyear(input=path)[0]
                reanFiles.append(path)
                # if more than one year in File we break the loop and expect it to be a observationsfile
                if len(yearsInFile.split(' ')) > 1:
                    break
            if not reanFiles:
                raise NoFilesFoundError("Couldn't find files for %s in %s " % (variable, experiment))

            # anomalies: merged time series minus its time mean
            mergedFile = cdo.mergetime(input=' '.join(reanFiles), output=self.tmpDir+'mergedREAN_YEARMEAN')
            tmpMean = cdo.timmean(input=mergedFile)
            self.mergedReanFile = cdo.sub(input=' '.join([mergedFile, tmpMean]),
                                          output=self.tmpDir+'reananomalies.nc')
            if self.level is not None:
                self.mergedReanFile = self._selectLevel(self.mergedReanFile)

        if not hasattr(self, 'mergedReanFile'):
            raise NoFilesFoundError("Couldn't find files for %s in %s" % (variable, experiment))

        firstYear = year+minLeadtime
        lastYear = year+maxLeadtime
        years = cdo.showyear(input=self.mergedReanFile)[0]
        if str(firstYear) not in years or str(lastYear) not in years:
            raise NotEnoughYearsInFile(
                "%s-%s are not part of %s reanalysis" % (firstYear, lastYear, experiment))

        # create tmp decadal file
        fileStr = ','.join([str(y) for y in range(firstYear, lastYear+1)])
        target = ''.join([self.tmpDir, 'reanalysis_', experiment, str(year+1), '-', str(lastYear), '.nc'])
        return cdo.selyear(fileStr, input=self.mergedReanFile, output=target, options='-f nc')
Beispiel #5
0
    def getFiles(self,
                 year,
                 fileType,
                 model,
                 variable,
                 time_frequency='mon',
                 product='*',
                 ensemblemembers='*',
                 institute='*',
                 exp_prefix='d*',
                 maxleadtime=10):
        '''
        Method to get model files with solr_search.

        The search is tried with ``exp_prefix + str(year)`` as experiment, repeated once after
        a 5 second pause if nothing was found, and, when ``exp_prefix`` contains no wildcard,
        tried a last time with ``exp_prefix`` alone (i.e. "historical").

        :param year: decadal starting year
        :param fileType: baseline1, cmip5, historical or...; lower-cased and used as solr ``project``
        :param model: model name i.e. MPI-ESM-LR
        :param variable: CMOR variable
        :param time_frequency: monthly, yearly, daily and so on
        :param product: passed to the solr search as ``product``
        :param ensemblemembers: list of member names; for each name the first matching file is kept
                                and names without a match are skipped. Any non-list value, an empty
                                list or a list starting with '*' keeps all files
        :param institute: passed to the solr search as ``institute``
        :param exp_prefix: experiment prefix, combined with ``year`` to build the experiment name
        :param maxleadtime: number of lead years (year+1 .. year+maxleadtime) required per file

        :return: list with one file per input file (the original path, or a ``cdo.selyear`` cut when
                 the file holds more than ``maxleadtime`` years), passed through ``checkGrid`` unless
                 ``self.curvilinearGrid`` is set and not True; or the result of ``self.selectLevel``
                 when ``self.level`` is set
        :raises NoFilesFoundError: if no '.nc' file could be found
        :raises NotEnoughYearsInFile: if a file does not cover the requested lead years
        '''
        project = fileType.lower()

        def findNcFiles(experiment, echo=False):
            # paths of all files ending in '.nc' that solr returns for this experiment;
            # with echo=True every returned path is printed
            found = []
            for fn in SolrFindFiles.search(experiment=experiment,
                                           latest_version=True,
                                           product=product,
                                           institute=institute,
                                           variable=variable,
                                           time_frequency=time_frequency,
                                           model=model,
                                           project=project):
                if echo:
                    print(str(fn))
                if str(fn).split('.')[-1] == 'nc':
                    found.append(str(fn))
            return found

        def noFilesFound():
            return NoFilesFoundError(
                "Couldn't find files for %s in %s %s %s experiment: %s" % (
                    variable, fileType, model, product, year))

        decStr = exp_prefix + str(year)
        tmpList = findNcFiles(decStr)
        if not tmpList:
            import time
            time.sleep(5)  # delays for 5 seconds
            tmpList = findNcFiles(decStr, echo=True)
        if not tmpList:
            if '*' in exp_prefix:
                raise noFilesFound()
            #OK we can't find files, now try one last time using only the exp_prefix, i.e. "historical"
            tmpList = findNcFiles(exp_prefix)
            if not tmpList:
                #OK, there are no Files...
                raise noFilesFound()

        #select only wanted ensemblemembers
        if (isinstance(ensemblemembers, list) and ensemblemembers
                and ensemblemembers[0] != '*'):
            ensList = []
            for ens in ensemblemembers:
                onlyfiles = [f for f in tmpList if ens in f]
                if onlyfiles:
                    ensList.append(onlyfiles[0])
            tmpList = ensList

        firstYear = year + 1
        lastYear = year + maxleadtime
        selStr = ','.join(map(str, range(firstYear, lastYear + 1)))

        output = []
        for fn in tmpList:
            yearList = cdo.showyear(input=str(fn))[0].split(' ')
            if str(firstYear) not in yearList or str(lastYear) not in yearList:
                print(year)
                raise NotEnoughYearsInFile(
                    "1Not enough years in %s %s %s for starting year %s" % (
                        fileType, model, product, year))

            if len(yearList) > maxleadtime:
                # file holds more than the wanted years, cut them out
                fileName = str(fn).split('/')[-1]
                output.append(
                    cdo.selyear(selStr,
                                input=str(fn),
                                output=self.tmpDir + fileName + '_' +
                                str(firstYear) + '-' + str(lastYear)))
            else:
                output.append(str(fn))

            if len(cdo.showyear(input=output[-1])[0].split(' ')) < maxleadtime:
                raise NotEnoughYearsInFile(
                    "2Not enough years in %s %s %s for starting year %s" % (
                        fileType, model, product, year))

        if not output:
            raise NoFilesFoundError(
                "Couldn't find files for %s in %s %s %s for starting year %s" % (
                    variable, fileType, model, product, year))

        #check for curvilinear grid (also done when the attribute was never set)
        if (not hasattr(self, 'curvilinearGrid')
                or self.curvilinearGrid == True):
            output = self.checkGrid(output, model)

        #user wants to select levels
        if self.level is not None:
            return self.selectLevel(output)
        return output
Beispiel #6
0
    def getReanalysis(self,
                      year,
                      fileType,
                      experiment,
                      variable,
                      filePath='',
                      time_frequency='mon',
                      maxLeadtime=10):
        '''
        Wrapper method to find reanalysis file with solr_search.

        The merged anomaly file is built once and kept in ``self.mergedReanFile``; later calls
        only cut the requested years out of it.

        :param year: startyear
        :param fileType: reanalysis or observation
        :param experiment: i.e. NCEP, HadCrut or MERRA
        :param variable: CMOR Variable
        :param time_frequency: monthly, yearly, daily and so on
        :param maxLeadtime: last lead year (relative to ``year``) to select
        :return: "decadal" file with observations
        '''
        if experiment == 'HadCrut' and variable == 'tas':
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)

        #to use your own reanalysis data
        if os.path.isfile(self.observation):
            return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)

        if not hasattr(self, 'mergedReanFile'):
            #Observation or reanalysis?
            facet = SolrFindFiles.facets(facets='data_type',
                                         experiment=experiment,
                                         variable=variable,
                                         time_frequency=time_frequency)
            query = dict(data_type=['reanalysis', 'observations'],
                         experiment=experiment,
                         variable=variable,
                         time_frequency=time_frequency)
            try:
                # an empty facet list raises IndexError here
                if facet['data_type'][0] != 'reanalysis':
                    query['data_structure'] = 'grid'
                searchList = SolrFindFiles.search(**query)
            except IndexError:
                raise NoFilesFoundError("Couldn't find files for %s in %s" % (
                    variable, experiment))

            reanFiles = []
            for fn in searchList:
                path = str(fn)
                yearsInFile = cdo.showyear(input=path)[0]
                reanFiles.append(path)
                #if more than one year in File we break the loop and expect it to be a observationsfile
                if len(yearsInFile.split(' ')) > 1:
                    break
            if not reanFiles:
                raise NoFilesFoundError("Couldn't find files for %s in %s" % (
                    variable, experiment))

            # anomalies: merged time series minus its time mean
            mergedFile = cdo.mergetime(input=' '.join(reanFiles),
                                       output=self.tmpDir +
                                       'mergedREAN_YEARMEAN')
            tmpMean = cdo.timmean(input=mergedFile)
            self.mergedReanFile = cdo.sub(
                input=' '.join([mergedFile, tmpMean]),
                output=self.tmpDir + 'reananomalies.nc')
            if self.level is not None:
                self.mergedReanFile = self._selectLevel(self.mergedReanFile)

        if not hasattr(self, 'mergedReanFile'):
            raise NoFilesFoundError("Couldn't find files for %s in %s" % (
                variable, experiment))

        firstYear = year + 1
        lastYear = year + maxLeadtime
        years = cdo.showyear(input=self.mergedReanFile)[0]
        if str(firstYear) not in years or str(lastYear) not in years:
            raise NotEnoughYearsInFile("%s-%s are not part of %s reanalysis" % (
                firstYear, lastYear, experiment))

        #create tmp decadal file
        fileStr = ','.join([str(y) for y in range(firstYear, lastYear + 1)])
        target = ''.join([self.tmpDir, 'reanalysis_', experiment,
                          str(firstYear), '-', str(lastYear), '.nc'])
        return cdo.selyear(fileStr, input=self.mergedReanFile, output=target)
Beispiel #7
0
    def _run(self):
        """Query solr with the parsed arguments and print the result to stdout.

        Depending on ``self.args`` one of three things is printed:

        * facet values (``all_facets`` or ``facet`` set, ``attributes`` not set),
          one ``name: value,value,...`` line per facet,
        * the facet names only (``attributes`` set), comma separated on one line,
        * otherwise the matching files, one per line.

        ``self.last_args`` holds the ``key=value`` search constraints; a key given
        several times collects its values in a list.

        :raises CommandError: if a constraint contains no '='
        """
        args = self.args
        last_args = self.last_args

        # Are we searching for facets or files?
        # None selects all facets and wins over a single requested facet
        if args.all_facets:
            facets = None
        elif args.facet:
            facets = [args.facet]
        else:
            facets = []

        latest = not args.multiversion
        batch_size = args.batch_size if args.batch_size else 10

        search_dict = {}
        # contruct search_dict by looping over last_args
        for arg in last_args:
            if '=' not in arg:
                raise CommandError("Invalid format for query: %s" % arg)

            # split on the first '=' only, the value may contain '=' itself
            key, value = arg.split('=', 1)

            if key not in search_dict:
                search_dict[key] = value
            else:
                if not isinstance(search_dict[key], list):
                    search_dict[key] = [search_dict[key]]
                search_dict[key].append(value)

        if 'version' in search_dict and latest:
            # it makes no sense to look for a specific version just among the latest
            # the speedup is marginal and it might not be what the user expects
            sys.stderr.write(
                'Turning latest of when searching for a specific version.')
            latest = False

        logging.debug("Searching dictionary: %s\n", search_dict)
        # flush stderr in case we have something pending
        sys.stderr.flush()

        if facets != [] and not args.attributes:
            if 'facet.limit' in search_dict:
                facet_limit = int(search_dict['facet.limit'])
            else:
                # default
                facet_limit = 1000
                search_dict['facet.limit'] = -1

            for att, values in SolrFindFiles.facets(facets=facets,
                                                    latest_version=latest,
                                                    **search_dict).items():
                # values come in pairs: (value, count)
                value_count = len(values) // 2
                if args.relevant_only and value_count < 2:
                    continue

                if args.count_facet_values:
                    pairs = zip(values[::2], values[1::2])
                    shown = ','.join(['%s (%s)' % (v, c) for v, c in pairs])
                else:
                    shown = ','.join(values[::2])
                sys.stdout.write('%s: %s' % (att, shown))

                # mark output that was cut at the requested limit
                if value_count == facet_limit:
                    sys.stdout.write('...')

                sys.stdout.write('\n')
                sys.stdout.flush()
        elif args.attributes:
            # select all is none defined but this flag was set
            if not facets:
                facets = None
            results = SolrFindFiles.facets(facets=facets,
                                           latest_version=latest,
                                           **search_dict)
            if args.relevant_only:
                atts = ', '.join([k for k in results if len(results[k]) > 2])
            else:
                # reuse the result instead of querying a second time
                atts = ', '.join(results)
            sys.stdout.write(atts)
            sys.stdout.write('\n')
            sys.stdout.flush()
        else:
            # find the files and display them
            for f in SolrFindFiles.search(batch_size=batch_size,
                                          latest_version=latest,
                                          **search_dict):
                sys.stdout.write(str(f))
                sys.stdout.write('\n')
                sys.stdout.flush()