Code Example #1
File: RadsDsets.py Project: whigg/geoslurp
    def register(self, cycle=None, since=None):
        if since:
            since = datetime.strptime(since, "%Y-%m-%d")
        else:
            since = self._dbinvent.lastupdate
        #create a list of files which need to be (re)registered
        if self.updated:
            files = self.updated
        else:
            slurplogger().info("Listing files to process (this can take a while)...")
            if cycle:
                files = [UriFile(file) for file in findFiles(os.path.join(self._dbinvent.datadir, self.sat, self.phase, "c%03d" % (cycle)), r'.*\.nc$', since=since)]
            else:
                files = [UriFile(file) for file in findFiles(os.path.join(self._dbinvent.datadir, self.sat, self.phase), r'.*\.nc$', since=since)]
        if not files:
            slurplogger().info("No updated files found")
            return

        newfiles=self.retainnewUris(files)
        if not newfiles:
            slurplogger().info("Nothing to update")
            return

        for uri in newfiles:
            meta=radsMetaDataExtractor(uri)
            if not meta:
               #don't register empty entries
               continue

            self.addEntry(meta)

        self.updateInvent()
Code Example #2
File: motu.py Project: whigg/geoslurp
    def download(self):
        """Download file"""
        muri = Uri(self.mopts)

        #check if download is needed
        muri.requestInfo()
        uristacked = UriFile(self.mopts.fullname())
        if uristacked.lastmod:
            if muri.lastmod <= uristacked.lastmod:
                slurplogger().info("Already downloaded %s" % (uristacked.url))
                #quick return when there is no need to merge/download
                return uristacked, False

        #check whether the requested download size is within the server limit
        kb, maxkb = muri.updateSize()
        if kb > maxkb:
            #split up request and try again

            #create 2 bounding boxes split on time
            Abbox, Bbbox = muri.opts.btdbox.timeSplit()

            AmotuRec = MotuRecursive(copy.deepcopy(self.mopts))
            AmotuRec.mopts.syncbtdbox(Abbox)
            AmotuRec.mopts.out_name = self.mopts.out_name.replace(
                '.nc', '_A.nc')
            AmotuRec.mopts.out_dir = AmotuRec.mopts.cache

            BmotuRec = MotuRecursive(copy.deepcopy(self.mopts))
            BmotuRec.mopts.syncbtdbox(Bbbox)
            BmotuRec.mopts.out_name = self.mopts.out_name.replace(
                '.nc', '_B.nc')
            BmotuRec.mopts.out_dir = BmotuRec.mopts.cache

            Auri, Aupd = AmotuRec.download()
            Buri, Bupd = BmotuRec.download()

            #possible improvement: split the dataset at an unlimited dimension and append the second part to the first
            #patch the files together (if updated)
            if Aupd or Bupd or not os.path.exists(self.mopts.fullname()):
                uristacked, upd = stackNcFiles(self.mopts.fullname(), Auri.url,
                                               Buri.url, 'time')
                if not self.keepfiles:
                    #remove the partial files
                    os.remove(AmotuRec.mopts.fullname())
                    os.remove(BmotuRec.mopts.fullname())
            else:
                uristacked = UriFile(self.mopts.fullname())
                upd = False
            return uristacked, upd
        else:
            return muri.download(self.mopts.out_dir, check=True)
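The download above guards against the server-side request limit: when updateSize() reports a too-large request, the time window is bisected and each half is fetched by its own MotuRecursive instance before stackNcFiles stitches the results back together. A minimal, self-contained sketch of that bisect-until-it-fits pattern (the names below are illustrative, not part of geoslurp):

    def fetch_windows(t0, t1, max_span=10.0):
        """Return the list of (t0, t1) windows that would actually be requested."""
        if t1 - t0 <= max_span:
            #small enough for a single direct request
            return [(t0, t1)]
        #too large: bisect on time and recurse into both halves
        mid = 0.5 * (t0 + t1)
        return fetch_windows(t0, mid, max_span) + fetch_windows(mid, t1, max_span)

    print(fetch_windows(0.0, 35.0))  #four windows, each spanning <= 10 time units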
Code Example #3
File: icgemDsets.py Project: strawpants/geoslurp
    def register(self, pattern=None):
        """Register static gravity fields donwloaded in the data director
        :param pattern: only register files whose filename obeys this regular expression
        """
        if not pattern:
            pattern = '.*\.gz'
        #create a list of files which need to be (re)registered
        if self.updated:
            files = self.updated
        else:
            files = [
                UriFile(file) for file in findFiles(self.dataDir(), pattern)
            ]

        #loop over files
        for uri in files:
            urilike = os.path.basename(uri.url)

            if not self.uriNeedsUpdate(urilike, uri.lastmod):
                continue

            meta = icgemMetaExtractor(uri)
            self.addEntry(meta)

        self.updateInvent()
Code Example #4
File: EasyCora.py Project: strawpants/geoslurp
 def pull(self, pattern='.*'):
     """Pulls the EasyCORA files from the Copernicus FTP server and unpacks them
     :param pattern (string): only download data which obeys this regular expression file pattern (e.g. 20[0-9][0-9] to download from 2000 onward)
     """
     ftproot="ftp://my.cmems-du.eu/Core/INSITU_GLO_TS_REP_OBSERVATIONS_013_001_b/CORIOLIS-GLOBAL-EasyCORA-OBS/global"

     #get cmems authentication details from the database
     cred=self.conf.authCred("cmems")
     ftpcr=ftpCrawler(ftproot,auth=cred, pattern=pattern)

     updated=ftpcr.parallelDownload(self.cacheDir(),check=True,maxconn=10,continueonError=True)

     #unpack the downloaded files in the data directory (only unpack when needed)
     datadir=self.dataDir()
     for tarf in [UriFile(f) for f in findFiles(self.cacheDir(),r".*tgz$")]:
         successfile=os.path.join(datadir,os.path.basename(tarf.url)+".isextracted")
         if os.path.exists(successfile):
             #the marker file shows this archive was already extracted
             slurplogger().info(f"{tarf.url} is already extracted, skipping")
             continue
         with tarfile.open(tarf.url,"r:gz") as tf:
             slurplogger().info(f"Extracting trajectory files from {tarf.url}")
             tf.extractall(datadir)
         #touch the success file to indicate this archive has been successfully extracted
         Path(successfile).touch()
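pull() relies on a marker-file idiom: an empty "<archive>.isextracted" file in the data directory records that a tarball was already unpacked, so reruns skip the expensive extraction. A self-contained sketch of the same idiom (illustrative names, not geoslurp API):

    import tarfile
    from pathlib import Path

    def extract_once(tarpath: Path, datadir: Path) -> bool:
        """Unpack tarpath into datadir unless a marker file says it was done before."""
        marker = datadir / (tarpath.name + ".isextracted")
        if marker.exists():
            return False  #already unpacked on a previous run
        with tarfile.open(tarpath, "r:gz") as tf:
            tf.extractall(datadir)
        marker.touch()  #record success only after a complete extraction
        return True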
Code Example #5
File: Argo.py Project: strawpants/geoslurp
    def register(self, center=None):
        """register downloaded commbined prof files"""
        #create a list of files which need to be (re)registered
        if self.updated:
            files = self.updated
        else:
            slurplogger().info("Building file list..")
            files = [
                UriFile(file) for file in findFiles(self.dataDir(), '.*nc',
                                                    self._dbinvent.lastupdate)
            ]

        if len(files) == 0:
            slurplogger().info("Argo: No new files found since last update")
            return

        filesnew = self.retainnewUris(files)
        if len(filesnew) == 0:
            slurplogger().info("Argo: No database update needed")
            return
        #loop over files
        for uri in filesnew:
            if center and not re.search(center, uri.url):
                continue
            meta = argoMetaExtractor(uri)
            if meta:
                self.addEntry(meta)

        self.updateInvent()
Code Example #6
    def download(self,
                 direc,
                 check=False,
                 gzip=False,
                 outfile=None,
                 continueonError=False):

        if not self.webdav:
            self.connect()

        if outfile:
            outf = os.path.join(direc, outfile)
        else:
            outf = os.path.join(direc, self.fname)

        uri = UriFile(url=outf)

        if check and self.lastmod and uri.lastmod:
            if self.lastmod <= uri.lastmod:
                #no need to download the file
                slurplog.info("Already Downloaded, skipping %s" % (uri.url))
                return uri, False

        slurplog.info("Downloading %s" % (uri.url))
        self.webdav.download(self.fname, uri.url)

        #change modification and access time to that provided by the ftp server
        setFtime(uri.url, self.lastmod)
        return uri, True
Code Example #7
File: motu.py Project: whigg/geoslurp
    def download(self, direc, check=False, gzip=False, outfile=None):
        #check whether the file exists and retrieve the last update date

        if outfile:
            self.opts.out_name = outfile

        self.opts.out_dir = direc

        fout = os.path.join(direc, self.opts.out_name)

        uri = UriFile(fout)
        if check and os.path.exists(fout):
            self.updateModTime()

            if self.lastmod <= uri.lastmod:
                slurplogger().info("No need to download file %s" % (fout))
                return uri, False

        slurplogger().info("Downloading %s" % (fout))
        try:
            execute_request(self.opts)
        except Exception as e:
            slurplogger().error("failed to download %s: %s", fout, e)
            raise
        return uri, True
Code Example #8
    def retainnewUris(self, urilist):
        """Filters those uris which have table entries which are too old or are not present in the database"""
        #create a temporary table with uri and lastmodification time entries
        cols = [
            Column('id', Integer, primary_key=True),
            Column('uri', String),
            Column('lastmod', TIMESTAMP)
        ]

        #set up a separate session and transaction in order to work with a temporary table
        trans, ses = self.db.transsession()

        tmptable = self.db.createTable('tmpuris',
                                       cols,
                                       temporary=True,
                                       bind=ses.get_bind())
        #fill the table with the file list and last modification timestamps
        count = 0
        for uri in urilist:
            entry = tmptable(uri=uri.url, lastmod=uri.lastmod)
            ses.add(entry)
            count += 1
            if count > self.commitperN:
                ses.commit()
                count = 0

        ses.commit()

        #delete all entries which require updating:
        #first gather the ids of entries which are expired
        subqry = ses.query(self.table.id).join(
            tmptable,
            and_(tmptable.uri == self.table.uri,
                 tmptable.lastmod > self.table.lastupdate)).subquery()
        #then delete those entries from the table

        delqry = self.table.__table__.delete().where(self.table.id.in_(subqry))
        ses.execute(delqry)

        #now make a list of new uris
        qrynew = ses.query(tmptable).outerjoin(
            self.table,
            self.table.uri == tmptable.uri).filter(self.table.uri.is_(None))

        #submit transaction
        trans.commit()
        #return the uris whose entries in the original table still need (re)registration
        return [UriFile(x.uri, x.lastmod) for x in qrynew]
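The temporary table lets retainnewUris do the filtering server-side with a single join: expired rows are deleted from the main table, and a uri is returned only when no current entry exists for it. The same logic, sketched with plain dictionaries instead of SQL (a simplification for illustration, not the geoslurp API):

    def retain_new(candidates, registered):
        """candidates and registered both map uri -> last-modification time."""
        #drop registered entries which are older than the candidate file
        expired = [u for u, t in candidates.items()
                   if u in registered and t > registered[u]]
        for u in expired:
            del registered[u]
        #anything without a current entry still needs (re)registration
        return [u for u in candidates if u not in registered]

    reg = {"a.nc": 1, "b.nc": 5}
    print(retain_new({"a.nc": 2, "b.nc": 5, "c.nc": 1}, reg))  #['a.nc', 'c.nc']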
Code Example #9
File: grdc.py Project: danilecug/geoslurp
 def register(self):
     slurplogger().info("Building file list..")
     files=[UriFile(file) for file in findFiles(self.dataDir(),'.*gz',self._dbinvent.lastupdate)]
     filesnew=self.retainnewUris(files)
     if len(filesnew) == 0:
         slurplogger().info("GRDC: No database update needed")
         return
     #loop over files
     for uri in filesnew:
         meta=GRDCmetaExtractor(uri)
         self.addEntry(meta)
     
     self.updateInvent()
Code Example #10
File: EasyCora.py Project: strawpants/geoslurp
    def register(self, pattern=r'.*\.nc$'):
        """Register downloaded trajectory files from CORA
        :param pattern (string): file pattern to look for (defaults to all files ending with .nc)
        """
        #create a list of files which need to be (re)registered
        newfiles=self.retainnewUris([UriFile(file) for file in findFiles(self.dataDir(),pattern)])
        for uri in newfiles:
            meta=coraMetaExtractor(uri)
            if not meta:
                #don't register empty entries
                continue

            self.addEntry(meta)
        self._dbinvent.data["Description"]="EasyCora output data table"
        self._dbinvent.data["CORAversion"] = "5.2"
        self.updateInvent()
Code Example #11
    def register(self):
        slurplogger().info("Building file list..")
        files = [
            UriFile(file) for file in findFiles(self.cacheDir(), '.*love',
                                                self._dbinvent.lastupdate)
        ]

        if len(files) == 0:
            slurplogger().info("LLove: No new files found since last update")
            return

        self.truncateTable()
        #loop over files
        for uri in files:
            self.addEntry(lloveMetaExtractor(uri))
        self.updateInvent()
Code Example #12
    def parallelDownload(self, outdir, check=False):
        updated = []
        if check:
            cmd = [
                'rsync', '-avz', '--del', '--update',
                self.auth.user + "@" + self.rooturl, outdir
            ]
        else:
            cmd = [
                'rsync', '-avz', '--del', self.auth.user + "@" + self.rooturl,
                outdir
            ]

        for file in self.startrsync(cmd):
            updated.append(UriFile(file))
        return updated
Code Example #13
File: snrei.py Project: danilecug/geoslurp
    def register(self):
        slurplogger().info("Building file list..")
        files=[UriFile(file) for file in findFiles(self.dataDir(),'.*love',self._dbinvent.lastupdate)]

        if len(files) == 0:
            slurplogger().info("LLove: No new files found since last update")
            return

        filesnew=self.retainnewUris(files)
        if len(filesnew) == 0:
            slurplogger().info("LLove: No database update needed")
            return
        #loop over files
        for uri in filesnew:
            self.addEntry(lloveMetaExtractor(uri))
        self.updateInvent()
Code Example #14
    def register(self):

        #create a list of files which need to be (re)registered
        if self.updated:
            files=self.updated
        else:
            files=[UriFile(file) for file in findFiles(self._dbinvent.datadir,'.*gfc.gz',since=self._dbinvent.lastupdate)]

        newfiles=self.retainnewUris(files)
        #loop over files
        for uri in newfiles:
            slurplogger().info("extracting meta info from %s"%(uri.url))
            meta=icgemMetaExtractor(uri)
            meta=enhanceMeta(meta)
            self.addEntry(meta)

        self.updateInvent()
Code Example #15
File: cdsbase.py Project: strawpants/geoslurp
    def register(self):
        if not self.table:
            #create a new table on the fly
            self.createTable(self.columns)

        #create a list of files which need to be (re)registered
        newfiles = self.retainnewUris([
            UriFile(file)
            for file in findFiles(self.dataDir(), rf".*\{self.app}$")
        ])
        for uri in newfiles:
            meta = self.metaExtractor(uri)
            if not meta:
                #don't register empty entries
                continue
            slurplogger().info(f"Adding metadata from {uri.url}")
            self.addEntry(meta)
        self._dbinvent.data["Description"] = self.description
        self.updateInvent()
Code Example #16
File: GRACEDsets.py Project: danilecug/geoslurp
    def register(self):

        #create a list of files which need to be (re)registered
        if self.updated:
            files = self.updated
        else:
            files = [
                UriFile(file) for file in findFiles(self.dataDir(), r'G.*\.gz',
                                                    self._dbinvent.lastupdate)
            ]

        filesnew = self.retainnewUris(files)

        #loop over the newer files
        for uri in filesnew:
            meta = graceMetaExtractor(uri)
            self.addEntry(meta)

        self.updateInvent()
Code Example #17
    def download(self,
                 direc,
                 check=False,
                 outfile=None,
                 continueonError=False,
                 restdict=None):
        """Download file into directory and possibly check the modification time
        :param check : check whether the file needs updating
        :param gzip: additionally gzips the file (adds .gz to file name)
        :param continueonError (bool): don't raise an exception when a download error occurrs
        """

        #setup the output uri
        if outfile:
            outf = os.path.join(direc, self.subdirs, outfile)
        else:
            outf = os.path.join(direc, self.subdirs,
                                os.path.basename(self.url))

        #create directory if it does not exist
        if not os.path.exists(os.path.dirname(outf)):
            os.makedirs(os.path.dirname(outf), exist_ok=True)

        uri = UriFile(url=outf)
        if check and self.lastmod and uri.lastmod:
            if self.lastmod <= uri.lastmod:
                #no need to download the file
                slurplog.info("Already Downloaded, skipping %s" % (uri.url))
                return uri, False
        slurplog.info("Downloading %s" % (uri.url))

        stat = self.sftpconnection.stat(self.rpath)
        mtime = datetime.fromtimestamp(stat.st_mtime)
        self.sftpconnection.get(self.rpath, outf)
        #set the modification time to match the server
        setFtime(outf, mtime)

        return uri, True
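The download() variants on this page all share the same freshness guard: compare the remote modification time against the local file's and transfer only when the remote copy is strictly newer. A self-contained sketch of that guard (illustrative names, not geoslurp API):

    import os
    from datetime import datetime

    def needs_download(localpath, remote_lastmod):
        if not os.path.exists(localpath):
            return True  #no local copy yet
        local_lastmod = datetime.fromtimestamp(os.path.getmtime(localpath))
        return remote_lastmod > local_lastmod  #transfer only when the remote copy is newer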
Code Example #18
File: rsync.py Project: strawpants/geoslurp
    def parallelDownload(self,
                         outdir,
                         check=False,
                         includes=None,
                         dryrun=False):
        updated = []
        cmd = ['rsync', '-avz', '--del']
        if check:
            cmd.append('--update')
        if dryrun:
            cmd.append('--dry-run')
        if includes:
            cmd.extend([f'--include={inc}' for inc in includes])
            #exclude everything else which is not obeying the include filters
            cmd.append('--exclude=*')

        cmd.append(self.auth.user + "@" + self.rooturl)
        cmd.append(outdir)
        for file in self.startrsync(cmd):
            updated.append(UriFile(os.path.join(outdir, file)))
        return updated
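Note the ordering of the filter rules: rsync applies include/exclude rules first-match-wins, so the --include patterns must precede the final --exclude=* catch-all, otherwise nothing would be transferred. For illustration, with check=True and includes=['*.nc'] the assembled argument list would look like this (user and rooturl are placeholders):

    ['rsync', '-avz', '--del', '--update',
     '--include=*.nc', '--exclude=*',
     'user@example.org:/data/', '/path/to/outdir']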
Code Example #19
File: ORAS5.py Project: whigg/geoslurp
    def register(self, rundir=None, pattern=r'.*\.nc$'):
        """register netcdf output files
        @param rundir: directory where the netcdf files reside
        @param pattern: regular expression which the netcdf files must obey (defaults to all files ending with .nc)"""
        if not rundir:
            raise RuntimeError("A directory/regex with output data needs to be supplied when registering this dataset")

        newfiles=self.retainnewUris([UriFile(file) for file in findFiles(rundir,pattern)])

        for uri in newfiles:
            meta=orasMetaExtractor(uri)
            if not meta:
                #don't register empty entries
                continue

            self.addEntry(meta)

        self._dbinvent.data["Description"]="ORAS5 output data table"
        self.setDataDir(os.path.abspath(rundir))
        self._dbinvent.data["grid"]="025"
        self.updateInvent()
Code Example #20
    def register(self):
        """ Register all downloaded fronts (in text files)"""

        slurplogger().info("Building file list..")
        files = [
            UriFile(file) for file in findFiles(self.cacheDir(), '.*txt',
                                                self._dbinvent.lastupdate)
        ]

        if len(files) == 0:
            slurplogger().info(
                "Orsifronts: No new files found since last update")
            return

        #possibly empty table
        self.truncateTable()

        #loop over files
        for uri in files:
            slurplogger().info("adding %s" % (uri.url))
            self.addEntry(orsiMetaExtractor(uri))

        self.updateInvent()
Code Example #21
    def register(self, pattern=None):
        if not pattern:
            pattern = r'.*\.gz'
        #create a list of files which need to be (re)registered
        if self.updated:
            files = self.updated
        else:
            files = [
                UriFile(file) for file in findFiles(self.dataDir(), pattern)
            ]

        #loop over files
        for uri in files:
            urilike = os.path.basename(uri.url)

            if not self.uriNeedsUpdate(urilike, uri.lastmod):
                continue

            meta = icgemMetaExtractor(uri)

            self.addEntry(meta)

        self.updateInvent()
Code Example #22
File: motuGridsBase.py Project: whigg/geoslurp
    def pull(self, name=None, wsne=None, tstart=None, tend=None):
        """Pulls a subset of a gridded dataset as netcdf from an motu enabled server
        This routine calls the internal routines of the motuclient python client
        :param name: Name of the  output datatset (file will be named 'name.nc')
        :param wsne: bounding box of the section of interest as [West,South,North,East]
        :param tstart: start date (as yyyy-mm-dd) for the extraction
        :param tend: end date (as yyyy-mm-dd) for the extraction
        """

        if not name:
            raise RuntimeError(
                "A name must be supplied to MotuGridsBase.pull !!")

        if not wsne or None in wsne:
            raise RuntimeError("Please supply a geographical bounding box")

        try:
            bbox = BtdBox(w=wsne[0],
                          n=wsne[2],
                          s=wsne[1],
                          e=wsne[3],
                          ts=tstart,
                          te=tend)
        except Exception:
            raise RuntimeError("Invalid bounding box provided to Duacs pull")

        cred = self.conf.authCred(self.authalias)
        ncout = os.path.join(self.dataDir(), name + ".nc")

        mOpts = MotuOpts(moturoot=self.moturoot,
                         service=self.motuservice,
                         product=self.motuproduct,
                         btdbox=bbox,
                         fout=ncout,
                         cache=self.cacheDir(),
                         variables=self.variables,
                         auth=cred)

        if bbox.isGMTCentered():
            #we need 2 downloads (one on each side of the 0-meridian) and a merging of the grids
            #split the bounding box in two
            bboxleft, bboxright = bbox.lonSplit(0.0)
            bboxleft.to0_360()
            bboxright.to0_360()

            ncoutleft = os.path.join(self.cacheDir(), name + "_left.nc")

            mOptsleft = copy.deepcopy(mOpts)
            mOptsleft.syncbtdbox(bboxleft)
            mOptsleft.syncfilename(ncoutleft)

            MotuRecleft = MotuRecursive(mOptsleft)
            urileft, updleft = MotuRecleft.download()

            ncoutright = os.path.join(self.cacheDir(), name + "_right.nc")
            mOptsright = copy.deepcopy(mOpts)
            mOptsright.syncbtdbox(bboxright)
            mOptsright.syncfilename(ncoutright)

            MotuRecright = MotuRecursive(mOptsright)
            uriright, updright = MotuRecright.download()

            if updleft or updright:
                #change the longitude representation to -180..0 (without reshuffling the data)
                ncSwapLongitude(urileft.url)
                #patch the files together
                uri, upd = stackNcFiles(ncout, urileft.url, uriright.url,
                                        'longitude')
            else:
                upd = False
                uri = UriFile(ncout)
        else:
            #we can handle this by a single recursive motu instance

            MotuRec = MotuRecursive(mOpts)
            uri, upd = MotuRec.download()

        if upd:
            self.updated.append(uri)
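The isGMTCentered branch works around servers that expect 0..360 longitudes: a -180..180 box is split at the 0-meridian, each half is mapped into the 0..360 convention, downloaded separately, and merged along the longitude dimension. A self-contained sketch of that split (an illustrative simplification, not the actual BtdBox.lonSplit/to0_360 implementation):

    def lon_split_to_0_360(west, east):
        """Return the one or two (west, east) requests in 0..360 longitudes."""
        if west >= 0:
            return [(west, east)]  #already entirely within 0..360
        #the western half wraps around: e.g. -10 maps to 350
        return [(west % 360.0, 360.0), (0.0, east)]

    print(lon_split_to_0_360(-10.0, 20.0))  #[(350.0, 360.0), (0.0, 20.0)]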