Example 1
    def __init__(self, dir, recurse=True, archive=False):
        ''' Iterator for metadata crawling

            @type  dir: C{str}
            @param dir: The directory to start the metadata crawl.
        '''

        format_regex = formats.format_regex
        dir = utilities.uncpath(utilities.realpath(utilities.normcase(utilities.encode(dir))))
        #Build a dict of matching files and regexes then sort according to the priority of the regex formats
        fileformats = {}
        for f in utilities.rglob(dir, '|'.join(format_regex), True, re.IGNORECASE, recurse=recurse, archive=archive):
            #Don't return existing overviews
            if f[-7:] in ('qlk.jpg', 'thm.jpg'): continue
            #Use utf-8 encoding to fix Issue 20
            if f[:4] == '/vsi': f = utilities.encode(f)
            else: f = utilities.realpath(utilities.normcase(utilities.encode(f)))
            for r in format_regex: #This is so we always return _default_ format datasets last.
                if re.search(r,os.path.basename(f),re.IGNORECASE):
                    if r in fileformats: fileformats[r].append(f)
                    else: fileformats[r] = [f]
                    break
        files = []
        for r in format_regex:
            if r in fileformats: files.extend(fileformats[r])

        #Class vars
        self.errors = []  #List of files that couldn't be opened; each entry is a (filename, error info, debug info) tuple
        self.files = files
        self.file = ''
        self.filecount = len(self.files)
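
Example 4 below shows how this iterator is consumed; as a quick reference, here is a minimal usage sketch. The directory path is a placeholder, and the shape of the error tuples follows the comment on self.errors above:

import crawler

c = crawler.Crawler('/data/imagery', recurse=True, archive=False)  # placeholder path
print('Found %s files' % c.filecount)
for ds in c:         # yields one dataset object per matched file, default formats last
    print(c.file)    # path of the file currently being processed
for f, err, debug in c.errors:  # files that couldn't be opened
    print('%s failed: %s' % (f, err))
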
Example 2
    def __init__(self, dir, recurse=True, archive=False, excludes=[]):
        ''' Iterator for metadata crawling

            @type  dir: C{str}
            @param dir: The directory to start the metadata crawl.
            @type    recurse: C{bool}
            @param   recurse: Recurse into subdirectories?
            @type    archive: C{bool}
            @param   archive: Look in zip/gzip archives
            @type    excludes: C{list}
            @param   excludes: List of glob style file/directory exclusion pattern/s
        '''

        #Class vars
        self.errors = []  #List of files that couldn't be opened; each entry is a (filename, error info, debug info) tuple

        format_regex = formats.format_regex
        dir = utilities.uncpath(
            utilities.realpath(utilities.normcase(utilities.encode(dir))))
        #Build a dict of matching files and regexes then sort according to the priority of the regex formats
        fileformats = {}
        for f in utilities.rglob(dir,
                                 '|'.join(format_regex),
                                 True,
                                 re.IGNORECASE,
                                 recurse=recurse,
                                 archive=archive,
                                 excludes=excludes,
                                 onerror=self.onerror,
                                 followlinks=False):
            #Don't return existing overviews
            if f[-7:] in ('qlk.jpg', 'thm.jpg'): continue
            #Use utf-8 encoding to fix Issue 20
            if f[:4] == '/vsi': f = utilities.encode(f)
            else:
                f = utilities.realpath(utilities.normcase(utilities.encode(f)))
            for r in format_regex:  #This is so we always return _default_ format datasets last.
                if re.search(r, os.path.basename(f), re.IGNORECASE):
                    if r in fileformats: fileformats[r].append(f)
                    else: fileformats[r] = [f]
                    break
        files = []
        for r in format_regex:
            if r in fileformats: files.extend(fileformats[r])

        #Class vars
        self.files = files
        self.file = ''
        self.filecount = len(self.files)
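
Compared with Example 1, this version threads excludes, an onerror handler, and followlinks=False through to utilities.rglob. A construction sketch; the exclusion patterns here are illustrative, not taken from the source:

import crawler

# Illustrative glob-style exclusions: skip .bak files and anything starting with tmp.
c = crawler.Crawler('/data/imagery', recurse=True, archive=True,
                    excludes=['*.bak', 'tmp*'])
print('Found %s files' % c.filecount)
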
Example 3
    def __init__(self, dir, recurse=True, archive=False):
        ''' Iterator for metadata crawling

            @type  dir: C{str}
            @param dir: The directory to start the metadata crawl.
        '''

        format_regex = formats.format_regex
        dir = utilities.uncpath(
            utilities.realpath(utilities.normcase(utilities.encode(dir))))
        #Build a dict of matching files and regexes then sort according to the priority of the regex formats
        fileformats = {}
        for f in utilities.rglob(dir,
                                 '|'.join(format_regex),
                                 True,
                                 re.IGNORECASE,
                                 recurse=recurse,
                                 archive=archive):
            #Use utf-8 encoding to fix Issue 20
            if f[:4] == '/vsi': f = utilities.encode(f)
            else:
                f = utilities.realpath(utilities.normcase(utilities.encode(f)))
            for r in format_regex:  #This is so we always return _default_ format datasets last.
                if re.search(r, os.path.basename(f), re.IGNORECASE):
                    if r in fileformats: fileformats[r].append(f)
                    else: fileformats[r] = [f]
                    break
        files = []
        for r in format_regex:
            if r in fileformats: files.extend(fileformats[r])

        #Class vars
        self.errors = []  #List of files that couldn't be opened; each entry is a (filename, error info, debug info) tuple
        self.files = files
        self.file = ''
        self.filecount = len(self.files)
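
All three constructors rely on the same ordering idiom: each file is bucketed under the first format regex it matches, and the buckets are then flattened in format_regex priority order so that default-format datasets come last. A self-contained sketch of that idiom, with toy regexes and file names rather than the real formats.format_regex:

import os
import re

format_regex = [r'\.dim$', r'\.xml$', r'\.tif{1,2}$']  # toy priority list; the last entry plays the "default" role
found = ['a.dim', 'b.tif', 'c.xml', 'd.tiff']           # stand-in for the rglob results

fileformats = {}
for f in found:
    for r in format_regex:          # first matching regex wins, so each file lands in exactly one bucket
        if re.search(r, os.path.basename(f), re.IGNORECASE):
            fileformats.setdefault(r, []).append(f)
            break

files = []
for r in format_regex:              # flatten the buckets in priority order
    files.extend(fileformats.get(r, []))
print(files)                        # ['a.dim', 'c.xml', 'b.tif', 'd.tiff']
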
Example 4
def main(dir, xlsx, logger, mediaid=None, update=False, getovs=False, recurse=False, archive=False):

    """ Run the Metadata Crawler

        @type  dir:    C{str}
        @param dir:    The directory to start the metadata crawl.
        @type  xlsx:    C{str}
        @param xlsx:    Excel spreadsheet to write metadata to
        @type  logger: C{progresslogger.ProgressLogger}
        @param logger: Use an already instantiated logger
        @type  mediaid: C{str}
        @param mediaid: CD/DVD media ID
        @type  update: C{bool}
        @param update: Update an existing crawl?
        @type  getovs: C{bool}
        @param getovs: Generate overview (quicklook/thumbnail) images
        @type  recurse: C{bool}
        @param recurse: Search directory recursively?
        @type  archive: C{bool}
        @param archive: Search compressed archives (tar/zip)?
        @return:  C{progresslogger.ProgressLogger}
    """

    shp=xlsx.replace('.xlsx','.shp')

    format_regex  = formats.format_regex
    format_fields = formats.fields

    logger.debug(' '.join(sys.argv))

    #raise Exception
    #ExcelWriter=utilities.ExcelWriter(xlsx,format_fields.keys(),update=update)
    with utilities.ExcelWriter(xlsx,format_fields.keys(),update=update) as ExcelWriter:
        try:
            #Are we updating an existing crawl?
            records={}
            if update and os.path.exists(xlsx):

                #Do we need to recreate the shapefile?
                if os.path.exists(shp):
                    ShapeWriter=False
                else:
                    logger.info('%s does not exist, it will be recreated...'%shp)
                    ShapeWriter=geometry.ShapeWriter(shp,format_fields,update=False)

                #Build a dict of existing records
                row=-1
                #with utilities.ExcelReader(xlsx) as ExcelReader: #Using a context manager ensures closure before writing
                for row,rec in enumerate(utilities.ExcelReader(xlsx)):
                    #Check if the dataset still exists, mark it DELETED if it doesn't
                    if os.path.exists(rec['filepath']) or rec['mediaid'] !='' or \
                       (rec['filepath'][0:4]=='/vsi' and utilities.compressed_file_exists(rec['filepath'],False)):
                        if ShapeWriter:
                            ext=[rec['UL'].split(','),rec['UR'].split(','),rec['LR'].split(','),rec['LL'].split(',')]
                            ShapeWriter.WriteRecord(ext,rec)
                        #Kludge to ensure backwards compatibility with previously generated guids
                        #records[rec['guid']]=rec
                        records[utilities.uuid(rec['filepath'])]=(row,rec)
                    else:
                        if rec.get('DELETED',0) not in [1,'1']:
                            rec['DELETED']=1
                            ExcelWriter.UpdateRecord(rec,row)
                            logger.info('Marked %s as deleted' % (rec['filepath']))
                if row==-1:logger.info('Output spreadsheet is empty, no records to update')
                ExcelWriter.save()
                del ShapeWriter
            ShapeWriter=geometry.ShapeWriter(shp,format_fields,update=update)

        except Exception as err:
            logger.error('%s' % utilities.ExceptionInfo())
            logger.debug(utilities.ExceptionInfo(10))
            #sys.exit(1)
            return

        logger.info('Searching for files...')
        now=time.time()
        Crawler=crawler.Crawler(dir,recurse=recurse,archive=archive)
        logger.info('Found %s files...'%Crawler.filecount)

        #Loop thru dataset objects returned by Crawler
        for ds in Crawler:
            try:
                logger.debug('Attempting to open %s'%Crawler.file)
                fi=ds.fileinfo
                fi['filepath']=utilities.uncpath(fi['filepath'])
                fi['filelist']='|'.join(utilities.uncpath(ds.filelist))
                #qlk=utilities.uncpath(os.path.join(os.path.dirname(xlsx),'%s.%s.qlk.jpg'%(fi['filename'],fi['guid'])))
                #thm=utilities.uncpath(os.path.join(os.path.dirname(xlsx),'%s.%s.thm.jpg'%(fi['filename'],fi['guid'])))
                qlk=os.path.join(os.path.dirname(xlsx),'%s.%s.qlk.jpg'%(fi['filename'],fi['guid']))
                thm=os.path.join(os.path.dirname(xlsx),'%s.%s.thm.jpg'%(fi['filename'],fi['guid']))

                if update and ds.guid in records:
                    row,rec=records[ds.guid]
                    #Issue 35: if it's not modified, but we've asked for overview images and it doesn't already have them....
                    if ismodified(rec,fi,os.path.dirname(xlsx)) or (not rec['quicklook'] and getovs):
                        md=ds.metadata
                        geom=ds.extent
                        md.update(fi)
                        logger.info('Updated metadata for %s, %s files remaining' % (Crawler.file,len(Crawler.files)))
                        try:
                            if rec['quicklook'] and os.path.exists(rec['quicklook']):getovs=False #Don't update overview
                            if getovs:
                                qlk=ds.getoverview(qlk, width=800)
                                #We don't need to regenerate it, just resize it
                                #thm=ds.getoverview(thm, width=150)
                                thm=overviews.resize(qlk,thm,width=150)
                                md['quicklook']=os.path.basename(qlk)
                                md['thumbnail']=os.path.basename(thm)
                                #md['quicklook']=utilities.uncpath(qlk)
                                #md['thumbnail']=utilities.uncpath(thm)
                                logger.info('Updated overviews for %s' % Crawler.file)
                        except Exception as err:
                            logger.error('%s\n%s' % (Crawler.file, utilities.ExceptionInfo()))
                            logger.debug(utilities.ExceptionInfo(10))
                        try:
                            ExcelWriter.UpdateRecord(md,row)
                        except Exception as err:
                            logger.error('%s\n%s' % (Crawler.file, utilities.ExceptionInfo()))
                            logger.debug(utilities.ExceptionInfo(10))
                        try:
                            ShapeWriter.UpdateRecord(geom,md,'guid="%s"'%rec['guid'])
                        except Exception as err:
                            logger.error('%s\n%s' % (Crawler.file, utilities.ExceptionInfo()))
                            logger.debug(utilities.ExceptionInfo(10))
                    else:
                        logger.info('Metadata did not need updating for %s, %s files remaining' % (Crawler.file,len(Crawler.files)))
                        continue
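
For reference, a hedged sketch of calling this entry point. The ProgressLogger constructor call and both paths are assumptions for illustration, not taken from the source:

import progresslogger

logger = progresslogger.ProgressLogger()  # assumed default constructor; the real signature may differ
main('/data/imagery',       # placeholder crawl root
     '/data/crawl.xlsx',    # metadata spreadsheet; a .shp with the same stem is also written
     logger,
     update=True,           # refresh an existing crawl instead of starting fresh
     getovs=True,           # also generate quicklook/thumbnail overviews
     recurse=True,
     archive=False)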