def __init__(self, dir, recurse=True, archive=False):
    ''' Iterator for metadata crawling

        @type  dir: C{str}
        @param dir: The directory to start the metadata crawl.
        @type  recurse: C{bool}
        @param recurse: Recurse into subdirectories?
        @type  archive: C{bool}
        @param archive: Look in zip/gzip archives
    '''
    format_regex = formats.format_regex
    #Normalise the start directory: encode to utf-8, normalise case, resolve links, map to UNC form
    dir = utilities.uncpath(utilities.realpath(utilities.normcase(utilities.encode(dir))))
    #Build a dict of matching files and regexes then sort according to the priority of the regex formats
    fileformats = {}
    for f in utilities.rglob(dir, '|'.join(format_regex), True, re.IGNORECASE,
                             recurse=recurse, archive=archive):
        #Don't return existing overviews
        if f[-7:] in ('qlk.jpg', 'thm.jpg'):
            continue
        #Use utf-8 encoding to fix Issue 20
        if f[:4] == '/vsi':
            f = utilities.encode(f)
        else:
            f = utilities.realpath(utilities.normcase(utilities.encode(f)))
        for r in format_regex:
            #This is so we always return _default_ format datasets last.
            if re.search(r, os.path.basename(f), re.IGNORECASE):
                #setdefault replaces the deprecated has_key() test-then-insert
                #(dict.has_key was removed in Python 3; this form works in both)
                fileformats.setdefault(r, []).append(f)
                break
    #Flatten the per-format lists in format priority order
    files = []
    for r in format_regex:
        if r in fileformats:
            files.extend(fileformats[r])

    #Class vars
    self.errors = []  #A list of files that couldn't be opened. Contains a tuple with file name, error info, debug info
    self.files = files
    self.file = ''
    self.filecount = len(self.files)
def __init__(self, dir, recurse=True, archive=False, excludes=None):
    ''' Iterator for metadata crawling

        @type  dir: C{str}
        @param dir: The directory to start the metadata crawl.
        @type  recurse: C{bool}
        @param recurse: Recurse into subdirectories?
        @type  archive: C{bool}
        @param archive: Look in zip/gzip archives
        @type  excludes: C{list}
        @param excludes: List of glob style file/directory exclusion pattern/s
    '''
    #None sentinel avoids the shared mutable-default-argument pitfall
    #(the previous default of [] was shared across every call)
    if excludes is None:
        excludes = []

    #Class vars
    self.errors = []  #A list of files that couldn't be opened. Contains a tuple with file name, error info, debug info

    format_regex = formats.format_regex
    #Normalise the start directory: encode to utf-8, normalise case, resolve links, map to UNC form
    dir = utilities.uncpath(utilities.realpath(utilities.normcase(utilities.encode(dir))))
    #Build a dict of matching files and regexes then sort according to the priority of the regex formats
    fileformats = {}
    for f in utilities.rglob(dir, '|'.join(format_regex), True, re.IGNORECASE,
                             recurse=recurse, archive=archive, excludes=excludes,
                             onerror=self.onerror, followlinks=False):
        #Don't return existing overviews
        if f[-7:] in ('qlk.jpg', 'thm.jpg'):
            continue
        #Use utf-8 encoding to fix Issue 20
        if f[:4] == '/vsi':
            f = utilities.encode(f)
        else:
            f = utilities.realpath(utilities.normcase(utilities.encode(f)))
        for r in format_regex:
            #This is so we always return _default_ format datasets last.
            if re.search(r, os.path.basename(f), re.IGNORECASE):
                #setdefault replaces the deprecated has_key() test-then-insert
                #(dict.has_key was removed in Python 3; this form works in both)
                fileformats.setdefault(r, []).append(f)
                break
    #Flatten the per-format lists in format priority order
    files = []
    for r in format_regex:
        if r in fileformats:
            files.extend(fileformats[r])

    #Class vars
    self.files = files
    self.file = ''
    self.filecount = len(self.files)
def __init__(self, dir, recurse=True, archive=False):
    ''' Iterator for metadata crawling

        @type  dir: C{str}
        @param dir: The directory to start the metadata crawl.
        @type  recurse: C{bool}
        @param recurse: Recurse into subdirectories?
        @type  archive: C{bool}
        @param archive: Look in zip/gzip archives
    '''
    format_regex = formats.format_regex
    #Normalise the start directory: encode to utf-8, normalise case, resolve links, map to UNC form
    dir = utilities.uncpath(utilities.realpath(utilities.normcase(utilities.encode(dir))))
    #Build a dict of matching files and regexes then sort according to the priority of the regex formats
    fileformats = {}
    for f in utilities.rglob(dir, '|'.join(format_regex), True, re.IGNORECASE,
                             recurse=recurse, archive=archive):
        #Use utf-8 encoding to fix Issue 20
        if f[:4] == '/vsi':
            f = utilities.encode(f)
        else:
            f = utilities.realpath(utilities.normcase(utilities.encode(f)))
        for r in format_regex:
            #This is so we always return _default_ format datasets last.
            if re.search(r, os.path.basename(f), re.IGNORECASE):
                #setdefault replaces the deprecated has_key() test-then-insert
                #(dict.has_key was removed in Python 3; this form works in both)
                fileformats.setdefault(r, []).append(f)
                break
    #Flatten the per-format lists in format priority order
    files = []
    for r in format_regex:
        if r in fileformats:
            files.extend(fileformats[r])

    #Class vars
    self.errors = []  #A list of files that couldn't be opened. Contains a tuple with file name, error info, debug info
    self.files = files
    self.file = ''
    self.filecount = len(self.files)
def main(dir, xlsx, logger, mediaid=None, update=False, getovs=False, recurse=False, archive=False): """ Run the Metadata Crawler @type dir: C{str} @param dir: The directory to start the metadata crawl. @type xlsx: C{str} @param xlsx: Excel spreadsheet to write metadata to @type logger: C{progresslogger.ProgressLogger} @param logger: Use an already instantiated logger @type mediaid:C{str} @param mediaid:CD/DVD media ID @type getovs: C{boolean} @param getovs: Generate overview (quicklook/thumbnail) images @type recurse: C{boolean} @param recurse: Search directory recursively? @type archive: C{boolean} @param archive: Search compressed archives (tar/zip)? @return: C{progresslogger.ProgressLogger} """ shp=xlsx.replace('.xlsx','.shp') format_regex = formats.format_regex format_fields = formats.fields logger.debug(' '.join(sys.argv)) #raise Exception #ExcelWriter=utilities.ExcelWriter(xlsx,format_fields.keys(),update=update) with utilities.ExcelWriter(xlsx,format_fields.keys(),update=update) as ExcelWriter: try: #Are we updating an existing crawl? records={} if update and os.path.exists(xlsx): #Do we need to recreate the shapefile? 
if os.path.exists(shp): ShapeWriter=False else: logger.info('%s does not exist, it will be recreated...'%shp) ShapeWriter=geometry.ShapeWriter(shp,format_fields,update=False) #Build a dict of existing records row=-1 #with utilities.ExcelReader(xlsx) as ExcelReader: #Using a context manager ensures closure before writing for row,rec in enumerate(utilities.ExcelReader(xlsx)): #Check if the dataset still exists, mark it DELETED if it doesn't if os.path.exists(rec['filepath']) or rec['mediaid'] !='' or \ (rec['filepath'][0:4]=='/vsi' and utilities.compressed_file_exists(rec['filepath'],False)): if ShapeWriter: ext=[rec['UL'].split(','),rec['UR'].split(','),rec['LR'].split(','),rec['LL'].split(',')] ShapeWriter.WriteRecord(ext,rec) #Kludge to ensure backwards compatibility with previously generated guids #records[rec['guid']]=rec records[utilities.uuid(rec['filepath'])]=(row,rec) else: if rec.get('DELETED',0)not in [1,'1']: rec['DELETED']=1 ExcelWriter.UpdateRecord(rec,row) logger.info('Marked %s as deleted' % (rec['filepath'])) if row==-1:logger.info('Output spreadsheet is empty, no records to update') ExcelWriter.save() del ShapeWriter ShapeWriter=geometry.ShapeWriter(shp,format_fields,update=update) except Exception,err: logger.error('%s' % utilities.ExceptionInfo()) logger.debug(utilities.ExceptionInfo(10)) #sys.exit(1) return logger.info('Searching for files...') now=time.time() Crawler=crawler.Crawler(dir,recurse=recurse,archive=archive) logger.info('Found %s files...'%Crawler.filecount) #Loop thru dataset objects returned by Crawler for ds in Crawler: try: logger.debug('Attempting to open %s'%Crawler.file) fi=ds.fileinfo fi['filepath']=utilities.uncpath(fi['filepath']) fi['filelist']='|'.join(utilities.uncpath(ds.filelist)) #qlk=utilities.uncpath(os.path.join(os.path.dirname(xlsx),'%s.%s.qlk.jpg'%(fi['filename'],fi['guid']))) #thm=utilities.uncpath(os.path.join(os.path.dirname(xlsx),'%s.%s.thm.jpg'%(fi['filename'],fi['guid']))) 
qlk=os.path.join(os.path.dirname(xlsx),'%s.%s.qlk.jpg'%(fi['filename'],fi['guid'])) thm=os.path.join(os.path.dirname(xlsx),'%s.%s.thm.jpg'%(fi['filename'],fi['guid'])) if update and ds.guid in records: row,rec=records[ds.guid] #Issue 35: if it's not modified, but we've asked for overview images and it doesn't already have them.... if ismodified(rec,fi,os.path.dirname(xlsx)) or (not rec['quicklook'] and getovs): md=ds.metadata geom=ds.extent md.update(fi) logger.info('Updated metadata for %s, %s files remaining' % (Crawler.file,len(Crawler.files))) try: if rec['quicklook'] and os.path.exists(rec['quicklook']):getovs=False #Don't update overview if getovs: qlk=ds.getoverview(qlk, width=800) #We don't need to regenerate it, just resize it #thm=ds.getoverview(thm, width=150) thm=overviews.resize(qlk,thm,width=150) md['quicklook']=os.path.basename(qlk) md['thumbnail']=os.path.basename(thm) #md['quicklook']=utilities.uncpath(qlk) #md['thumbnail']=utilities.uncpath(thm) logger.info('Updated overviews for %s' % Crawler.file) except Exception,err: logger.error('%s\n%s' % (Crawler.file, utilities.ExceptionInfo())) logger.debug(utilities.ExceptionInfo(10)) try: ExcelWriter.UpdateRecord(md,row) except Exception,err: logger.error('%s\n%s' % (Crawler.file, utilities.ExceptionInfo())) logger.debug(utilities.ExceptionInfo(10)) try: ShapeWriter.UpdateRecord(geom,md,'guid="%s"'%rec['guid']) except Exception,err: logger.error('%s\n%s' % (Crawler.file, utilities.ExceptionInfo())) logger.debug(utilities.ExceptionInfo(10)) else: logger.info('Metadata did not need updating for %s, %s files remaining' % (Crawler.file,len(Crawler.files))) continue