def main():
    from optparse import OptionParser
    parser = OptionParser(option_class=magicdate.MagicDateOption,
                          usage='%prog [options]',
                          version='%prog ' + __version__)
    parser.add_option('-s', '--start-date', type='magicdate', default=None,
                      help='Force a start time (magicdate)')
    parser.add_option('-e', '--end-date', type='magicdate',
                      default=magicdate.magicdate('today'),
                      help='Force an end time (magicdate) [default: %default]')
    parser.add_option('-l', '--log-format', default='%Y%m%d.log',
                      help='datetime strftime format string used to build the '
                           'log file names to search for [default: %default]')
    (options, args) = parser.parse_args()
    assert options.start_date is not None
    for d in date_generator(options.start_date, options.end_date):
        filename = d.strftime(options.log_format)
        try:
            os.stat(filename)
        except OSError:
            print 'missing', filename
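# date_generator is referenced above but not defined in this snippet; a
# minimal sketch of what it presumably does (hypothetical helper -- the
# real implementation may differ): yield each date from start through end,
# one day at a time, inclusive of both endpoints.
import datetime

def date_generator(start, end):
    d = start
    one_day = datetime.timedelta(days=1)
    while d <= end:
        yield d
        d += one_day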
def reports_for_date(date_string):
    try:
        date_object = magicdate(date_string)
        output_list = parser.earnings_reports_on_date(date_object)
        return jsonify(earnings_reports=output_list)
    except Exception:
        print('Could not parse date')
        # The original returned the key 'earning_events' here, which did not
        # match the success branch; use the same key for both outcomes.
        return jsonify(earnings_reports=None)
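# jsonify implies a Flask context; a minimal sketch of how this helper
# might be exposed as an endpoint (the app object and route are
# assumptions, not taken from the original source):
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/earnings/<date_string>')
def earnings(date_string):
    # magicdate accepts strings like 'yesterday' or '2014-01-15'.
    return reports_for_date(date_string)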
def do_once(self):
    cx = self.cx
    cu = self.cu
    v = self.verbose
    # Always work in UTC.  magicdate returns local time, so shift by the
    # local timezone offset.
    tzoffset = datetime.timedelta(seconds=time.timezone)

    if self.track_start is not None:
        track_start = mark_utc(magicdate.magicdate(self.track_start) + tzoffset)
        if v:
            cu.execute('SELECT COUNT(*) FROM track_lines WHERE update_timestamp < %s;',
                       (track_start,))
            print 'Deleting from track_lines:', cu.fetchone()[0]
        cu.execute('DELETE FROM track_lines WHERE update_timestamp < %s;',
                   (track_start,))

        # If the points are too old to be in a track_line, then delete them.
        if v:
            cu.execute('SELECT COUNT(*) FROM position WHERE cg_timestamp < %s;',
                       (track_start,))
            print 'Deleting from position:', cu.fetchone()[0]
        cu.execute('DELETE FROM position WHERE cg_timestamp < %s;',
                   (track_start,))

    if self.last_position_start is not None:
        last_position_start = mark_utc(
            magicdate.magicdate(self.last_position_start) + tzoffset)
        if v:
            # The original counted against track_start here, which looks
            # like a copy-paste slip and raises NameError when track_start
            # is unset; count with the same bound the DELETE below uses.
            cu.execute('SELECT COUNT(*) FROM last_position WHERE cg_timestamp < %s;',
                       (last_position_start,))
            print 'Deleting from last_position:', cu.fetchone()[0]
        cu.execute('DELETE FROM last_position WHERE cg_timestamp < %s;',
                   (last_position_start,))

    if self.last_position_start is not None or self.track_start is not None:
        cx.commit()
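# mark_utc is referenced above but not defined in this snippet; a minimal
# sketch under the assumption that it simply tags a naive datetime as UTC
# without shifting the clock value (hypothetical helper -- the real
# implementation may differ):
import datetime

class _UTC(datetime.tzinfo):
    def utcoffset(self, dt):
        return datetime.timedelta(0)
    def tzname(self, dt):
        return 'UTC'
    def dst(self, dt):
        return datetime.timedelta(0)

def mark_utc(dt):
    return dt.replace(tzinfo=_UTC())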
def get_container_by_channel(slug, number=10, depth=1,
                             include_children=True, **kwargs):
    box = None
    magic_date = kwargs.pop('magic_date', False)
    date = timezone.now()
    if magic_date:
        try:
            date = magicdate(magic_date)
        except Exception:
            pass

    if include_children:
        try:
            kwargs['channel_long_slug__in'] = cache.get(
                'get_container_by_channel-{}'.format(slug))
            if not kwargs['channel_long_slug__in']:
                base_channel = Channel.objects.get(long_slug=slug)
                kwargs['channel_long_slug__in'] = [base_channel.long_slug]

                def _append_recursively(channel, current_level=0):
                    # Depth test: stop once the requested depth is reached.
                    if current_level >= depth:
                        return
                    current_level += 1
                    for child in channel.get_children():
                        kwargs['channel_long_slug__in'].append(child.long_slug)
                        # Recursion
                        _channel = Channel.objects.get(long_slug=child.long_slug)
                        _append_recursively(_channel, current_level)

                _append_recursively(base_channel)
                cache.set('get_container_by_channel-{}'.format(slug),
                          kwargs['channel_long_slug__in'],
                          settings.OPPS_CACHE_EXPIRE)
        except Channel.DoesNotExist:
            kwargs['channel_long_slug__in'] = []

    try:
        kwargs['site'] = settings.SITE_ID
        if settings.OPPS_CONTAINERS_SITE_ID:
            kwargs['site'] = settings.OPPS_CONTAINERS_SITE_ID
        kwargs['show_on_root_channel'] = include_children
        kwargs['date_available__lte'] = date
        kwargs['published'] = True
        box = Container.objects.distinct().filter(
            **kwargs).order_by('-date_available')[:number]
    except Exception:
        pass
    return box
def do_process(self, input_value):
    if isinstance(input_value, tuple):
        input_value = input_value[0]
    try:
        return (int(input_value),)
    except ValueError:
        try:
            # Fall back to natural-language parsing; use a local name that
            # does not shadow the datetime module.
            dt = magicdate.magicdate(input_value)
            return (dt.year,)
        except Exception:
            return None
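# A quick sketch of the fallback behaviour (assumes magicdate is installed;
# the enclosing class is elided, so the same logic is shown as a free
# function for illustration):
import magicdate

def _year_of(value):
    try:
        return (int(value),)
    except ValueError:
        try:
            return (magicdate.magicdate(value).year,)
        except Exception:
            return None

print _year_of('2008')           # (2008,)  -- plain integer parse wins
print _year_of('last tuesday')   # (<current year>,) via magicdate
print _year_of('no date here')   # None -- both parses failed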
def get_container_by_channel(slug, number=10, depth=1,
                             include_children=True, **kwargs):
    box = None
    magic_date = kwargs.pop('magic_date', False)
    date = timezone.now()
    if magic_date:
        try:
            date = magicdate(magic_date)
        except Exception:
            pass

    # __in split treatment: turn comma-separated strings into lists.
    splited = dict([(key, value.split(','))
                    for key, value in kwargs.items()
                    if key.endswith('__in') and type(value) is not list])
    kwargs.update(splited)

    if include_children:
        k = 'channel_id__in'
        kwargs[k] = cache.get('get_container_by_channel-{0}'.format(slug))
        if not kwargs[k]:
            try:
                channel = Channel.objects.get(long_slug=slug)
                qs = channel.get_descendants(include_self=True)
                qs = qs.filter(level__lte=channel.level + depth)
                kwargs[k] = qs.values_list("id", flat=True)
                cache.set('get_container_by_channel-{0}'.format(slug),
                          kwargs[k], settings.OPPS_CACHE_EXPIRE)
            except Channel.DoesNotExist:
                kwargs[k] = []

    try:
        kwargs['site'] = settings.SITE_ID
        if settings.OPPS_CONTAINERS_SITE_ID:
            kwargs['site'] = settings.OPPS_CONTAINERS_SITE_ID
        kwargs['show_on_root_channel'] = include_children
        kwargs['date_available__lte'] = date
        kwargs['published'] = True
        box = Container.objects.distinct().filter(
            **kwargs).order_by('-date_available')[:number]
    except Exception:
        pass
    return box
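# A minimal usage sketch (the slug is hypothetical -- the Opps project this
# comes from defines the real channels):
#
#   latest = get_container_by_channel('home', number=5,
#                                     magic_date='yesterday')
#
# magic_date lets callers ask "what was published as of <phrase>"; any
# string magicdate understands ('yesterday', '2 weeks ago', ...) works,
# and unparseable values silently fall back to timezone.now().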
def guess(s, parse=True, is_gmt=False, set_gmt=False,
          try_iso=True, try_num=True, try_en=True):
    """Guess the format, and optionally parse, the input string.

    If 'is_gmt' is True, assume timezone is GMT when not given.
    Otherwise, assume localtime.  If 'set_gmt' is True then set the
    timezone to GMT, otherwise set it to localtime.

    The answer is a pair containing the guessed format and, if the
    'parse' flag was given, the parsed value as seconds since the epoch,
    otherwise None.  The format is a constant defined in this module:
        UNKNOWN - Cannot guess the format (associated value is None)
        ISO8601 - A prefix of the ISO8601 format accepted by completeISO()
        ENGLISH - A natural English-language format accepted by makeISO()
        SECONDS - Seconds since the UNIX epoch (midnight on 1970/1/1).
    """
    if not s:
        return UNKNOWN, None
    sec = None
    s = s.strip()
    # try ISO8601
    if try_iso:
        m = ISO_DATE_PARTS.match(s)
        if m and m.start() == 0 and m.end() == len(s):
            if parse:
                if s[-1] == 'Z':
                    # explicit timezone overrides option
                    is_gmt = True
                iso_s = completeISO(s, is_gmt=is_gmt, set_gmt=set_gmt)
                sec = parseISO(iso_s)
            return ISO8601, sec
    # try number
    if try_num:
        m = NUMBER_DATE.match(s)
        if m and m.start() == 0 and m.end() == len(s):
            if parse:
                sec = float(s)
            return SECONDS, sec
    # try natural language
    if try_en:
        try:
            d = magicdate.magicdate(s)
        except Exception, E:
            d = None
        if d is not None:
            if parse:
                partial_iso = d.isoformat()
                iso = completeISO(partial_iso, is_gmt=False, set_gmt=set_gmt)
                sec = parseISO(iso)
            return ENGLISH, sec
    # Nothing matched; honour the documented pair contract.
    return UNKNOWN, None
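# Example calls (assumes the module-level names referenced above --
# ISO_DATE_PARTS, NUMBER_DATE, completeISO, parseISO and the format
# constants -- are defined in this module):
#
#   guess('2010-06-01T12:00:00Z')  -> (ISO8601, <epoch seconds>)
#   guess('1275393600')            -> (SECONDS, 1275393600.0)
#   guess('next friday')           -> (ENGLISH, <epoch seconds>)
#   guess('gibberish!!')           -> (UNKNOWN, None)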
def makeISO(value, is_gmt=False, set_gmt=False):
    """If value is a tuple, assume it is the one returned by
    time.gmtime() or time.localtime().  Otherwise, assume value is an
    English-language description (for partial ISO strings, use
    completeISO() instead).

    Return an ISO8601 string, with timezone set to GMT or localtime.
    """
    tz_str = 'Z'  # assume GMT
    if isinstance(value, (tuple, list)):
        fmt = ('%04d', '-%02d', '-%02d', 'T%02d', ':%02d', ':%02d')
        s = ''.join([f % v for f, v in zip(fmt, value)])
        if not is_gmt:  # the original tested an undefined name 'gmt'
            tz_str = getLocaltimeISO(value)
        iso = s + tz_str
    else:
        try:
            d = magicdate.magicdate(value)
        except Exception, E:
            raise ValueError("magicdate cannot parse '%s'" % value)
        partial_iso = d.isoformat()
        iso = completeISO(partial_iso, is_gmt=is_gmt, set_gmt=set_gmt)
    return iso  # the original built iso but never returned it
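# Example calls (getLocaltimeISO and completeISO are assumed to be defined
# elsewhere in this module):
#
#   import time
#   makeISO(time.gmtime(), is_gmt=True)   # e.g. '2010-06-01T12:00:00Z'
#   makeISO('tomorrow noon')              # via magicdate, local timezone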
from boto.s3.key import Key
import magicdate
import gzip
import json
import glob
import logging
from filechunkio import FileChunkIO
import math
import os
import shutil
from pprint import pprint
import subprocess

print "Daily file tidy up.  Pre S3 push"

# We are bundling the previous day's data.
yesterday = magicdate.magicdate('yesterday')

dir_file = gzip.open("/home/TfL_feeds/directory_data.json.gz")
dir_json = json.load(dir_file)
os.chdir(dir_json['home_directory'] + "/data/")

# Define folder name within local directory.
folder_name = str(yesterday)
message = "Tarring and compressing " + str(folder_name)
print message
folder_exists = os.path.isdir(folder_name)
folder = (folder_name + ".tar.gz")
print "Working on", str(folder)
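# The imports above (boto, FileChunkIO, math) suggest the tarball is pushed
# to S3 in parts; a minimal sketch of that pattern with boto 2 (the bucket
# name and credentials here are assumptions, not from the original script):
import boto

conn = boto.connect_s3()  # reads credentials from the environment
bucket = conn.get_bucket('tfl-feeds-archive')  # hypothetical bucket name

source_size = os.stat(folder).st_size
chunk_size = 52428800  # 50 MB parts
chunk_count = int(math.ceil(source_size / float(chunk_size)))

mp = bucket.initiate_multipart_upload(folder)
for i in range(chunk_count):
    offset = chunk_size * i
    nbytes = min(chunk_size, source_size - offset)
    with FileChunkIO(folder, 'r', offset=offset, bytes=nbytes) as fp:
        mp.upload_part_from_file(fp, part_num=i + 1)
mp.complete_upload()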
def __setitem__(self, key, value):
    new_value = value
    if key == 'Date':
        # Normalise anything stored under 'Date' through magicdate.
        new_value = magicdate(value)
    self._dict.__setitem__(key, new_value)
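# Usage sketch (the enclosing dict-like class is elided; 'Record' is a
# hypothetical name for it):
#
#   r = Record()
#   r['Date'] = 'last friday'   # stored as a date object via magicdate
#   r['Name'] = 'survey-42'     # other keys pass through unchanged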
def bag2kmlbbox(in_name, out_file, title=None, kml_complete=False,
                verbose=False, placemark=False):
    v = verbose
    f = h5py.File(in_name)  # e.g. 'H11302_OLS_OSS/H11302_2m_1.bag'
    # FIX: if out_file is a string, then open it here.
    o = out_file

    bag_root = f['/BAG_root']
    metadata_xml = ''.join(bag_root['metadata'])
    # Strip the smXML: namespace prefix so plain XPath expressions work.
    root = etree.XML(metadata_xml.replace('smXML:', ''))
    xmin = float(root.xpath('//*/westBoundLongitude')[0].text)
    xmax = float(root.xpath('//*/eastBoundLongitude')[0].text)
    ymin = float(root.xpath('//*/southBoundLatitude')[0].text)
    ymax = float(root.xpath('//*/northBoundLatitude')[0].text)

    # WARNING: This date does not relate to the dates the survey was collected!
    date = root.xpath('//*/CI_Date/date')[0].text
    abstract = root.xpath('//*/abstract')[0].text

    timestamp = ''  # No timestamp if we can't handle it
    try:
        import datetime, magicdate
        adate = magicdate.magicdate(date)
        timestamp = ('<TimeStamp>' + adate.strftime(iso8601_timeformat)
                     + '</TimeStamp>')
    except Exception:
        print 'WARNING: Unable to handle timestamp:', date

    if v:
        print xmin, xmax, '->', ymin, ymax
        print 'date:', date
        print 'abstract:', abstract

    # Crude de-tagging of the metadata so it can sit inside a CDATA block.
    metadata_html = etree.tostring(root, pretty_print=True).replace(
        '</', ' ').replace('<', ' ').replace('>', ' ')
    if v:
        print metadata_html

    if not title:
        title = '%s : %s' % (abstract, date)

    kml_data = {
        'title': title,
        'x': (xmin + xmax) / 2.,
        'y': (ymin + ymax) / 2.,
        'xmin': xmin, 'xmax': xmax,
        'ymin': ymin, 'ymax': ymax,
        'metadata': metadata_html,
        'timestamp': timestamp,
    }

    if kml_complete:
        o.write('''<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2"
     xmlns:gx="http://www.google.com/kml/ext/2.2"
     xmlns:kml="http://www.opengis.net/kml/2.2"
     xmlns:atom="http://www.w3.org/2005/Atom">
<Document>''')

    if placemark:
        # The original wrote this template without .format(), leaving the
        # {title} placeholders literally in the output; filled in here.
        o.write('''
<Placemark>
  <name>{title}</name>
  {timestamp}
  <description><![CDATA[ <pre> {metadata} </pre> ]]></description>
  <Point><coordinates>{x},{y},0</coordinates></Point>
</Placemark>
'''.format(**kml_data))

    o.write('''
<Placemark>
  <name>{title}</name>
  {timestamp}
  <LineString>
    <coordinates>
      {xmin},{ymin},0
      {xmax},{ymin},0
      {xmax},{ymax},0
      {xmin},{ymax},0
      {xmin},{ymin},0
    </coordinates>
  </LineString>
</Placemark>
'''.format(**kml_data))

    if kml_complete:
        o.write('''</Document>
</kml>
''')
    return
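# Usage sketch: wrap a single BAG file's bounding box in a complete KML
# document (file names are hypothetical):
#
#   with open('H11302_2m_1.kml', 'w') as out:
#       bag2kmlbbox('H11302_2m_1.bag', out, kml_complete=True,
#                   placemark=True)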
# (Fragment: the opening of this record-parsing loop is not part of this
# snippet.)
                # Invalid record, skip it.
                continue
            else:
                output_list.append(tmp_dict)
        except Exception, e:
            continue
    return output_list


def earnings_report_date_for_symbol(symbol):
    url_string = ('http://biz.yahoo.com/rr/?s=' + symbol
                  + '&d=research%2Fearncal')
    print url_string
    try:
        page = urllib2.urlopen(url_string)
        soup = BeautifulSoup(page, 'html.parser')
    except Exception, e:
        print e
        return None
    date_element = soup.find_all("font",
                                 attrs={'face': 'arial', 'size': '+1'})[0]
    date_text = date_element.find('b').string.replace('\n', ' ')
    sanitized_date_text = date_text.replace('US Earnings Calendar for ', '')
    output_date = str(magicdate(sanitized_date_text))
    return output_date
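# Usage sketch (requires network access to the Yahoo endpoint above;
# 'AAPL' is just an example symbol):
#
#   report_date = earnings_report_date_for_symbol('AAPL')
#   # -> an ISO date string such as '2014-01-27', or None on failure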
def add_bag_to_db(cx, infile_name, survey, filename_base,
                  verbose=False, write_xml=False):
    # filename_base - without .bag or path
    v = verbose
    f = h5py.File(infile_name)  # e.g. 'H11302_OLS_OSS/H11302_2m_1.bag'

    bag_root = f['/BAG_root']
    metadata_xml = ''.join(bag_root['metadata'])
    try:
        root = etree.XML(metadata_xml.replace('smXML:', ''))
    except Exception:
        print('bad_metadata:', infile_name)
        # What can we do?  Better if we could try to fix it somehow.
        return

    x_min_metadata = float(root.xpath('//*/westBoundLongitude')[0].text)
    x_max_metadata = float(root.xpath('//*/eastBoundLongitude')[0].text)
    y_min_metadata = float(root.xpath('//*/southBoundLatitude')[0].text)
    y_max_metadata = float(root.xpath('//*/northBoundLatitude')[0].text)

    software = root.xpath('//*/BAG_ProcessStep/description')[0].text
    utm_zone = int(root.xpath('//*/zone')[0].text)

    # The WGS84 geographic bounds are often fouled up, so recompute them
    # from the UTM corner coordinates instead.
    utm_coords = root.xpath(
        '//*/gml:coordinates',
        namespaces={'gml': "http://www.opengis.net/gml"})[0].text
    utm_coords = utm_coords.split()
    utm_x_min, utm_y_min = [float(c) for c in utm_coords[0].split(',')]
    utm_x_max, utm_y_max = [float(c) for c in utm_coords[1].split(',')]

    params = {'proj': 'utm', 'zone': utm_zone}
    proj = Proj(params)
    x_min, y_min = proj(utm_x_min, utm_y_min, inverse=True)
    x_max, y_max = proj(utm_x_max, utm_y_max, inverse=True)

    if abs(x_max - x_max_metadata) > 0.05 or abs(y_max - y_max_metadata) > 0.05:
        print('%s: %.4f %.4f %.4f %.4f' % (
            filename_base,
            x_min - x_min_metadata, y_min - y_min_metadata,
            x_max - x_max_metadata, y_max - y_max_metadata))

    vdatum = None
    datums = [entry.text.strip()
              for entry in root.xpath('//*/datum/RS_Identifier/code')]
    if len(datums) == 0:
        pass
    elif 'MLLW' in datums:
        vdatum = 'MLLW'
    else:
        vdatum = datums[-1]  # just guess that it is the last one
        print('datums:', datums, '->', vdatum, filename_base)

    axes = root.xpath('//*/axisDimensionProperties')
    dx = dy = None
    width = height = None
    for axis in axes:
        dim_name = axis.xpath('*/dimensionName')[0].text
        dim_size = int(axis.xpath('*/dimensionSize')[0].text)
        delta = float(axis.xpath('*/*/*/value')[0].text)
        if 'row' == dim_name:
            dy = delta
            height = dim_size
        elif 'column' == dim_name:
            dx = delta
            width = dim_size
        else:
            print('ERROR: unable to handle dim', dim_name)
            assert False

    # WARNING: This date does not relate to the dates the survey was collected!
    date = root.xpath('//*/CI_Date/date')[0].text
    abstract = root.xpath('//*/abstract')[0].text
    title = root.xpath('//*/title')[0].text

    timestamp = ''  # No timestamp if we can't handle it
    try:
        import datetime, magicdate
        creation = magicdate.magicdate(date)
    except Exception:
        print('WARNING: Unable to handle timestamp:', date)
        creation = None

    metadata_txt = etree.tostring(root, pretty_print=True).replace(
        '</', ' ').replace('<', ' ').replace('>', ' ')

    # FIX: base url must change based on the number of the survey.
    base_url = 'http://surveys.ngdc.noaa.gov/mgg/NOS/coast/H10001-H12000/'
    dr_url = base_url + survey + '/DR/' + survey + '.pdf'
    bag_url = base_url + survey + '/BAG/' + filename_base + '.bag.gz'

    # 'survey' appeared twice in the original tuple, which would make the
    # generated INSERT invalid; it is listed once here.
    sql_field_names = ('file', 'survey', 'title', 'abstract', 'creation',
                       'x_min', 'y_min', 'x_max', 'y_max',
                       'width', 'height', 'dx', 'dy',
                       'vdatum', 'utm_zone', 'dr_url', 'bag_url',
                       'metadata_txt', 'metadata_xml',
                       'utm_x_min', 'utm_y_min', 'utm_x_max', 'utm_y_max',
                       'software')
    file = filename_base

    sql_insert = ('INSERT INTO bag (' + ','.join(sql_field_names)
                  + ') VALUES ('
                  + ', '.join([':%s' % (field,) for field in sql_field_names])
                  + ');')
    cx.execute(sql_insert, locals())  # Passing locals() seems crazy
    cx.commit()
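# A plausible schema for the 'bag' table implied by sql_field_names above
# (hypothetical -- the original CREATE TABLE is not in this snippet; types
# are guessed from how each field is produced):
BAG_SCHEMA = '''
CREATE TABLE IF NOT EXISTS bag (
    file TEXT, survey TEXT, title TEXT, abstract TEXT,
    creation TIMESTAMP,
    x_min REAL, y_min REAL, x_max REAL, y_max REAL,
    width INTEGER, height INTEGER, dx REAL, dy REAL,
    vdatum TEXT, utm_zone INTEGER,
    dr_url TEXT, bag_url TEXT,
    metadata_txt TEXT, metadata_xml TEXT,
    utm_x_min REAL, utm_y_min REAL, utm_x_max REAL, utm_y_max REAL,
    software TEXT
);
'''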