def _parse_waf(context, repos, record, identifier):
    ''' Harvest the .xml documents linked from a Web Accessible Folder (WAF)

    The index page at ``record`` is fetched and parsed as HTML.  Every
    ``<a href>`` ending in ``.xml`` is turned into a fetchable URL, fetched,
    and handed to ``_parse_metadata``.

    Parameters:
        context: passed through unchanged to ``_parse_metadata``
        repos: passed through unchanged to ``_parse_metadata``
        record: URL of the WAF index page
        identifier: not used by this function

    Returns:
        list with one record object per harvested link (the first item
        returned by ``_parse_metadata``), each with ``source`` and
        ``mdsource`` set to the URL it was fetched from

    Raises:
        Exception: when the index page cannot be parsed as HTML
    '''

    content = util.http_request('GET', record)
    LOGGER.debug(content)

    try:
        tree = etree.fromstring(content, parser=etree.HTMLParser())
    except Exception as err:
        raise Exception('Could not parse WAF: %s' % str(err))

    up = urlparse(record)
    links = []

    LOGGER.debug('collecting links')
    for link in tree.xpath('//a/@href'):
        link = link.strip()
        if not link:
            continue
        if '?' in link:  # hrefs carrying a query string are not harvested
            continue
        if not link.endswith('.xml'):
            # also filters out directory hrefs (trailing '/')
            LOGGER.debug('Skipping, not .xml')
            continue

        if link.startswith('/'):
            # host-relative path: keep only scheme and host of the WAF URL
            link = '%s://%s%s' % (up.scheme, up.netloc, link)
        elif '/' not in link or not urlparse(link).scheme:
            # bare file name, or relative sub-path such as 'sub/a.xml':
            # tack on href to WAF URL
            link = '%s/%s' % (record, link)
        # otherwise the href carries its own scheme and is used as-is

        LOGGER.debug('URL is: %s', link)
        links.append(link)

    LOGGER.debug('%d links found', len(links))

    recobjs = []
    for link in links:
        LOGGER.debug('Processing link %s', link)
        # fetch and parse
        linkcontent = util.http_request('GET', link)
        recobj = _parse_metadata(context, repos, linkcontent)[0]
        recobj.source = link
        recobj.mdsource = link
        recobjs.append(recobj)

    return recobjs
def parse_record(context, record, repos=None,
                 mtype='http://www.opengis.net/cat/csw/2.0.2',
                 identifier=None, pagesize=10):
    ''' parse metadata

    Only the CSW case is handled in this definition: when ``mtype`` is the
    CSW 2.0.2 namespace and ``record`` is a string starting with ``http``,
    the URL is harvested as a CSW service; if that fails for a reason other
    than a CSW ExceptionReport, the URL is fetched and parsed as a single
    Dublin Core document instead.

    Parameters:
        context: passed through to the ``_parse_*`` helpers
        record: URL of the resource to harvest
        repos: passed through to the ``_parse_*`` helpers
        mtype: namespace identifying the kind of resource
        identifier: identifier handed to ``_parse_csw``; a random UUID URN
            is generated when omitted
        pagesize: passed through to ``_parse_csw``

    Returns:
        whatever ``_parse_csw`` returns, or a one-element list holding the
        ``_parse_dc`` result for the Dublin Core fallback

    Raises:
        RuntimeError: when the CSW error text contains 'ExceptionReport',
            or when the fallback HTTP request fails
    '''

    if identifier is None:
        # .urn is available on both Python 2 and 3; get_urn() is Python 2 only
        identifier = uuid.uuid4().urn

    # parse web services
    if (mtype == 'http://www.opengis.net/cat/csw/2.0.2' and
            isinstance(record, str) and record.startswith('http')):

        LOGGER.debug('CSW service detected, fetching via HTTP')
        # CSW service, not csw:Record
        try:
            return _parse_csw(context, repos, record, identifier, pagesize)
        except Exception as err:
            # TODO: implement better exception handling
            # str(err) rather than err.message: the latter is deprecated
            # and does not exist on Python 3
            if str(err).find('ExceptionReport') != -1:
                msg = 'CSW harvesting error: %s' % str(err)
                LOGGER.debug(msg)
                raise RuntimeError(msg)

            LOGGER.debug('Not a CSW, attempting to fetch Dublin Core')
            try:
                content = util.http_request('GET', record)
            except Exception as http_err:
                raise RuntimeError('HTTP error: %s' % str(http_err))
            return [_parse_dc(context, repos, etree.fromstring(content))]
def parse_record(context, record, repos=None,
                 mtype='http://www.opengis.net/cat/csw/2.0.2',
                 identifier=None, pagesize=10):
    ''' parse metadata

    Only the CSW case is handled in this definition: when ``mtype`` is the
    CSW 2.0.2 namespace and ``record`` is a string starting with ``http``,
    the URL is harvested as a CSW service; if that fails for a reason other
    than a CSW ExceptionReport, the URL is fetched and parsed as a single
    Dublin Core document instead.

    Parameters:
        context: passed through to the ``_parse_*`` helpers
        record: URL of the resource to harvest
        repos: passed through to the ``_parse_*`` helpers
        mtype: namespace identifying the kind of resource
        identifier: identifier handed to ``_parse_csw``; a random UUID URN
            is generated when omitted
        pagesize: passed through to ``_parse_csw``

    Returns:
        whatever ``_parse_csw`` returns, or a one-element list holding the
        ``_parse_dc`` result for the Dublin Core fallback

    Raises:
        RuntimeError: when the CSW error text contains 'ExceptionReport',
            or when the fallback HTTP request fails
    '''

    if identifier is None:
        # .urn is available on both Python 2 and 3; get_urn() is Python 2 only
        identifier = uuid.uuid4().urn

    # parse web services
    is_csw_url = (mtype == 'http://www.opengis.net/cat/csw/2.0.2' and
                  isinstance(record, str) and record.startswith('http'))

    if is_csw_url:
        LOGGER.debug('CSW service detected, fetching via HTTP')
        # CSW service, not csw:Record
        try:
            return _parse_csw(context, repos, record, identifier, pagesize)
        except Exception as err:
            # TODO: implement better exception handling
            if str(err).find('ExceptionReport') != -1:
                msg = 'CSW harvesting error: %s' % str(err)
                LOGGER.debug(msg)
                raise RuntimeError(msg)

            LOGGER.debug('Not a CSW, attempting to fetch Dublin Core')
            try:
                content = util.http_request('GET', record)
            except Exception as http_err:
                raise RuntimeError('HTTP error: %s' % str(http_err))
            return [_parse_dc(context, repos, etree.fromstring(content))]
def _parse_waf(context, repos, record, identifier): recobjs = [] content = util.http_request('GET', record) LOGGER.debug(content) try: parser = etree.HTMLParser() tree = etree.fromstring(content, parser=parser) except Exception, err: raise Exception('Could not parse WAF: %s' % str(err))
if os.path.splitext(sfile)[1] not in ['.xml', '.txt']: break if sfile == 'requests.txt': # GET requests filename = '%s%s%s' % (root, os.sep, sfile) gets = csv.reader(open(filename)) for row in gets: testfile = '%s%s%s' % (root, os.sep, sfile) request = ','.join(row[1:]).replace('PYCSW_SERVER', URL) outfile = '%s%s' % (root.replace(os.sep, '_'), '_%s.xml' % row[0]) expected = 'expected%s%s' % (os.sep, outfile) print '\n test %s:%s' % (testfile, row[0]) result = http_request('GET', request) status = get_validity(expected, result, outfile) if status == 1: print ' passed' PASSED += 1 elif status == 0: print ' initialized' INITED += 1 else: print ' FAILED' FAILED += 1 if LOGWRITER is not None: LOGWRITER.writerow([URL, cfg,
elif mtype == 'http://www.opengis.net/wms': # WMS return _parse_wms(context, repos, record, identifier) elif mtype == 'http://www.opengis.net/wps/1.0.0': # WPS return [_parse_wps(context, repos, record, identifier)] elif mtype == 'http://www.opengis.net/wfs': # WFS return _parse_wfs(context, repos, record, identifier) elif mtype == 'http://www.opengis.net/wcs': # WCS return _parse_wcs(context, repos, record, identifier) elif (mtype == 'http://www.opengis.net/cat/csw/csdgm' and record.startswith('http')): # FGDC record = util.http_request('GET', record) # parse metadata records if isinstance(record, str): exml = etree.fromstring(record) else: # already serialized to lxml if hasattr(record, 'getroot'): # standalone document exml = record.getroot() else: # part of a larger document exml = record root = exml.tag LOGGER.debug('Serialized metadata, parsing content model') if root == '{%s}MD_Metadata' % context.namespaces['gmd']: # ISO
elif mtype == 'http://www.opengis.net/wcs': # WCS LOGGER.debug('WCS detected, fetching via OWSLib') return _parse_wcs(context, repos, record, identifier) elif mtype == 'http://www.opengis.net/sos/1.0': # SOS 1.0.0 LOGGER.debug('SOS 1.0.0 detected, fetching via OWSLib') return _parse_sos(context, repos, record, identifier, '1.0.0') elif mtype == 'http://www.opengis.net/sos/2.0': # SOS 2.0.0 LOGGER.debug('SOS 2.0.0 detected, fetching via OWSLib') return _parse_sos(context, repos, record, identifier, '2.0.0') elif (mtype == 'http://www.opengis.net/cat/csw/csdgm' and record.startswith('http')): # FGDC LOGGER.debug('FGDC detected, fetching via HTTP') record = util.http_request('GET', record) return _parse_metadata(context, repos, record) def _set(context, obj, name, value): ''' convenience method to set values ''' setattr(obj, context.md_core_model['mappings'][name], value) def _parse_metadata(context, repos, record): """parse metadata formats""" if isinstance(record, str): exml = etree.fromstring(record) else: # already serialized to lxml if hasattr(record, 'getroot'): # standalone document exml = record.getroot()
if os.path.splitext(sfile)[1] not in ['.xml', '.txt']: break if sfile == 'requests.txt': # GET requests filename = '%s%s%s' % (root, os.sep, sfile) gets = csv.reader(open(filename)) for row in gets: testfile = '%s%s%s' % (root, os.sep, sfile) request = ','.join(row[1:]).replace( 'PYCSW_SERVER', URL) outfile = '%s%s' % (root.replace( os.sep, '_'), '_%s.xml' % row[0]) expected = 'expected%s%s' % (os.sep, outfile) print '\n test %s:%s' % (testfile, row[0]) result = http_request('GET', request) status = get_validity(expected, result, outfile, force_id_mask) if status == 1: print ' passed' PASSED += 1 elif status == 0: print ' initialized' INITED += 1 elif status == -1 and DATABASE == 'PostgreSQL': print ' warning: possible collation issue' WARNING += 1 else: print ' FAILED'
elif mtype == 'http://www.opengis.net/wcs': # WCS LOGGER.debug('WCS detected, fetching via OWSLib') return _parse_wcs(context, repos, record, identifier) elif mtype == 'http://www.opengis.net/sos/1.0': # SOS 1.0.0 LOGGER.debug('SOS 1.0.0 detected, fetching via OWSLib') return _parse_sos(context, repos, record, identifier, '1.0.0') elif mtype == 'http://www.opengis.net/sos/2.0': # SOS 2.0.0 LOGGER.debug('SOS 2.0.0 detected, fetching via OWSLib') return _parse_sos(context, repos, record, identifier, '2.0.0') elif (mtype == 'http://www.opengis.net/cat/csw/csdgm' and record.startswith('http')): # FGDC LOGGER.debug('FGDC detected, fetching via HTTP') record = util.http_request('GET', record) return _parse_metadata(context, repos, record) def _set(context, obj, name, value): ''' convenience method to set values ''' setattr(obj, context.md_core_model['mappings'][name], value) def _parse_metadata(context, repos, record): """parse metadata formats""" if isinstance(record, str): exml = etree.fromstring(record) else: # already serialized to lxml
def parse_record(context, record, repos=None,
                 mtype='http://www.opengis.net/cat/csw/2.0.2',
                 identifier=None, pagesize=10):
    ''' parse metadata

    Dispatches on ``mtype`` to the matching ``_parse_*`` helper:

    - CSW 2.0.2 namespace with an ``http`` URL string: harvested as a CSW
      service; if that fails for a reason other than a CSW ExceptionReport,
      the URL is fetched and parsed as a single Dublin Core document
    - ``urn:geoss:waf``: ``_parse_waf``
    - WMS / WFS / WCS namespaces: ``_parse_wms`` / ``_parse_wfs`` /
      ``_parse_wcs``
    - WPS 1.0.0 namespace: ``_parse_wps`` (result wrapped in a list)
    - SOS 1.0 / 2.0 namespaces: ``_parse_sos`` with version '1.0.0' / '2.0.0'
    - FGDC (csdgm) namespace with an ``http`` URL: the document is fetched
      first, then parsed by ``_parse_metadata``
    - anything else: ``record`` is handed to ``_parse_metadata`` as-is

    Parameters:
        context: passed through to the ``_parse_*`` helpers
        record: URL of a service/document, or metadata to parse directly
        repos: passed through to the ``_parse_*`` helpers
        mtype: namespace identifying the kind of resource
        identifier: identifier handed to the service parsers; a random UUID
            URN is generated when omitted
        pagesize: passed through to ``_parse_csw``

    Returns:
        the result of the selected helper (wrapped in a one-element list for
        the WPS and Dublin Core cases)

    Raises:
        RuntimeError: when the CSW error text contains 'ExceptionReport',
            or when the Dublin Core fallback HTTP request fails
    '''

    if identifier is None:
        # UUID.get_urn() only exists on Python 2; the .urn attribute works
        # on both Python 2 and Python 3
        identifier = uuid.uuid4().urn

    # parse web services
    if (mtype == 'http://www.opengis.net/cat/csw/2.0.2' and
            isinstance(record, str) and record.startswith('http')):

        LOGGER.debug('CSW service detected, fetching via HTTP')
        # CSW service, not csw:Record
        try:
            return _parse_csw(context, repos, record, identifier, pagesize)
        except Exception as err:
            # TODO: implement better exception handling
            if str(err).find('ExceptionReport') != -1:
                msg = 'CSW harvesting error: %s' % str(err)
                LOGGER.debug(msg)
                raise RuntimeError(msg)

            LOGGER.debug('Not a CSW, attempting to fetch Dublin Core')
            try:
                content = util.http_request('GET', record)
            except Exception as http_err:
                raise RuntimeError('HTTP error: %s' % str(http_err))
            return [_parse_dc(context, repos, etree.fromstring(content))]

    elif mtype == 'urn:geoss:waf':  # WAF
        LOGGER.debug('WAF detected, fetching via HTTP')
        return _parse_waf(context, repos, record, identifier)

    elif mtype == 'http://www.opengis.net/wms':  # WMS
        LOGGER.debug('WMS detected, fetching via OWSLib')
        return _parse_wms(context, repos, record, identifier)

    elif mtype == 'http://www.opengis.net/wps/1.0.0':  # WPS
        LOGGER.debug('WPS detected, fetching via OWSLib')
        return [_parse_wps(context, repos, record, identifier)]

    elif mtype == 'http://www.opengis.net/wfs':  # WFS
        LOGGER.debug('WFS detected, fetching via OWSLib')
        return _parse_wfs(context, repos, record, identifier)

    elif mtype == 'http://www.opengis.net/wcs':  # WCS
        LOGGER.debug('WCS detected, fetching via OWSLib')
        return _parse_wcs(context, repos, record, identifier)

    elif mtype == 'http://www.opengis.net/sos/1.0':  # SOS 1.0.0
        LOGGER.debug('SOS 1.0.0 detected, fetching via OWSLib')
        return _parse_sos(context, repos, record, identifier, '1.0.0')

    elif mtype == 'http://www.opengis.net/sos/2.0':  # SOS 2.0.0
        LOGGER.debug('SOS 2.0.0 detected, fetching via OWSLib')
        return _parse_sos(context, repos, record, identifier, '2.0.0')

    elif (mtype == 'http://www.opengis.net/cat/csw/csdgm' and
          record.startswith('http')):  # FGDC
        LOGGER.debug('FGDC detected, fetching via HTTP')
        # replace the URL with the fetched document, parsed below
        record = util.http_request('GET', record)

    return _parse_metadata(context, repos, record)