def test_xml_fromstring(self):
    """xml_fromstring=False leaves text values as plain strings."""
    # XML -> JSON: with coercion, '1'/'true' become int/bool; with str,
    # they stay strings.
    typed_data = self.check_data(xmljson.BadgerFish(xml_fromstring=True))
    string_data = self.check_data(xmljson.BadgerFish(xml_fromstring=str))
    typed_data('{"x": {"$": 1}}', '<x>1</x>')
    string_data('{"x": {"$": "1"}}', '<x>1</x>')
    typed_data('{"x": {"$": true}}', '<x>true</x>')
    string_data('{"x": {"$": "true"}}', '<x>true</x>')

    # JSON -> XML: xml_tostring controls how booleans are rendered.
    typed_tree = self.check_etree(xmljson.BadgerFish(xml_tostring=True))
    string_tree = self.check_etree(xmljson.BadgerFish(xml_tostring=str))
    typed_tree({"x": {"$": True}}, '<x>true</x>')
    string_tree({"x": {"$": True}}, '<x>True</x>')
    typed_tree({"x": {"$": False}}, '<x>false</x>')
    string_tree({"x": {"$": False}}, '<x>False</x>')
def _parse_xml(root):
    """Take the lxml.Element *root* and extract the TrafficHistory detail
    from the source XML.

    Returns: flattened JSON variant of the source XML
    """
    namespaces = domain_intel.common.NS
    result_xpath = '//a:TrafficHistoryResponse/b:Response/b:TrafficHistoryResult'
    results = root.xpath(result_xpath, namespaces=namespaces)

    # Log the Site element(s) so the flattening run is traceable.
    site_xpath = './b:Alexa/b:TrafficHistory/b:Site/text()'
    domains = [node.xpath(site_xpath, namespaces=namespaces)[0]
               for node in results]
    log.info('TrafficHistory flattening domain: %s',
             ', '.join('"{}"'.format(d) for d in domains))

    # Serialise the historical data with the BadgerFish convention,
    # preserving element order.
    history_xpath = './b:Alexa/b:TrafficHistory'
    traffic = results[0].xpath(history_xpath, namespaces=namespaces)
    converter = xmljson.BadgerFish(dict_type=collections.OrderedDict)
    as_json = json.dumps(converter.data(traffic[0]))

    # Strip the verbose XML namespace qualifier from the JSON keys.
    ns_token = r'{{{0}}}'.format(domain_intel.common.NS_20050711)
    return as_json.replace(ns_token, '')
def test_html(self):
    """BadgerFish conversion from data to HTML."""
    # Re-run the generic etree checks against an HTML element factory.
    converter = xmljson.BadgerFish(element=lxml.html.Element)
    self.test_etree(converter)
    check = self.check_etree(converter,
                             tostring=lxml.html.tostring,
                             fromstring=lxml.html.fromstring)
    content = Dict([
        ('p', {'$': 'paragraph'}),
        ('hr', {}),
        ('ul', {'li': [{'$': '1'}, {'$': '2'}]}),
    ])
    check({'div': content},
          '<div><p>paragraph</p><hr><ul><li>1</li><li>2</li></ul></div>')
def getBroaderTerm(self, collection, version, term):
    """Return the broader terms of *term* within *collection*/*version*.

    Queries the vocabulary service, reads the ``skos:broader``
    references from the returned concept, and resolves each reference
    into a full term via :meth:`getTermFromUrl`.

    Returns:
        list of resolved terms; empty when the request fails or the
        concept has no broader terms.
    """
    lstTerms = []
    url = '{}collection/{}/{}/{}/'.format(self.baseUrl, collection,
                                          version, term)
    response = requests.get(url, params={})
    if response.ok:
        payload = fromstring(response.content)
        bf = xmljson.BadgerFish()
        # Renamed from 'dict' to avoid shadowing the builtin.
        data = bf.data(payload)
        concept = data['{0}RDF'.format(rdf)]['{0}Concept'.format(skos)]
        if concept:
            broaderTerms = []
            # Renamed from 'term' to avoid clobbering the parameter.
            broader = concept.get('{0}broader'.format(skos))
            if broader:
                if len(broader) == 1:
                    # Single reference: BadgerFish gives a plain mapping.
                    broaderTerms = [broader['@{}resource'.format(rdf)]]
                else:
                    refs = [i['@{}resource'.format(rdf)] for i in broader]
                    broaderTerms = sorted(refs)
            lstTerms = [self.getTermFromUrl(t) for t in broaderTerms]
    return lstTerms
def test_cli(self):
    """Round-trip every sample file through main() for each dialect."""
    converters = [
        xmljson.Abdera(), xmljson.BadgerFish(), xmljson.Cobra(),
        xmljson.GData(), xmljson.Parker(), xmljson.Yahoo(),
    ]
    samples = ['abdera-1.xml', 'abdera-2.xml', 'abdera-3.xml',
               'abdera-4.xml']
    for dialect in converters:
        for name in samples:
            sample_path = os.path.join(_folder, name)
            # Convert the sample through the CLI entry point.
            source = io.open(sample_path, encoding='utf-8')
            target = openwrite(self.tmp)
            main(source, target, dialect)
            # Re-open both ends and compare against a direct conversion.
            source = io.open(sample_path, encoding='utf-8')
            target = io.open(self.tmp, encoding='utf-8')
            with closing(source), closing(target):
                self.assertEqual(json.load(target),
                                 dialect.data(parse(source).getroot()))
def flatten_batched_xml(xml):
    """Batched Alexa responses need to be parsed and extracted into
    individual domain components ready for next data flow path.

    Also want to strip off redundant Alexa control XML elements that
    have no value in our problem domain.

    Args:
        *xml*: the source XML to process

    Returns:
        list of domain-based XML
    """
    doc = lxml.etree.fromstring(xml)
    namespaces = domain_intel.common.NS
    result_xpath = '//a:UrlInfoResponse/b:Response/b:UrlInfoResult'
    domain_nodes = doc.xpath(result_xpath, namespaces=namespaces)

    # Report the DataUrl element(s) for traceability.
    url_xpath = './b:Alexa/b:ContentData/b:DataUrl/text()'
    data_urls = [node.xpath(url_xpath, namespaces=namespaces)[0]
                 for node in domain_nodes]
    log.info('Batched URLs sourced: %s',
             ', '.join('"{}"'.format(u) for u in data_urls))

    # Flatten each domain node to JSON and drop the namespace qualifier.
    converter = xmljson.BadgerFish(dict_type=collections.OrderedDict)
    ns_token = r'{{{0}}}'.format(domain_intel.common.NS_20050711)
    flattened = [json.dumps(converter.data(node)) for node in domain_nodes]
    return [item.replace(ns_token, '') for item in flattened]
def getCapabilities_KVP(self, parameters=None):
    """Fetch SOS GetCapabilities via KVP and list sensors and offerings.

    Args:
        parameters: optional extra query parameters.  The mandatory
            ``service``/``request`` pairs are added to a private copy.

    Returns:
        dict with ``sensors`` and ``offerings`` lists, or ``False`` when
        the HTTP request fails.
    """
    # BUG FIX: the original signature used a mutable default ({}) and
    # mutated it in place, leaking 'service'/'request' keys between
    # calls and into the caller's dict.  Work on a copy instead.
    params = dict(parameters) if parameters else {}
    params['service'] = 'SOS'
    params['request'] = 'GetCapabilities'
    sensors = []
    offerings = []
    url = '{}sos'.format(self.baseUrl)
    start = datetime.now()
    response = requests.get(url, params=params)
    end = datetime.now()
    if response.ok:
        if self.showInfo:
            print('SOS Capabilities')
        payload = fromstring(response.content)
        print(payload)
        # BadgerFish: Use "$" for text content, @ to prefix attributes
        # GData: Use "$t" for text content, attributes added as-is
        # Yahoo Use "content" for text content, attributes added as-is
        # Parker: Use tail nodes for text content, ignore attributes
        bf = xmljson.BadgerFish()
        data = bf.data(payload)
        operation = data['{0}Capabilities'.format(sos)][
            '{0}OperationsMetadata'.format(ows)]['{0}Operation'.format(
                ows)]
        # Sensors come from the DescribeSensor 'procedure' parameter.
        ds = [o for o in operation if o['@name'] == 'DescribeSensor'][0]
        prm = ds['{0}Parameter'.format(ows)]
        p = [p for p in prm if p['@name'] == 'procedure'][0]
        av = p['{0}AllowedValues'.format(ows)]
        values = av['{0}Value'.format(ows)]
        for val in [v['$'] for v in values]:
            # Sensor id is the last ':'-separated token of the URN.
            value = val.split(':')[-1]
            sensors.append(value)
            if self.showInfo:
                print(value)
        # Offerings come from the GetObservation 'offering' parameter.
        go = [o for o in operation if o['@name'] == 'GetObservation'][0]
        prm = go['{0}Parameter'.format(ows)]
        p = [p for p in prm if p['@name'] == 'offering'][0]
        av = p['{0}AllowedValues'.format(ows)]
        values = av['{0}Value'.format(ows)]
        for val in [v['$'] for v in values]:
            # Offering id is whatever follows the 'Offering' token.
            offerings.append(val.split('Offering')[-1])
    else:
        util.printErrorMessage(response, params)
        return False
    if self.showInfo:
        util.printResponseTime(end, start)
    return {"sensors": sensors, "offerings": offerings}
def test_custom_root(self):
    """etree() should graft converted nodes onto a caller-supplied root."""
    for etree in (xml.etree.cElementTree, lxml.etree, lxml.html):
        conv = xmljson.BadgerFish(element=etree.Element)
        root = etree.fromstring('<html/>')
        tree = conv.etree({'p': {'$': 1}}, root)
        self.assertEqual(decode(etree.tostring(tree)),
                         '<html><p>1</p></html>')
def getCollections(self):
    """Return every collection published by the vocabulary service.

    Each collection is represented as a dict holding its URL, any of the
    known descriptive tags (labels, title, creator, publisher, ...) and
    any broader/narrower/sameAs/related term references.

    Returns:
        list of collection dicts; empty when the HTTP request fails.
    """
    coll = []
    url = '{}collection'.format(self.baseUrl)
    response = requests.get(url, params={})
    if response.ok:
        payload = fromstring(response.content)
        bf = xmljson.BadgerFish()
        # Renamed from 'dict' to avoid shadowing the builtin.
        data = bf.data(payload)
        collections_node = data['{0}RDF'.format(rdf)][
            '{}Collection'.format(skos)]
        for collection in collections_node:
            dicCol = {'url': collection['@{}about'.format(rdf)]}
            # Simple text-valued descriptive tags.
            tags = [[skos, 'prefLabel'], [dc, 'title'], [skos, 'altLabel'],
                    [dc, 'alternative'], [dc, 'description'],
                    [dc, 'creator'], [grg, 'RE_RegisterOwner'],
                    [rdfs, 'comment'], [dc, 'publisher'],
                    [owl, 'versionInfo'], [dc, 'date']]
            for prefix, tag in tags:
                # Single lookup instead of 'in' + subscript.
                val = collection.get('{}{}'.format(prefix, tag))
                if val is not None:
                    dicCol[tag] = val['$']
            # Term references: a single mapping or a list of mappings.
            for t in ['broader', 'narrower', 'sameAs', 'related']:
                term = collection.get('{0}{1}'.format(skos, t))
                if term:
                    if len(term) == 1:
                        dicCol['{}Term'.format(t)] = term[
                            '@{}resource'.format(rdf)]
                    else:
                        refs = [i['@{}resource'.format(rdf)] for i in term]
                        dicCol['{}Term'.format(t)] = sorted(refs)
            coll.append(dicCol)
    else:
        util.printErrorMessage(response, {})
    return coll
def _xml_to_json(self):
    """Convert raw Alexa SitesLinkingIn action result to JSON.
    """
    root = lxml.etree.fromstring(self.xml)
    xpath = ('//a:SitesLinkingInResponse/'
             'b:Response/'
             'b:SitesLinkingInResult')
    matches = root.xpath(xpath, namespaces=NS)
    # Guard clause: nothing to flatten if the result node is absent.
    if not matches:
        log.error('Unable to parse SitesLinkingIn XML: %s', self.xml)
        return
    converter = xmljson.BadgerFish(dict_type=collections.OrderedDict)
    raw_json = json.dumps(converter.data(matches[0]))
    # Drop the namespace qualifier before caching the result.
    ns_token = r'{{{0}}}'.format(NS_20050711)
    self.__as_json = raw_json.replace(ns_token, '')
def getTermFromUrl(self, url):
    """Fetch a single vocabulary term from its *url*.

    Returns:
        dict describing the term (url, label, definition, identifier,
        version and any related-term references), or ``None`` when the
        request fails.
    """
    dicTerm = None
    response = requests.get(url, params={})
    if response.ok:
        payload = fromstring(response.content)
        bf = xmljson.BadgerFish()
        # Renamed from 'dict' to avoid shadowing the builtin.
        data = bf.data(payload)
        concept = data['{0}RDF'.format(rdf)]['{0}Concept'.format(skos)]
        if concept:
            dicTerm = {'url': concept['@{}about'.format(rdf)]}
            tags = [[skos, 'prefLabel'], [skos, 'definition'],
                    [dc, 'identifier'], [owl, 'versionInfo']]
            for prefix, tag in tags:
                key = '{}{}'.format(prefix, tag)
                try:
                    val = concept.get(key)
                    if val is not None:
                        dicTerm[tag] = val['$']
                except Exception:
                    # BUG FIX: the original reported errors with
                    # "'...{}'.key", which itself raised AttributeError.
                    # Also narrowed the bare 'except:' clause.
                    print('error retrieving key: {}'.format(key))
            for t in ['broader', 'narrower', 'sameAs', 'related']:
                term = concept.get('{0}{1}'.format(skos, t))
                if term:
                    if len(term) == 1:
                        dicTerm['{}Term'.format(t)] = term[
                            '@{}resource'.format(rdf)]
                    else:
                        refs = [i['@{}resource'.format(rdf)] for i in term]
                        dicTerm['{}Term'.format(t)] = sorted(refs)
    else:
        util.printErrorMessage(response, {})
    return dicTerm
def processWFSResponse(response):
    """Used to convert XML response to relevant lists of data.

    Expects XML response in MultiPointCoverage format. Yes, it's ugly.

    Returns:
        tuple ``(nbvars, varnames, fmisid, station, latlon, times,
        valuearray)``; a 7-tuple of ``None`` when the response reports
        zero matches.
    """
    # For reasons mysterious, sometimes the response is not
    # a cStringIO object but a ResponseWrapper object.
    try:
        xmlstring = response.getvalue()
    except AttributeError:
        xmlstring = response.read()
    try:
        xmlstring = xmlstring.encode()  # Make sure response is in bytes
    except AttributeError:
        pass
    # A hack to remove namespaces and gml attributes from XML response
    # to make further processing easier. Yeah, not proud about this. But it works.
    # - recipe 1: Strip namespace:xxx attributes from all tag beginnings
    # - recipe 2: Strip namespace:xxx attributes from all tag end
    # - recipe 3: Strip gml: attributes from all tag ends
    recipes = [('<[a-zA-Z0-9]*?:', '<'), ('</.*?:', '</'), ('gml:', '')]
    for recipe in recipes:
        try:
            xmlstring = re.sub(recipe[0], recipe[1], xmlstring)
        except TypeError:
            # Depending on implementation, xmlstring is returned as string or bytes
            xmlstring = re.sub(recipe[0].encode(), recipe[1].encode(),
                               xmlstring)
    #
    # Convert data from xml string to json to dictionary using the BadgerFish notation
    bf = xmljson.BadgerFish()
    data = bf.data(xml.etree.ElementTree.fromstring(xmlstring))
    #
    # Extract relevant data from data dictionary
    if int(data['FeatureCollection']['@numberReturned']) > 0:
        # Station id (FMI site identifier).
        fmisid = data['FeatureCollection']['member']['GridSeriesObservation'] \
            ['featureOfInterest']['SF_SpatialSamplingFeature'] \
            ['sampledFeature']['LocationCollection']['member'] \
            ['Location']['identifier']['$']
        # Human-readable station name (first 'name' entry).
        station = data['FeatureCollection']['member']['GridSeriesObservation'] \
            ['featureOfInterest']['SF_SpatialSamplingFeature'] \
            ['sampledFeature']['LocationCollection']['member'] \
            ['Location']['name'][0]['$']
        # Flat "lat lon time" triplets for every sample position.
        latlontime = data['FeatureCollection']['member']['GridSeriesObservation'] \
            ['result']['MultiPointCoverage']['domainSet'] \
            ['SimpleMultiPoint']['positions']['$']
        # Flat whitespace-separated measurement values.
        values = data['FeatureCollection']['member']['GridSeriesObservation'] \
            ['result']['MultiPointCoverage']['rangeSet']['DataBlock'] \
            ['doubleOrNilReasonTupleList']['$']
        #
        # Get names of variables
        varlist = data['FeatureCollection']['member']['GridSeriesObservation'] \
            ['result']['MultiPointCoverage']['rangeType']['DataRecord'] \
            ['field']
        #
        # A single field arrives as a dict, multiple fields as a list;
        # indexing a dict with 0 raises KeyError, which is used here to
        # detect the single-field case.
        try:
            temp = len(varlist[0])  # Try to recognise if returned data is LIST
        except KeyError:
            varlist = [
                varlist
            ]  # Convert string to list with only element being the string
        nbvars = len(varlist)
        varnames = []
        for var in varlist:
            varnames.append(var['@name'])
        #
        # Convert data string to numpy array and reshape
        valuearray = numpy.fromstring(values, sep=' ')  # parse data to 1-D array
        valuearray = valuearray.reshape(
            (len(valuearray) // nbvars, nbvars))  # Reshape 1-D array to 2-D array
        coordarray = numpy.fromstring(latlontime, sep=' ')  # parse data to 1-D array
        coordarray = coordarray.reshape(
            (len(coordarray) // 3, 3))  # Reshape 1-D array to 2-D array
        # Third column of each position triplet is the timestamp.
        times = coordarray[:, 2]
    else:
        print("Zero matches returned")
        return None, None, None, None, None, None, None
    #
    # NOTE(review): numpy.fromstring(sep=' ') is deprecated; consider
    # numpy.array(values.split(), dtype=float) — confirm nil-reason
    # tokens before changing.
    return nbvars, varnames, fmisid, station, coordarray[
        0, 0:2], times, valuearray
def getCapabilities_SOAP(self):
    """Request SOS GetCapabilities over SOAP and return the sensor list.

    Returns:
        list of sensor (procedure) id strings, or ``False`` when the
        HTTP request fails.
    """
    url = '{}sos?WSDL'.format(self.baseUrl)
    headers = {'content-type': 'application/soap+xml'}
    #headers = {'content-type': 'text/xml'}
    body = """<?xml version="1.0" encoding="UTF-8"?>
<GetCapabilities xmlns="http://www.opengis.net/sos/1.0"
 xmlns:ows="http://www.opengis.net/ows/1.1"
 xmlns:ogc="http://www.opengis.net/ogc"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.opengis.net/sos/1.0
 http://schemas.opengis.net/sos/1.0.0/sosGetCapabilities.xsd"
 service="SOS">
 <ows:AcceptVersions>
  <ows:Version>1.0.0</ows:Version>
 </ows:AcceptVersions>
 <ows:Sections>
  <ows:Section>ServiceIdentification</ows:Section>
  <ows:Section>ServiceProvider</ows:Section>
  <ows:Section>OperationsMetadata</ows:Section>
  <ows:Section>Filter_Capabilities</ows:Section>
  <ows:Section>Contents</ows:Section>
 </ows:Sections>
</GetCapabilities>"""
    sensors = []
    start = datetime.now()
    response = requests.post(url, data=body, headers=headers)
    end = datetime.now()
    if (response.ok):
        print('SOS Capabilities')
        payload = fromstring(response.content)
        # BadgerFish convention: "$" holds text content, "@" prefixes
        # attributes.
        bf = xmljson.BadgerFish()
        dict = bf.data(payload)
        operation = dict['{0}Capabilities'.format(sos)][
            '{0}OperationsMetadata'.format(ows)]['{0}Operation'.format(
                ows)]
        # Locate the advertised operations.  Only DescribeSensor is used
        # below; gc/go are looked up but not otherwise consumed.
        gc = [o for o in operation if o['@name'] == 'GetCapabilities'][0]
        go = [o for o in operation if o['@name'] == 'GetObservation'][0]
        ds = [o for o in operation if o['@name'] == 'DescribeSensor'][0]
        prm = ds['{0}Parameter'.format(ows)]
        p = [p for p in prm if p['@name'] == 'procedure'][0]
        av = p['{0}AllowedValues'.format(ows)]
        values = av['{0}Value'.format(ows)]
        for val in [v['$'] for v in values]:
            # Sensor id is the last ':'-separated token of the URN.
            valDef = val.split(':')
            value = valDef[len(valDef) - 1]
            sensors.append(value)
            if self.showInfo:
                print(value)
    else:
        util.printErrorMessage(response, {})
        return False
    util.printResponseTime(end, start)
    return sensors
def getObservation_SOAP(self,
                        offering,
                        begin,
                        end,
                        observedProperty,
                        propertyDictionary={}):
    """Request observations for *offering* over SOAP and parse them.

    Args:
        offering: offering id (appended to the 'Offering' literal).
        begin, end: ISO time strings bounding the sampling period.
        observedProperty: iterable of phenomenon names to request.
        propertyDictionary: optional field-name remapping; values for
            remapped fields are converted to float.  (Read-only here,
            so the mutable default is harmless.)

    Returns:
        dict with id, samplingTime, CompositePhenomenon,
        featureOfInterest and observations; ``False`` when the HTTP
        request fails.
    """
    result = {}
    url = '{}sos?WSDL'.format(self.baseUrl)
    headers = {'content-type': 'application/soap+xml'}
    #headers = {'content-type': 'text/xml'}
    # One <observedProperty> element per requested phenomenon.
    observedProperties = '\n'.join([
        ' <observedProperty>urn:ogc:def:phenomenon:{}</observedProperty>'
        .format(p) for p in observedProperty
    ])
    body = """<?xml version="1.0" encoding="UTF-8"?>
<GetObservation xmlns="http://www.opengis.net/sos/1.0"
 xmlns:ows="http://www.opengis.net/ows/1.1"
 xmlns:gml="http://www.opengis.net/gml"
 xmlns:ogc="http://www.opengis.net/ogc"
 xmlns:om="http://www.opengis.net/om/1.0"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.opengis.net/sos/1.0
 http://schemas.opengis.net/sos/1.0.0/sosGetObservation.xsd"
 service="SOS" version="1.0.0" srsName="urn:ogc:def:crs:EPSG::4326">
 <offering>Offering{0}</offering>
 <eventTime>
  <ogc:TM_During>
   <ogc:PropertyName>om:samplingTime</ogc:PropertyName>
   <gml:TimePeriod>
    <gml:beginPosition>{1}</gml:beginPosition>
    <gml:endPosition>{2}</gml:endPosition>
   </gml:TimePeriod>
  </ogc:TM_During>
 </eventTime>
{3}
 <responseFormat>text/xml;subtype="om/1.0.0"</responseFormat>
</GetObservation>""".format(offering, begin, end, observedProperties)
    print(body)
    start = datetime.now()
    # NOTE: 'end' (parameter) is reused as the response timestamp after
    # the body has been formatted, so the shadowing is benign.
    response = requests.post(url, data=body, headers=headers)
    end = datetime.now()
    if (response.ok):
        if self.showInfo:
            print('SOS Get Observation')
        payload = fromstring(response.content)
        print(payload)
        # BadgerFish: Use "$" for text content, @ to prefix attributes
        # GData: Use "$t" for text content, attributes added as-is
        # Yahoo Use "content" for text content, attributes added as-is
        # Parker: Use tail nodes for text content, ignore attributes
        bf = xmljson.BadgerFish()
        dict = bf.data(payload)
        fields = []
        observations = []
        # A service-side failure arrives as an ows:ExceptionReport.
        if '{0}ExceptionReport'.format(ows) in dict.keys():
            exceptionReport = dict['{0}ExceptionReport'.format(ows)]
            exception = exceptionReport['{0}Exception'.format(ows)]
            print(
                exception['{http://www.opengis.net/ows/1.1}ExceptionText']
                ['$'])
            return result
        oc = dict['{0}ObservationCollection'.format(om)]
        id = oc['@{0}id'.format(gml)]
        result['id'] = id
        member = oc['{}member'.format(om)]
        # An 'inapplicable' xlink href means no data for the window.
        try:
            if member[
                    '@{http://www.w3.org/1999/xlink}href'] == 'urn:ogc:def:nil:OGC:inapplicable':
                print('There is no data for device and time range')
                return result
        except:
            pass
        observation = member['{0}Observation'.format(om)]
        idGo = observation['@{0}id'.format(gml)]
        #samplingTime = observation['{0}samplingTime'.format(om)]
        timePeriod = observation['{0}samplingTime'.format(om)][
            '{0}TimePeriod'.format(gml)]
        timePeriodType = timePeriod[
            '@{http://www.w3.org/2001/XMLSchema-instance}type']
        beginPosition = timePeriod['{0}beginPosition'.format(gml)]['$']
        endPosition = timePeriod['{0}endPosition'.format(gml)]['$']
        result['samplingTime'] = {
            "begin": beginPosition,
            "end": endPosition
        }
        procedure = observation['{0}procedure'.format(om)][
            '@{0}href'.format(xlink)]
        #observedProperty = observation['{0}observedProperty'.format(om)]['@{0}href'.format(xlink)]
        cp = observation['{0}observedProperty'.format(om)][
            '{0}CompositePhenomenon'.format(swe)]
        cpName = cp['{0}name'.format(gml)]['$']
        cpLink = []
        for c in cp['{0}component'.format(swe)]:
            cpLink.append(c['@{0}href'.format(xlink)])
        result['CompositePhenomenon'] = {
            "name": cpName,
            "components": cpLink
        }
        foiTitle = observation['{0}featureOfInterest'.format(om)][
            '@{0}title'.format(xlink)]
        foiLink = observation['{0}featureOfInterest'.format(om)][
            '@{0}href'.format(xlink)]
        result['featureOfInterest'] = foiTitle
        dataArray = observation['{0}result'.format(om)][
            '{0}DataArray'.format(swe)]
        elementCount = dataArray['{0}elementCount'.format(swe)][
            '{0}Count'.format(swe)]['{0}value'.format(swe)]['$']
        elementType = dataArray['{0}elementType'.format(swe)]
        # Field names define the column order of the value block.
        for f in elementType['{0}DataRecord'.format(swe)][
                '{0}field'.format(swe)]:
            name = f['@name']
            fields.append(name)
        if self.showInfo:
            print(fields)
        # Separators for decoding the swe:TextBlock payload.
        encoding = dataArray['{0}encoding'.format(swe)][
            '{0}TextBlock'.format(swe)]
        decimalSeparator = encoding['@decimalSeparator']
        tokenSeparator = encoding['@tokenSeparator']
        blockSeparator = encoding['@blockSeparator']
        values = dataArray['{0}values'.format(swe)]['$'].split(
            blockSeparator)
        for v in values:
            observation = {}  #OrderedDict()
            o = v.split(tokenSeparator)
            # Skip malformed rows whose token count mismatches fields.
            if len(o) == len(fields):
                for i in range(0, len(fields)):
                    field = str(fields[i])
                    if (field in propertyDictionary.keys()):
                        # Remapped fields are treated as numeric.
                        field = propertyDictionary[field]
                        observation[field] = float(o[i])
                    else:
                        observation[field] = o[i]
            if self.showInfo:
                print(v)
            if self.showInfo:
                print(observation)
            observations.append(observation)
        result['observations'] = observations
    else:
        util.printErrorMessage(response, {})
        return False
    if self.showInfo:
        util.printResponseTime(end, start)
    return result
def describSensor_SOAP(self, sensor):
    """Request a SensorML description of *sensor* over SOAP.

    Args:
        sensor: sensor id (appended to the procedure URN).

    Returns:
        dict of device identifiers plus a ``sensors`` mapping of
        sensorId -> input name; ``False`` when the HTTP request fails.
    """
    deviceInfo = {}
    url = '{}sos?WSDL'.format(self.baseUrl)
    headers = {'content-type': 'application/soap+xml'}
    #headers = {'content-type': 'text/xml'}
    body = """<?xml version="1.0" encoding="UTF-8"?>
<DescribeSensor version="1.0.0" service="SOS"
 xmlns="http://www.opengis.net/sos/1.0"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.opengis.net/sos/1.0
 http://schemas.opengis.net/sos/1.0.0/sosDescribeSensor.xsd"
 outputFormat="text/xml;subtype="sensorML/1.0.1"">
 <procedure>urn:ogc:def:sensor:{0}</procedure>
</DescribeSensor>""".format(sensor)
    start = datetime.now()
    response = requests.post(url, data=body, headers=headers)
    end = datetime.now()
    if (response.ok):
        if self.showInfo:
            print('SOS Describe Sensor')
        payload = fromstring(response.content)
        # BadgerFish: Use "$" for text content, @ to prefix attributes
        # GData: Use "$t" for text content, attributes added as-is
        # Yahoo Use "content" for text content, attributes added as-is
        # Parker: Use tail nodes for text content, ignore attributes
        bf = xmljson.BadgerFish()
        dict = bf.data(payload)
        member = dict['{0}SensorML'.format(sensorML)]['{0}member'.format(
            sensorML)]['{0}System'.format(sensorML)]
        # Looked up but not consumed below.
        keywords = member['{0}keywords'.format(sensorML)][
            '{0}KeywordList'.format(sensorML)]
        #Get the Short Name, Long Name and Unique Identifier
        identifiers = member['{0}identification'.format(sensorML)][
            '{0}IdentifierList'.format(sensorML)]
        for i in identifiers['{0}identifier'.format(sensorML)]:
            term = i['{0}Term'.format(sensorML)]['@definition']
            value = i['{0}Term'.format(sensorML)]['{0}value'.format(
                sensorML)]['$']
            # Both the definition URN and the value URN are reduced to
            # their last ':'-separated token.
            idDef = term.split(':')
            id = idDef[len(idDef) - 1]
            valDef = value.split(':')
            val = valDef[len(valDef) - 1]
            deviceInfo[id] = val
        #Get the Sensor Name and ID
        sensors = {}
        inputs = member['{0}inputs'.format(sensorML)][
            '{0}InputList'.format(sensorML)]
        if ('{0}input'.format(sensorML) in inputs):
            for i in inputs['{0}input'.format(sensorML)]:
                name = i['@name']
                sensorDef = i['{0}ObservableProperty'.format(
                    swe)]['@definition'].split(':')
                sensorId = sensorDef[len(sensorDef) - 1]
                if self.showInfo:
                    print(' {} ({})'.format(name, sensorId))
                sensors[sensorId] = name
        deviceInfo['sensors'] = sensors
    else:
        util.printErrorMessage(response, {})
        return False
    if self.showInfo:
        util.printResponseTime(end, start)
    return deviceInfo