def _parse_site_info(site_info, namespace):
    """Return a dict representation of a site.

    Args:
        site_info: etree element representing a ``siteInfo`` element.
        namespace: string prepended to every tag name that is looked up.

    Returns:
        dict that always has ``code``, ``name`` and ``network`` keys.
        ``agency``, ``location``, ``timezone_info``, ``elevation_m``,
        ``notes`` and ``site_property`` are added only when the matching
        attribute or child element is found.

    The ``siteCode`` and ``siteName`` children are read without a None
    check, so both are expected to be present.
    """
    site_code = site_info.find(namespace + "siteCode")

    return_dict = {
        'code': site_code.text,
        'name': site_info.find(namespace + "siteName").text,
        'network': site_code.attrib.get('network'),
    }

    agency = site_code.attrib.get('agencyCode')
    if agency:
        return_dict['agency'] = agency

    # Both path steps carry the namespace:
    # "<namespace>geoLocation/<namespace>geogLocation".
    geog_location = site_info.find(
        namespace + "geoLocation/" + namespace + "geogLocation")
    if geog_location is not None:
        return_dict['location'] = _parse_geog_location(
            geog_location, namespace)

    timezone_info = site_info.find(namespace + "timeZoneInfo")
    if timezone_info is not None:
        return_dict['timezone_info'] = _parse_timezone_info(
            timezone_info, namespace)

    elevation_m = site_info.find(namespace + 'elevation_m')
    if elevation_m is not None:
        return_dict['elevation_m'] = elevation_m.text

    # WaterML 1.0 notes: keyed by the note title with spaces removed.
    notes = {
        util.camel_to_underscore(note.attrib['title'].replace(' ', '')):
            note.text
        for note in site_info.findall(namespace + 'note')
    }
    if notes:
        return_dict['notes'] = notes

    # WaterML 1.1 siteProperties: keyed by the property name with spaces
    # removed.
    site_properties = {
        util.camel_to_underscore(
            site_property.attrib['name'].replace(' ', '')):
            site_property.text
        for site_property in site_info.findall(namespace + 'siteProperty')
    }
    if site_properties:
        return_dict['site_property'] = site_properties

    return return_dict
def _parse_site_info(site_info, namespace):
    """Build a plain dict describing a site from a ``siteInfo`` element.

    ``namespace`` is prepended to each tag name used for lookups. The
    result always holds ``code``, ``name`` and ``network``; the remaining
    keys (``agency``, ``location``, ``timezone_info``, ``elevation_m``,
    ``notes``, ``site_property``) appear only when the source data has
    them.
    """
    code_element = site_info.find(namespace + "siteCode")
    code_attributes = code_element.attrib

    site = {}
    site['code'] = code_element.text
    site['name'] = site_info.find(namespace + "siteName").text
    site['network'] = code_attributes.get('network')

    agency_code = code_attributes.get('agencyCode')
    if agency_code:
        site['agency'] = agency_code

    # Joining on the namespace puts it in front of both path steps.
    location_path = namespace.join(["", "geoLocation/", "geogLocation"])
    location_element = site_info.find(location_path)
    if location_element is not None:
        site['location'] = _parse_geog_location(location_element, namespace)

    tz_element = site_info.find(namespace + "timeZoneInfo")
    if tz_element is not None:
        site['timezone_info'] = _parse_timezone_info(tz_element, namespace)

    elevation_element = site_info.find(namespace + 'elevation_m')
    if elevation_element is not None:
        site['elevation_m'] = elevation_element.text

    # WaterML 1.0 notes
    notes = {}
    for note in site_info.findall(namespace + 'note'):
        note_key = util.camel_to_underscore(
            note.attrib['title'].replace(' ', ''))
        notes[note_key] = note.text
    if notes:
        site['notes'] = notes

    # WaterML 1.1 siteProperties
    properties = {}
    for site_property in site_info.findall(namespace + 'siteProperty'):
        property_key = util.camel_to_underscore(
            site_property.attrib['name'].replace(' ', ''))
        properties[property_key] = site_property.text
    if properties:
        site['site_property'] = properties

    return site
def _element_dict_attribute_name(attribute_name, element_name,
                                 prepend_element_name=True):
    """Return the dict key to use for an element attribute.

    Everything up to and including the last ``}`` in ``attribute_name`` is
    dropped and the remainder is passed through
    ``util.camel_to_underscore``. The result is prefixed with
    ``element_name + '_'`` only when ``prepend_element_name`` is true and
    the converted name does not already start with ``element_name``.
    """
    local_name = attribute_name.split('}')[-1]
    underscored = util.camel_to_underscore(local_name)

    needs_prefix = (
        not underscored.startswith(element_name) and prepend_element_name
    )
    if needs_prefix:
        return element_name + '_' + underscored
    return underscored
def _element_dict(element, exclude_children=None, prepend_attributes=True):
    """Generic element-to-dict converter with underscored key names.

    Intended for cases where no special parsing is needed; callers will
    usually ``update`` another dict with the result.

    * ``element`` -- etree element; ``None`` yields an empty dict.
    * ``exclude_children`` -- child tag names (namespace stripped) to skip.
    * ``prepend_attributes`` -- forwarded to
      ``_element_dict_attribute_name`` as ``prepend_element_name``; when
      true (default) attribute keys are prefixed with the element's name.

    Attributes whose value starts with an ``xsd:`` or ``xsi:`` prefix are
    left out. Children are converted recursively with default options and
    merged with ``dict.update``, so sibling elements that produce the same
    keys are not handled: later ones overwrite earlier ones.
    """
    if element is None:
        return {}

    skipped_tags = [] if exclude_children is None else exclude_children
    own_name = util.camel_to_underscore(element.tag.split('}')[-1])
    result = {}

    # Only leaf elements contribute their text.
    if len(element) == 0 and element.text is not None:
        result[own_name] = element.text

    for attr_name, attr_value in element.attrib.items():
        if attr_value.split(':')[0] in ['xsd', 'xsi']:
            continue
        attr_key = _element_dict_attribute_name(
            attr_name, own_name, prepend_element_name=prepend_attributes)
        result[attr_key] = attr_value

    for child in element.iterchildren():
        child_tag = child.tag.split('}')[-1]
        if child_tag not in skipped_tags:
            result.update(_element_dict(child))

    return result
def _service_dict(service_info):
    """Turn a service info object into a dict with normalised key names.

    ``service_info`` is passed to ``dict()``. Each key goes through
    ``util.camel_to_underscore`` and each value through ``_cast_if_text``;
    a fixed set of awkward keys is then renamed to friendlier names.
    """
    # (old_key, new_key)
    change_keys = (
        ('aabstract', 'abstract'),
        ('maxx', 'max_x'),
        ('maxy', 'max_y'),
        ('minx', 'min_x'),
        ('miny', 'min_y'),
        ('orgwebsite', 'organization_website'),
        ('serv_url', 'service_url'),
        ('sitecount', 'site_count'),
        ('valuecount', 'value_count'),
        ('variablecount', 'variable_count'),
    )

    service_dict = {}
    for raw_key, raw_value in dict(service_info).items():
        underscored_key = util.camel_to_underscore(raw_key)
        service_dict[underscored_key] = _cast_if_text(raw_value)

    for old_key, new_key in change_keys:
        if old_key in service_dict:
            service_dict[new_key] = service_dict.pop(old_key)

    return service_dict
def _series_dict(series_info):
    """Convert a series info object into a dict with normalised key names.

    Args:
        series_info: any object accepted by ``dict()`` (a mapping or an
            iterable of key/value pairs).

    Returns:
        dict whose keys have been passed through
        ``util.camel_to_underscore`` and whose values have been passed
        through ``core._cast_if_text``, with the keys listed in
        ``change_keys`` renamed.
    """
    change_keys = [
        # (old_key, new_key)
        ('aabstract', 'abstract'),
        ('maxx', 'max_x'),
        ('maxy', 'max_y'),
        ('minx', 'min_x'),
        ('miny', 'min_y'),
        ('orgwebsite', 'organization_website'),
        ('serv_url', 'service_url'),
        ('sitecount', 'site_count'),
        ('valuecount', 'value_count'),
        ('variablecount', 'variable_count'),
    ]

    # .items() rather than .iteritems(): the latter only exists on
    # Python 2 dicts and raises AttributeError on Python 3.
    series_dict = dict([
        (util.camel_to_underscore(key), core._cast_if_text(value))
        for key, value in dict(series_info).items()
    ])

    for old_key, new_key in change_keys:
        if old_key in series_dict:
            series_dict[new_key] = series_dict[old_key]
            del series_dict[old_key]

    return series_dict
def _element_dict(element, exclude_children=None, prepend_attributes=True):
    """Convert an element to a dict with underscored tag/attribute names.

    This is a generic converter for cases where special parsing isn't
    necessary. In most cases you will want to update another dict with the
    result.

    Args:
        element: etree element to convert, or None.
        exclude_children: optional collection of child tag names (without
            namespace) whose subtrees are skipped.
        prepend_attributes: passed to ``_element_dict_attribute_name`` as
            ``prepend_element_name``. If True (default), attribute keys
            are prefixed with the element's own underscored tag name.

    Returns:
        dict built from the element's text (leaf elements only), its
        attributes, and its children; empty when ``element`` is None.

    Note: does not handle sibling elements. Children are merged with
    ``dict.update``, so children producing the same keys overwrite each
    other. Children are converted with the default options; neither
    ``exclude_children`` nor ``prepend_attributes`` is forwarded.
    """
    if element is None:
        return {}

    if exclude_children is None:
        exclude_children = []

    element_dict = {}
    element_name = util.camel_to_underscore(element.tag.split('}')[-1])

    # Only leaf elements contribute their text.
    if len(element) == 0 and element.text is not None:
        element_dict[element_name] = element.text

    # .items() rather than .iteritems(): the latter does not exist on
    # Python 3. Values carrying an xsd:/xsi: prefix are skipped.
    element_dict.update({
        _element_dict_attribute_name(
            key, element_name, prepend_element_name=prepend_attributes): value
        for key, value in element.attrib.items()
        if value.split(':')[0] not in ['xsd', 'xsi']
    })

    for child in element.iterchildren():
        if child.tag.split('}')[-1] not in exclude_children:
            element_dict.update(_element_dict(child))

    return element_dict
def _parse_metadata(values_element, metadata_elements, namespace):
    """Collect metadata child elements of ``values_element`` into dicts.

    Args:
        values_element: etree element searched with ``findall``.
        metadata_elements: iterable of ``(tag, collection_name, key)``
            tuples: the element tag to look for, the name under which the
            collection is stored in the result, and the key of each
            element dict used to index the collection.
        namespace: string prepended to each tag when searching.

    Returns:
        dict mapping each ``collection_name`` that has at least one
        non-empty element dict to ``{item[key]: item}``.

    Raises:
        KeyError: if a non-empty element dict lacks ``key``.
    """
    metadata = {}
    for tag, collection_name, key in metadata_elements:
        underscored_tag = util.camel_to_underscore(tag)
        # NOTE(review): namespace lands in _element_dict's second
        # positional parameter (exclude_children) -- confirm intended.
        collection = [
            _scrub_prefix(_element_dict(element, namespace), underscored_tag)
            for element in values_element.findall(namespace + tag)
        ]
        # Empty dicts have no `key` entry to index on; drop them so a mix
        # of empty and populated elements does not raise KeyError.
        populated = [item for item in collection if len(item)]
        if populated:
            metadata[collection_name] = {
                item[key]: item for item in populated
            }
    return metadata
def parse_site_values(content_io, namespace, query_isodate=None):
    """Parse values out of a WaterML file.

    ``content_io`` should be a file-like object and ``namespace`` is the
    string prepended to every tag name. The result maps a series code (the
    variable code, followed by ``":"`` and the statistic code when the
    variable has a statistic) to a dict with ``site``, ``values`` and
    ``variable`` entries plus any metadata collections that were found.
    When ``query_isodate`` is truthy it is stored as ``last_refresh``.
    """
    # (element name, name of collection,
    #  key from element dict to use as a key in the collections dict)
    metadata_elements = [
        ('censorCode', 'censor_codes', 'censor_code'),
        ('method', 'methods', 'id'),
        ('offset', 'offsets', 'id'),
        ('qualifier', 'qualifiers', 'id'),
        ('qualityControlLevel', 'quality_control_levels', 'id'),
        ('source', 'sources', 'id'),
    ]
    data_dict = {}

    for _event, ele in etree.iterparse(content_io):
        if ele.tag != namespace + "timeSeries":
            continue

        site_info = _parse_site_info(
            ele.find(namespace + 'sourceInfo'), namespace)
        values_element = ele.find(namespace + 'values')
        values = _parse_values(values_element, namespace)
        variable = _parse_variable(ele.find(namespace + 'variable'), namespace)

        series_code = variable['code']
        if 'statistic' in variable:
            series_code += ":" + variable['statistic']['code']

        entry = {
            'site': site_info,
            'values': values,
            'variable': variable,
        }
        data_dict[series_code] = entry

        for tag, collection_name, key in metadata_elements:
            underscored_tag = util.camel_to_underscore(tag)
            collection = []
            for element in values_element.findall(namespace + tag):
                collection.append(_scrub_prefix(
                    _element_dict(element, namespace), underscored_tag))
            if collection:
                entry[collection_name] = dict(
                    (item[key], item) for item in collection)

        if query_isodate:
            entry['last_refresh'] = query_isodate

    return data_dict
def parse_site_values(content_io, namespace, query_isodate=None):
    """Parse values out of a WaterML file.

    Args:
        content_io: file-like object containing the WaterML document.
        namespace: string prepended to every tag name that is looked up.
        query_isodate: optional value stored under ``last_refresh`` for
            every parsed series when truthy.

    Returns:
        dict mapping a series code to a dict with ``site``, ``values`` and
        ``variable`` entries, plus one entry per metadata collection that
        has at least one non-empty element. The series code is the
        variable code, followed by ``":"`` and the statistic code when the
        variable has a ``statistic`` entry.
    """
    data_dict = {}
    metadata_elements = [
        # (element name, name of collection,
        #  key from element dict to use as a key in the collections dict)
        ('censorCode', 'censor_codes', 'censor_code'),
        ('method', 'methods', 'id'),
        ('offset', 'offsets', 'id'),
        ('qualifier', 'qualifiers', 'id'),
        ('qualityControlLevel', 'quality_control_levels', 'id'),
        ('source', 'sources', 'id'),
    ]

    for (event, ele) in etree.iterparse(content_io):
        if ele.tag == namespace + "timeSeries":
            source_info_element = ele.find(namespace + 'sourceInfo')
            site_info = _parse_site_info(source_info_element, namespace)
            values_element = ele.find(namespace + 'values')
            values = _parse_values(values_element, namespace)
            var_element = ele.find(namespace + 'variable')
            variable = _parse_variable(var_element, namespace)

            code = variable['code']
            if 'statistic' in variable:
                code += ":" + variable['statistic']['code']

            data_dict[code] = {
                'site': site_info,
                'values': values,
                'variable': variable,
            }

            for tag, collection_name, key in metadata_elements:
                underscored_tag = util.camel_to_underscore(tag)
                # NOTE(review): namespace lands in _element_dict's second
                # positional parameter (exclude_children) -- confirm
                # intended.
                collection = [
                    _scrub_prefix(_element_dict(element, namespace),
                                  underscored_tag)
                    for element in values_element.findall(namespace + tag)
                ]
                # A list is built instead of len(filter(...)): on Python 3
                # filter() returns an iterator, which has no len().
                # Empty dicts are dropped because they have no `key`
                # entry to index on.
                populated = [item for item in collection if len(item)]
                if populated:
                    data_dict[code][collection_name] = {
                        item[key]: item for item in populated
                    }

            if query_isodate:
                data_dict[code]['last_refresh'] = query_isodate

    return data_dict
def _parse_metadata(values_element, metadata_elements, namespace):
    """Gather metadata elements found under ``values_element``.

    Args:
        values_element: etree element whose children are searched with
            ``findall``.
        metadata_elements: iterable of ``(tag, collection_name, key)``
            tuples. ``tag`` is the element name to search for,
            ``collection_name`` is the key used in the returned dict and
            ``key`` selects the entry of each element dict that indexes
            the collection.
        namespace: string prepended to ``tag`` for the search.

    Returns:
        dict of ``collection_name -> {item[key]: item}``. A collection is
        only present when at least one of its element dicts is non-empty.

    Raises:
        KeyError: if a non-empty element dict has no ``key`` entry.
    """
    metadata = {}
    for tag, collection_name, key in metadata_elements:
        underscored_tag = util.camel_to_underscore(tag)
        # NOTE(review): namespace is received by _element_dict as its
        # exclude_children argument -- confirm intended.
        collection = [
            _scrub_prefix(_element_dict(element, namespace), underscored_tag)
            for element in values_element.findall(namespace + tag)
        ]
        # Index only the non-empty dicts. Indexing every item raised
        # KeyError whenever empty and populated elements were mixed.
        collection_dict = {
            item[key]: item for item in collection if len(item)
        }
        if collection_dict:
            metadata[collection_name] = collection_dict
    return metadata
def _parse_series(series, namespace):
    """Build a dict describing a series element.

    The result always has a ``variable`` entry produced by
    ``_parse_variable``. For each tag in a fixed list, the first matching
    child of ``series`` (if any) is converted with ``_element_dict``,
    passed through ``_scrub_prefix`` and stored under the underscored form
    of its tag.
    """
    candidate_tags = (
        'method',
        'Method',
        'source',
        'Source',
        'QualityControlLevel',
        'qualityControlLevel',
        'variableTimeInterval',
        'valueCount',
    )

    parsed = {}
    parsed['variable'] = _parse_variable(
        series.find(namespace + 'variable'), namespace)

    for tag in candidate_tags:
        found = series.find(namespace + tag)
        if found is None:
            continue
        # NOTE(review): found.tag is handed to camel_to_underscore as-is,
        # without first stripping a '{namespace}' prefix -- confirm.
        key = util.camel_to_underscore(found.tag)
        parsed[key] = _scrub_prefix(_element_dict(found), key)

    return parsed