def _parse_xml_legal_reports(self, xml_obj: Element):
    """ Parses existing report elements from the DQ_DataQuality element

    Args:
        xml_obj (Element): The document xml element
    Returns:

    """
    data_quality_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("DQ_DataQuality"),
        xml_obj)
    # Fetch all <report> elements below <DQ_DataQuality>, not just the first one
    report_elems = xml_helper.try_get_element_from_xml(
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("report"),
        data_quality_elem) or []
    for report_elem in report_elems:
        report = LegalReport()
        report.title = xml_helper.try_get_text_from_xml_element(
            report_elem,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("title") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        report.explanation = xml_helper.try_get_text_from_xml_element(
            report_elem,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("explanation") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        legal_date = LegalDate()
        legal_date.date = xml_helper.try_get_text_from_xml_element(
            report_elem,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Date"))
        legal_date.date_type_code = xml_helper.try_get_attribute_from_xml_element(
            report_elem,
            "codeListValue",
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
        legal_date.date_type_code_list_url = xml_helper.try_get_attribute_from_xml_element(
            report_elem,
            "codeList",
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
        report.date = legal_date
        self.legal_reports.append(report)

def parse_style(self, layer, layer_obj):
    style_xml = xml_helper.try_get_single_element_from_xml(
        "./" + GENERIC_NAMESPACE_TEMPLATE.format("Style"), layer)
    if style_xml is None:
        # no <Style> element found
        return
    style_obj = Style()
    style_obj.name = xml_helper.try_get_text_from_xml_element(
        style_xml, "./" + GENERIC_NAMESPACE_TEMPLATE.format("Name"))
    style_obj.title = xml_helper.try_get_text_from_xml_element(
        style_xml, "./" + GENERIC_NAMESPACE_TEMPLATE.format("Title"))
    legend_elem = xml_helper.try_get_single_element_from_xml(
        elem="./" + GENERIC_NAMESPACE_TEMPLATE.format("LegendURL") +
             "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
        xml_elem=style_xml)
    style_obj.legend_uri = xml_helper.get_href_attribute(legend_elem)
    style_obj.width = int(
        xml_helper.try_get_attribute_from_xml_element(
            style_xml,
            "width",
            "./" + GENERIC_NAMESPACE_TEMPLATE.format("LegendURL")) or 0)
    style_obj.height = int(
        xml_helper.try_get_attribute_from_xml_element(
            style_xml,
            "height",
            "./" + GENERIC_NAMESPACE_TEMPLATE.format("LegendURL")) or 0)
    style_obj.mime_type = MimeType.objects.filter(
        mime_type=xml_helper.try_get_text_from_xml_element(
            style_xml,
            "./" + GENERIC_NAMESPACE_TEMPLATE.format("LegendURL") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("Format"))).first()
    layer_obj.style = style_obj

def _parse_xml_legal_dates(self, xml_obj: Element):
    """ Parses existing CI_Date elements from the MD_DataIdentification element

    Args:
        xml_obj (Element): The document xml element
    Returns:

    """
    md_data_ident_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_DataIdentification"),
        xml_obj)
    legal_date_elems = xml_helper.try_get_element_from_xml(
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_Date"),
        md_data_ident_elem)
    if legal_date_elems:
        for legal_date_elem in legal_date_elems:
            legal_date = LegalDate()
            legal_date.date = xml_helper.try_get_text_from_xml_element(
                legal_date_elem,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Date"))
            legal_date.date_type_code = xml_helper.try_get_attribute_from_xml_element(
                legal_date_elem,
                "codeListValue",
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
            legal_date.date_type_code_list_url = xml_helper.try_get_attribute_from_xml_element(
                legal_date_elem,
                "codeList",
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
            self.legal_dates.append(legal_date)

def _create_dataset_from_md_metadata(self, md_metadata: Element, metadata: Metadata) -> Dataset:
    """ Creates a Dataset record from xml data

    Args:
        md_metadata (Element): The xml element which holds the data
        metadata (Metadata): The related metadata element
    Returns:
        dataset (Dataset): The dataset record
    """
    dataset = Dataset()
    dataset.language_code = metadata.language_code
    dataset.language_code_list_url = xml_helper.try_get_attribute_from_xml_element(
        md_metadata,
        "codeList",
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("language") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("LanguageCode"))
    dataset.character_set_code = xml_helper.try_get_text_from_xml_element(
        md_metadata,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("characterSet") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("MD_CharacterSetCode"))
    dataset.character_set_code_list_url = xml_helper.try_get_attribute_from_xml_element(
        md_metadata,
        "codeList",
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("characterSet") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("MD_CharacterSetCode"))
    dataset.date_stamp = xml_helper.try_get_text_from_xml_element(
        md_metadata,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("dateStamp") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("Date"))
    dataset.metadata_standard_name = xml_helper.try_get_text_from_xml_element(
        md_metadata,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("metadataStandardName") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
    dataset.metadata_standard_version = xml_helper.try_get_text_from_xml_element(
        md_metadata,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("metadataStandardVersion") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
    dataset.update_frequency_code = xml_helper.try_get_text_from_xml_element(
        md_metadata,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_MaintenanceFrequencyCode"))
    dataset.update_frequency_code_list_url = xml_helper.try_get_attribute_from_xml_element(
        md_metadata,
        "codeList",
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_MaintenanceFrequencyCode"))
    dataset.use_limitation = xml_helper.try_get_text_from_xml_element(
        md_metadata,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("useLimitation") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
    dataset.lineage_statement = xml_helper.try_get_text_from_xml_element(
        md_metadata,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("statement") +
        "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
    return dataset

def _overwrite_capabilities_iso_metadata_links(xml_obj: _Element, metadata: Metadata):
    """ Overwrites links in the capabilities document

    Args:
        xml_obj (_Element): The xml_object of the document
        metadata (Metadata): The metadata object, holding the data
    Returns:

    """
    # get list of all iso md links that really exist (from the metadata object)
    iso_md_links = metadata.get_related_metadata_uris()

    # get list of all MetadataURL elements from the capabilities element
    xml_links = xml_helper.try_get_element_from_xml("./MetadataURL", xml_obj)
    for xml_link in xml_links:
        xml_online_resource_elem = xml_helper.try_get_element_from_xml(
            "./OnlineResource", xml_link)
        xml_link_attr = xml_helper.try_get_attribute_from_xml_element(
            xml_online_resource_elem, "xlink:href")
        if xml_link_attr in iso_md_links:
            # we still use this link, so we are good.
            # Remove it from iso_md_links to get an overview of which links are left over in the end.
            # Those remaining links must be new!
            iso_md_links.remove(xml_link_attr)
        else:
            # this link does not seem to exist anymore -> remove it from the xml
            xml_helper.remove_element(xml_link)

    # what is left over in iso_md_links are new links that must be added to the capabilities doc
    for new_link in iso_md_links:
        xml_helper.add_iso_md_element(xml_obj, new_link)

def _parse_parameter_metadata(self, upper_elem):
    """ Parses the <Parameter> elements inside of <OperationsMetadata>

    Args:
        upper_elem (Element): The upper xml element
    Returns:
        parameter_map (dict): Mapped parameters and values
    """
    parameter_objs = xml_helper.try_get_element_from_xml(
        "./" + GENERIC_NAMESPACE_TEMPLATE.format("Parameter"),
        upper_elem
    )
    parameter_map = {}
    for parameter in parameter_objs:
        param_name = xml_helper.try_get_attribute_from_xml_element(
            parameter,
            "name"
        )
        param_val = xml_helper.try_get_text_from_xml_element(
            parameter,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Value")
        )
        parameter_map[param_name] = param_val
    return parameter_map

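# A sketch of the mapping this helper produces, assuming a typical OWS
# <Operation> element (the XML below is illustrative, not taken from a real
# capabilities document):
#
#   <ows:Operation name="GetRecords">
#     <ows:Parameter name="outputFormat">
#       <ows:Value>application/xml</ows:Value>
#     </ows:Parameter>
#   </ows:Operation>
#
# would be parsed into {"outputFormat": "application/xml"}. Only one <Value>
# per <Parameter> is kept, since try_get_text_from_xml_element appears to
# resolve a single text node; _parse_operations_metadata() later reads the
# "outputFormat" key from this map.
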
def test_get_records_by_id(self):
    """ Tests whether GetRecordById returns exactly the record that matches the requested identifier.

    Returns:

    """
    get_records_param = {
        "service": "CSW",
        "version": "2.0.2",
        "request": "GetRecordById",
        "id": self.test_id,
        "elementsetname": "full",
    }
    response = self.client.get(reverse(CSW_PATH), data=get_records_param)
    status_code = response.status_code
    content = response.content
    content_xml = xml_helper.parse_xml(content)

    self.assertEqual(status_code, 200, WRONG_STATUS_CODE_TEMPLATE.format(status_code))
    self.assertIsNotNone(content_xml, INVALID_XML_MSG)

    # Check that the results are correct in amount and quality
    num_returned_elems = int(
        xml_helper.try_get_attribute_from_xml_element(
            xml_elem=content_xml,
            attribute="numberOfRecordsMatched",
            elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults")))
    self.assertEqual(
        num_returned_elems, 1,
        "More than one element returned on GetRecordById, although only one identifier was requested!"
    )
    real_returned_elems = xml_helper.try_get_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("Record"), content_xml)
    num_real_returned_elems = len(real_returned_elems)
    self.assertEqual(
        num_real_returned_elems, num_returned_elems,
        "csw:SearchResults contains a wrong numberOfRecordsMatched! {} stated but {} returned!"
        .format(num_returned_elems, num_real_returned_elems))

    # Resolve the identifier relative to each returned record (".//", not "//",
    # which would search the whole document again)
    identifiers = [
        xml_helper.try_get_text_from_xml_element(
            real_returned_elem,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("identifier"))
        for real_returned_elem in real_returned_elems
    ]
    identifiers_identical = [
        identifier == self.test_id for identifier in identifiers
    ]
    self.assertTrue(
        False not in identifiers_identical,
        "Elements with non-matching identifiers have been returned: {}".format(
            ", ".join(identifiers)))

def _parse_operations_metadata(self, upper_elem):
    """ Parses the <Operation> elements inside of <OperationsMetadata>

    Args:
        upper_elem (Element): The upper xml element
    Returns:

    """
    operations_objs = xml_helper.try_get_element_from_xml(
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation"),
        upper_elem
    )
    attribute_map = {
        OGCOperationEnum.GET_CAPABILITIES.value: 'get_capabilities_uri',
        OGCOperationEnum.DESCRIBE_RECORD.value: 'describe_record_uri',
        OGCOperationEnum.GET_RECORDS.value: 'get_records_uri',
        OGCOperationEnum.GET_RECORD_BY_ID.value: 'get_record_by_id_uri',
    }
    for operation in operations_objs:
        operation_name = xml_helper.try_get_attribute_from_xml_element(
            operation,
            "name",
        )
        get_uri = xml_helper.try_get_single_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"),
            operation
        )
        get_uri = xml_helper.get_href_attribute(get_uri) if get_uri is not None else None

        post_uri = xml_helper.try_get_single_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"),
            operation
        )
        post_uri = xml_helper.get_href_attribute(post_uri) if post_uri is not None else None

        if attribute_map.get(operation_name):
            setattr(self, attribute_map.get(operation_name) + '_GET', get_uri)
            setattr(self, attribute_map.get(operation_name) + '_POST', post_uri)
        else:
            # the given operation is not supported for now
            pass

        parameters = self._parse_parameter_metadata(operation)
        output_format = parameters.get("outputFormat", None)
        if output_format is not None:
            self.formats_list.append(
                MimeType.objects.get_or_create(
                    operation=operation_name,
                    mime_type=output_format,
                )[0]
            )

def _transform_constraint_to_cql_recursive(upper_elem: Element):
    constraints = []
    connector_tags = ["and", "or", "not"]
    # Prevent <ogc:Filter> from being used as upper_tag joiner in the end
    upper_tag = QName(upper_elem).localname.lower()
    upper_tag = upper_tag if upper_tag in connector_tags else ""

    elements = upper_elem.getchildren()
    for child in elements:
        child_tag = QName(child).localname
        if child_tag.lower() in connector_tags:
            constraints.append(_transform_constraint_to_cql_recursive(child))
        else:
            property_name = xml_helper.try_get_text_from_xml_element(
                elem="./" + GENERIC_NAMESPACE_TEMPLATE.format("PropertyName"),
                xml_elem=child)
            literal = xml_helper.try_get_text_from_xml_element(
                elem="./" + GENERIC_NAMESPACE_TEMPLATE.format("Literal"),
                xml_elem=child)
            expr = ""
            if child_tag == "PropertyIsLike":
                expr = "like"
                wild_card = xml_helper.try_get_attribute_from_xml_element(
                    child, "wildCard")
                literal = literal.replace(wild_card, "%")
            elif child_tag == "PropertyIsEqualTo":
                expr = "="
            elif child_tag == "PropertyIsNotEqualTo":
                expr = "!="
            elif child_tag == "PropertyIsGreaterThanOrEqualTo":
                expr = ">="
            elif child_tag == "PropertyIsGreaterThan":
                expr = ">"
            elif child_tag == "PropertyIsLessThanOrEqualTo":
                expr = "<="
            elif child_tag == "PropertyIsLessThan":
                expr = "<"
            else:
                raise ValueError("Unsupported {} found!".format(child_tag), "Filter")
            constraints.append("{} {} {}".format(property_name, expr, literal))
    constraint = " {} ".format(upper_tag).join(constraints)
    return constraint

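# Usage sketch (hedged; the namespace prefixes below are illustrative): for an
# <ogc:Filter> such as
#
#   <ogc:Filter>
#     <ogc:And>
#       <ogc:PropertyIsEqualTo>
#         <ogc:PropertyName>dc:title</ogc:PropertyName>
#         <ogc:Literal>roads</ogc:Literal>
#       </ogc:PropertyIsEqualTo>
#       <ogc:PropertyIsLike wildCard="*" singleChar="?" escapeChar="\">
#         <ogc:PropertyName>dc:subject</ogc:PropertyName>
#         <ogc:Literal>transport*</ogc:Literal>
#       </ogc:PropertyIsLike>
#     </ogc:And>
#   </ogc:Filter>
#
# the recursion should yield the CQL string
#
#   dc:title = roads and dc:subject like transport%
#
# The <ogc:Filter> root itself contributes no joiner, since only and/or/not
# count as connector tags, and the declared wildCard character is rewritten to
# the CQL wildcard "%".
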
def parse_xml(self):
    """ Reads the needed data from the xml and writes it to the ISOMetadata instance (self)

    Returns:
        nothing
    """
    xml = self.raw_metadata
    xml_obj = xml_helper.parse_xml(xml)
    self.file_identifier = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:fileIdentifier/gco:CharacterString")
    self.character_set_code = xml_helper.try_get_attribute_from_xml_element(
        xml_elem=xml_obj,
        attribute="codeListValue",
        elem="//gmd:MD_Metadata/gmd:characterSet/gmd:MD_CharacterSetCode")
    if self.file_identifier is None:
        self.file_identifier = uuid.uuid4()
    self.date_stamp = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:dateStamp/gco:Date")
    self.last_change_date = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:dateStamp/gco:Date")
    self.md_standard_name = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:metadataStandardName/gco:CharacterString")
    self.md_standard_version = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:metadataStandardVersion/gco:CharacterString")
    self._parse_xml_legal_dates(xml_obj)
    self._parse_xml_legal_reports(xml_obj)

    # try to transform the last_change_date into a datetime object
    try:
        self.last_change_date = parse(self.last_change_date).replace(tzinfo=timezone.utc)
    except (ValueError, OverflowError, TypeError):
        # if this is not possible due to wrong input, just use the current time...
        self.last_change_date = timezone.now()

    self.hierarchy_level = xml_helper.try_get_attribute_from_xml_element(
        xml_obj,
        "codeListValue",
        "//gmd:MD_Metadata/gmd:hierarchyLevel/gmd:MD_ScopeCode")
    if self.hierarchy_level == "service":
        xpath_type = "srv:SV_ServiceIdentification"
    else:
        xpath_type = "gmd:MD_DataIdentification"
    self.title = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString".format(xpath_type))
    self._parse_xml_dataset_id(xml_obj, xpath_type)
    self.abstract = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:abstract/gco:CharacterString".format(xpath_type))

    keywords = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem="//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString".format(xpath_type))
    for keyword in keywords:
        # compare the keyword's text (not the element object) against the collected keywords
        if keyword.text is not None and keyword.text not in self.keywords:
            self.keywords.append(
                xml_helper.try_get_text_from_xml_element(keyword))

    # an lxml element without children is falsy, so check explicitly against None
    language = xml_helper.try_get_single_element_from_xml(
        xml_elem=xml_obj,
        elem="//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:language/gmd:LanguageCode".format(xpath_type))
    if language is not None and language.text is not None:
        self.language = xml_helper.try_get_text_from_xml_element(language)

    iso_categories = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem="//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:topicCategory/gmd:MD_TopicCategoryCode".format(xpath_type))
    if iso_categories:
        for iso_category in iso_categories:
            self.iso_categories.append(
                xml_helper.try_get_text_from_xml_element(iso_category))

    # Get all values from <gmd:distributionInfo> which declare the distributionFormat
    formats = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("distributionFormat"))
    if formats:
        for format_elem in formats:
            # get the character value per format
            name_elem = xml_helper.try_get_single_element_from_xml(
                xml_elem=format_elem,
                elem=".//" + GENERIC_NAMESPACE_TEMPLATE.format("name"))
            if name_elem is None:
                continue
            val = xml_helper.try_get_text_from_xml_element(
                xml_elem=name_elem,
                elem=".//" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            self.formats.append(val)

    self.download_link = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:onLine/gmd:CI_OnlineResource[gmd:function/gmd:CI_OnLineFunctionCode/@codeListValue="download"]/gmd:linkage/gmd:URL')
    self.transfer_size = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:transferSize/gco:Real')
    self.preview_image = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:graphicOverview/gmd:MD_BrowseGraphic/gmd:fileName/gco:CharacterString".format(xpath_type))

    try:
        self.bounding_box["min_x"] = float(
            xml_helper.try_get_text_from_xml_element(
                xml_obj, "//gmd:westBoundLongitude/gco:Decimal"))
        self.bounding_box["min_y"] = float(
            xml_helper.try_get_text_from_xml_element(
                xml_obj, "//gmd:southBoundLatitude/gco:Decimal"))
        self.bounding_box["max_x"] = float(
            xml_helper.try_get_text_from_xml_element(
                xml_obj, "//gmd:eastBoundLongitude/gco:Decimal"))
        self.bounding_box["max_y"] = float(
            xml_helper.try_get_text_from_xml_element(
                xml_obj, "//gmd:northBoundLatitude/gco:Decimal"))
    except TypeError:
        self.bounding_box = None

    self._parse_xml_polygons(xml_obj, xpath_type)

    self.tmp_extent_begin = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:beginPosition".format(xpath_type))
    if self.tmp_extent_begin is None:
        self.tmp_extent_begin = "1900-01-01"
    self.tmp_extent_end = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:endPosition".format(xpath_type))
    if self.tmp_extent_end is None:
        self.tmp_extent_end = "1900-01-01"

    equivalent_scale = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:spatialResolution/gmd:MD_Resolution/gmd:equivalentScale/gmd:MD_RepresentativeFraction/gmd:denominator/gco:Integer".format(xpath_type))
    ground_res = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:spatialResolution/gmd:MD_Resolution/gmd:distance/gco:Distance".format(xpath_type))
    if equivalent_scale is not None and int(equivalent_scale) > 0:
        self.spatial_res_val = equivalent_scale
        self.spatial_res_type = "scaleDenominator"
    elif ground_res is not None and len(ground_res) > 0:
        self.spatial_res_val = ground_res
        self.spatial_res_type = "groundDistance"

    self.ref_system = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:code/gco:CharacterString")
    self.ref_system_version = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:version/gco:CharacterString")
    self.ref_system_authority = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:authority/gmd:CI_Citation/gmd:title/gco:CharacterString")
    epsg_api = EpsgApi()
    if self.ref_system is not None:
        self.ref_system = "EPSG:{}".format(
            epsg_api.get_subelements(self.ref_system).get("code"))

    # gmd:CI_OnLineFunctionCode
    dist_func_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_OnLineFunctionCode"),
        xml_obj)
    self.distribution_function = xml_helper.try_get_attribute_from_xml_element(
        dist_func_elem,
        "codeListValue",
    )
    del dist_func_elem

    # gmd:MD_RepresentativeFraction
    fraction_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_RepresentativeFraction"),
        xml_obj)
    self.fraction_denominator = xml_helper.try_get_text_from_xml_element(
        fraction_elem,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Integer"))
    del fraction_elem

    # gmd:useLimitation
    limit_elem = xml_helper.try_get_single_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("useLimitation"),
        xml_obj)
    self.use_limitation = xml_helper.try_get_text_from_xml_element(
        limit_elem,
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
    del limit_elem

    self.lineage = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        "//gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:statement/gco:CharacterString")

    restriction_code_attr_val = xml_helper.try_get_element_from_xml(
        xml_elem=xml_obj,
        elem='//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints/gmd:useConstraints/gmd:MD_RestrictionCode/@codeListValue'.format(xpath_type))
    if len(restriction_code_attr_val) >= 2:
        legal_constraints = ""
        if restriction_code_attr_val[0] == 'license' and restriction_code_attr_val[1] == 'otherRestrictions':
            other_constraints = xml_helper.try_get_element_from_xml(
                xml_elem=xml_obj,
                elem='//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints[gmd:useConstraints/gmd:MD_RestrictionCode/@codeListValue="otherRestrictions"]/gmd:otherConstraints/gco:CharacterString'.format(xpath_type))
            for constraint in other_constraints:
                tmp_constraint = xml_helper.try_get_text_from_xml_element(
                    xml_elem=constraint)
                try:
                    constraint_json = json.loads(tmp_constraint)
                    self.license_source_note = constraint_json.get("quelle", None)
                    self.license_json = constraint_json
                except ValueError:
                    # no, this is not json!
                    # handle it as normal text
                    legal_constraints += tmp_constraint + ";"
        self.fees = legal_constraints

    self.access_constraints = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints[gmd:accessConstraints/gmd:MD_RestrictionCode/@codeListValue="otherRestrictions"]/gmd:otherConstraints/gco:CharacterString'.format(xpath_type))
    self.responsible_party = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:organisationName/gco:CharacterString'.format(xpath_type))
    self.contact_person = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:individualName/gco:CharacterString'.format(xpath_type))
    self.contact_phone = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:phone/gmd:CI_Telephone/gmd:voice/gco:CharacterString'.format(xpath_type))
    self.contact_email = xml_helper.try_get_text_from_xml_element(
        xml_obj,
        '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString'.format(xpath_type))

    update_frequency = xml_helper.try_get_attribute_from_xml_element(
        xml_elem=xml_obj,
        attribute="codeListValue",
        elem='//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation/gmd:maintenanceAndUpdateFrequency/gmd:MD_MaintenanceFrequencyCode'.format(xpath_type))
    if update_frequency in self.valid_update_frequencies:
        self.update_frequency = update_frequency

    # inspire regulations
    legislations = {"inspire_rules": []}
    with open(INSPIRE_LEGISLATION_FILE, "r", encoding="utf-8") as _file:
        legislations = json.load(_file)
    for legislation in legislations["inspire_rules"]:
        reg = {
            "name": legislation.get("name", None),
            "date": legislation.get("date", "1900-01-01"),
            "pass": None,
        }
        statement = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:report/gmd:DQ_DomainConsistency/gmd:result/gmd:DQ_ConformanceResult[gmd:specification/gmd:CI_Citation/gmd:title/gco:CharacterString="{}" and gmd:specification/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:Date="{}"]/gmd:pass/gco:Boolean'.format(reg["name"], reg["date"]))
        statement_val = utils.resolve_boolean_attribute_val(statement)
        if statement_val is None:
            reg["pass"] = "******"
            self.inspire_interoperability = False
        else:
            reg["pass"] = statement_val
            # if even one legislation is not fulfilled, we do not have interoperability
            if not statement_val:
                self.inspire_interoperability = False
        self.interoperability_list.append(reg)

def _parse_operations_metadata(self, upper_elem):
    """ Parses the <Operation> elements inside of <OperationsMetadata>

    Args:
        upper_elem (Element): The upper xml element
    Returns:

    """
    operations_objs = xml_helper.try_get_element_from_xml(
        ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation"), upper_elem)
    attribute_map = {
        OGCOperationEnum.GET_CAPABILITIES.value: 'get_capabilities_uri',
        OGCOperationEnum.DESCRIBE_RECORD.value: 'describe_record_uri',
        OGCOperationEnum.GET_RECORDS.value: 'get_records_uri',
        OGCOperationEnum.GET_RECORD_BY_ID.value: 'get_record_by_id_uri',
    }
    for operation in operations_objs:
        operation_name = xml_helper.try_get_attribute_from_xml_element(
            operation,
            "name",
        )
        get_uri = xml_helper.try_get_single_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation)
        # diagnostic traces, logged at debug level
        csw_logger.debug("Type of returned object of get_uri: {}".format(type(get_uri)))
        get_uri = xml_helper.get_href_attribute(get_uri) if get_uri is not None else None

        post_uris = xml_helper.try_get_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation)
        number_of_post_endpoints = len(post_uris)
        if number_of_post_endpoints > 1:
            # Prefer the XML-encoded POST endpoint if the operation offers more than one
            post_uri = xml_helper.try_get_single_element_from_xml(
                ".//*[local-name()='Post'][.//ows:Constraint/ows:Value='XML']",
                operation)
        else:
            post_uri = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation)
        csw_logger.debug(
            "Number of Post endpoint entries: {} for operation {}".format(
                number_of_post_endpoints, operation_name))
        csw_logger.debug("Type of returned object of post_uri: {}".format(type(post_uri)))
        post_uri = xml_helper.get_href_attribute(post_uri) if post_uri is not None else None

        if attribute_map.get(operation_name):
            setattr(self, attribute_map.get(operation_name) + '_GET', get_uri)
            setattr(self, attribute_map.get(operation_name) + '_POST', post_uri)
        else:
            # the given operation is not supported for now
            pass

        parameters = self._parse_parameter_metadata(operation)
        output_format = parameters.get("outputFormat", None)
        if output_format is not None:
            self.formats_list.append(
                MimeType.objects.get_or_create(
                    operation=operation_name,
                    mime_type=output_format,
                )[0])

def test_proxy_setting(self):
    """ Tests whether the proxy can be set properly.

    Returns:

    """
    metadata = self.service_wms.metadata

    # To avoid running celery in a separate test instance, we do not call the route. Instead we call the logic,
    # which is used to process access settings, directly.
    async_process_securing_access(
        metadata.id,
        use_proxy=True,
        log_proxy=True,
        restrict_access=False,
    )
    self.cap_doc_wms.refresh_from_db()
    doc_unsecured = self.cap_doc_wms.content
    doc_secured = Document.objects.get(
        metadata=metadata,
        document_type=DocumentEnum.CAPABILITY.value,
        is_original=False,
    ).content

    # Check for all operations whether the uris have been changed!
    # Do not check GetCapabilities, since we always change this uri during registration!
    # Make sure all versions can be matched by the code - the xml structure differs a lot from version to version
    service_version = metadata.get_service_version()
    if metadata.is_service_type(OGCServiceEnum.WMS):
        operations = [
            OGCOperationEnum.GET_MAP.value,
            OGCOperationEnum.GET_FEATURE_INFO.value,
            OGCOperationEnum.DESCRIBE_LAYER.value,
            OGCOperationEnum.GET_LEGEND_GRAPHIC.value,
            OGCOperationEnum.GET_STYLES.value,
            OGCOperationEnum.PUT_STYLES.value,
        ]
    elif metadata.is_service_type(OGCServiceEnum.WFS):
        operations = [
            OGCOperationEnum.GET_FEATURE.value,
            OGCOperationEnum.TRANSACTION.value,
            OGCOperationEnum.LOCK_FEATURE.value,
            OGCOperationEnum.DESCRIBE_FEATURE_TYPE.value,
        ]
    else:
        operations = []

    # create xml documents from string documents and fetch only the relevant <Request> element for each
    xml_unsecured = xml_helper.parse_xml(doc_unsecured)
    request_unsecured = xml_helper.try_get_single_element_from_xml(
        elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("Request"),
        xml_elem=xml_unsecured)
    xml_secured = xml_helper.parse_xml(doc_secured)
    request_secured = xml_helper.try_get_single_element_from_xml(
        elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("Request"),
        xml_elem=xml_secured)

    for operation in operations:
        # Get <OPERATION> element
        operation_unsecured = xml_helper.try_get_single_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_unsecured)
        operation_secured = xml_helper.try_get_single_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_secured)

        if service_version == OGCServiceVersionEnum.V_1_0_0:
            if metadata.is_service_type(OGCServiceEnum.WMS):
                # The WMS 1.0.0 specification uses <OPERATION> instead of <GetOPERATION> for any operation element.
                operation = operation.replace("Get", "")
                # Get <OPERATION> element again
                operation_unsecured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_unsecured)
                operation_secured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_secured)

            # Version 1.0.0 holds the uris in the "onlineResource" attribute of <Get> and <Post>
            get_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_unsecured)
            get_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_secured)
            post_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_unsecured)
            post_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_secured)

            online_res = "onlineResource"
            get_unsecured = xml_helper.try_get_attribute_from_xml_element(get_unsecured, online_res)
            get_secured = xml_helper.try_get_attribute_from_xml_element(get_secured, online_res)
            post_unsecured = xml_helper.try_get_attribute_from_xml_element(post_unsecured, online_res)
            post_secured = xml_helper.try_get_attribute_from_xml_element(post_secured, online_res)

            # Assert that all get/post elements are not None
            self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))
            self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))
            # Assert that the secured version is different from the unsecured one
            self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))
            self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))
            # Assert that the HOST_NAME constant appears in the secured uri
            # (assertIn, since these are plain strings, not response objects)
            self.assertIn(HOST_NAME, get_secured)
            self.assertIn(HOST_NAME, post_secured)
        elif service_version == OGCServiceVersionEnum.V_1_1_0 \
                or service_version == OGCServiceVersionEnum.V_2_0_0 \
                or service_version == OGCServiceVersionEnum.V_2_0_2:
            # Only WFS
            # Get <Operation> element again, since the operation is now identified using an attribute, not an element tag
            operation_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation") + "[@name='" + operation + "']",
                request_unsecured
            )
            operation_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation") + "[@name='" + operation + "']",
                request_secured
            )
            # Version 1.1.0 holds the uris in the href attribute of <Get> and <Post>
            get_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_unsecured)
            get_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_secured)
            post_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_unsecured)
            post_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_secured)

            get_unsecured = xml_helper.get_href_attribute(get_unsecured)
            get_secured = xml_helper.get_href_attribute(get_secured)
            post_unsecured = xml_helper.get_href_attribute(post_unsecured)
            post_secured = xml_helper.get_href_attribute(post_secured)

            # Assert that all get/post elements are not None
            self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))
            self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))
            # Assert that the secured version is different from the unsecured one
            self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))
            self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))
            # Assert that the HOST_NAME constant appears in the secured uri
            self.assertIn(HOST_NAME, get_secured)
            self.assertIn(HOST_NAME, post_secured)
        elif service_version == OGCServiceVersionEnum.V_1_1_1 or service_version == OGCServiceVersionEnum.V_1_3_0:
            # Version 1.1.1 holds the uris in the <OnlineResource> element inside <Get> and <Post>
            get_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_unsecured
            )
            get_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_secured
            )
            post_unsecured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_unsecured
            )
            post_secured = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post") + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                operation_secured
            )
            get_unsecured = xml_helper.get_href_attribute(get_unsecured)
            get_secured = xml_helper.get_href_attribute(get_secured)
            post_unsecured = xml_helper.get_href_attribute(post_unsecured)
            post_secured = xml_helper.get_href_attribute(post_secured)

            # Either both (secured/unsecured) uris exist or none of them.
            # Operations that are not supported by the service may be missing entirely.
            if get_secured is not None and get_unsecured is not None:
                # Assert that the secured version is different from the unsecured one
                self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))
                # Assert that the HOST_NAME constant appears in the secured uri
                self.assertTrue(HOST_NAME in get_secured)
            if post_secured is not None and post_unsecured is not None:
                self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))
                self.assertTrue(HOST_NAME in post_secured)
        else:
            pass

def harvest(self, task_id: str = None):
    """ Starts the harvesting procedure

    Returns:

    """
    # Create a pending task record for the database first!
    task_exists = PendingTask.objects.filter(
        description__icontains=self.metadata.title).exists()
    if task_exists:
        raise ProcessLookupError(_("Harvesting is currently performed"))
    else:
        async_task_id = task_id or self.metadata.id
        self.pending_task = PendingTask.objects.create(
            task_id=async_task_id,
            description=json.dumps({
                "service": self.metadata.title,
                "phase": "Connecting...",
            }),
            progress=0,
            remaining_time=None,
            created_by=self.harvesting_group)

    # Fill deleted_metadata with all persisted metadata, so we can eliminate each entry that is still provided by
    # the catalogue. In the end we will have a list which contains the metadata IDs that are no longer found in the catalogue.
    all_persisted_metadata_identifiers = self.metadata.get_related_metadatas(
        filters={
            'to_metadatas__relation_type': MetadataRelationEnum.HARVESTED_THROUGH.value
        }).values_list("identifier", flat=True)
    # Use a set instead of a list to speed up lookups afterwards
    self.deleted_metadata.update(all_persisted_metadata_identifiers)

    # Perform the initial "hits" request to get an overview of how much data will be fetched
    hits_response, status_code = self._get_harvest_response(result_type="hits")
    descr = json.loads(self.pending_task.description)
    if status_code != 200:
        descr["phase"] = "Harvest failed: HTTP Code {}".format(status_code)
        self.pending_task.description = json.dumps(descr)
        self.pending_task.save()
        raise ConnectionError(
            _("Harvest failed: Code {}\n{}").format(status_code, hits_response))
    xml_response = xml_helper.parse_xml(hits_response)
    if xml_response is None:
        descr["phase"] = "Response is not a valid xml"
        self.pending_task.description = json.dumps(descr)
        self.pending_task.save()
        raise ConnectionError(
            _("Response is not a valid xml: \n{}".format(hits_response)))
    try:
        total_number_to_harvest = int(
            xml_helper.try_get_attribute_from_xml_element(
                xml_response,
                "numberOfRecordsMatched",
                "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
            ))
    except TypeError:
        csw_logger.error("Malicious Harvest response: {}".format(hits_response))
        descr["phase"] = "Harvest response incorrect. Inform an administrator!"
        self.pending_task.description = json.dumps(descr)
        self.pending_task.save()
        raise AttributeError(_("Harvest response is missing important data!"))

    descr["phase"] = "Start harvesting..."
    self.pending_task.description = json.dumps(descr)
    self.pending_task.save()

    progress_step_per_request = float(
        self.max_records_per_request / total_number_to_harvest) * 100

    # There are wrongly configured CSWs which do not return nextRecord=0 on the last page but instead continue with
    # nextRecord=1. We need to prevent endless loops by checking whether we already worked on these positions and
    # simply end there!
    processed_start_positions = set()

    t_start = time()
    number_rest_to_harvest = total_number_to_harvest
    number_of_harvested = 0
    self.harvest_result.timestamp_start = timezone.now()
    self.harvest_result.save()
    page_cacher = PageCacher()

    # Run as long as we can fetch data and as long as the user does not abort the pending task!
    while self.pending_task is not None:
        processed_start_positions.add(self.start_position)
        # Get response
        next_response, status_code = self._get_harvest_response(result_type="results")
        found_entries = self._process_harvest_response(next_response)

        # Calculate time since the loop started
        duration = time() - t_start
        number_rest_to_harvest -= self.max_records_per_request
        number_of_harvested += found_entries
        self.harvest_result.number_results = number_of_harvested
        self.harvest_result.save()

        # Remove cached pages of API and CSW
        page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
        page_cacher.remove_pages(CSW_CACHE_PREFIX)

        if self.start_position == 0 or self.start_position in processed_start_positions:
            # We are done!
            estimated_time_for_all = timezone.timedelta(seconds=0)
            break
        else:
            seconds_for_rest = (number_rest_to_harvest * (duration / number_of_harvested))
            estimated_time_for_all = timezone.timedelta(seconds=seconds_for_rest)
        self._update_pending_task(self.start_position,
                                  total_number_to_harvest,
                                  progress_step_per_request,
                                  estimated_time_for_all)

    # Add HarvestResult infos
    self.harvest_result.timestamp_end = timezone.now()
    self.harvest_result.number_results = number_of_harvested
    self.harvest_result.save()

    # Delete metadata records which could not be found in the catalogue anymore.
    # This must only be done if the harvesting ran completely. Skip this part if the user aborted the harvest!
    if self.pending_task is not None:
        deleted_metadatas = Metadata.objects.filter(
            identifier__in=self.deleted_metadata)
        deleted_metadatas.delete()
        self.pending_task.delete()

    # Remove cached pages of API and CSW
    page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
    page_cacher.remove_pages(CSW_CACHE_PREFIX)

def _md_metadata_parse_to_dict(self, md_metadata_entries: list) -> list:
    """ Reads the most important data from MD_Metadata xml elements

    Args:
        md_metadata_entries (list): The xml MD_Metadata elements
    Returns:
        ret_list (list): The list containing dicts
    """
    ret_list = []
    for md_metadata in md_metadata_entries:
        md_data_entry = {}
        # Check before anything else whether this metadata type can be skipped!
        hierarchy_level = xml_helper.try_get_attribute_from_xml_element(
            md_metadata,
            "codeListValue",
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("hierarchyLevel") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("MD_ScopeCode"))
        metadata_type = hierarchy_level
        md_data_entry["metadata_type"] = metadata_type
        if not HARVEST_METADATA_TYPES.get(metadata_type, False):
            continue

        _id = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("fileIdentifier") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        md_data_entry["id"] = _id

        parent_id = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("parentIdentifier") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        md_data_entry["parent_id"] = parent_id

        # A workaround, so we do not need to check whether SV_ServiceIdentification or MD_DataIdentification is present
        # in this metadata: Simply take the direct parent and perform a deeper nested search on the inside of this element.
        # Yes, we could simply decide based on the hierarchyLevel attribute whether to search for SV_xxx or MD_yyy.
        # No, there are metadata entries which do not follow these guidelines and have "service" with MD_yyy.
        # Yes, they are important, since they can be found in the INSPIRE catalogue (07/2020).
        identification_elem = xml_helper.try_get_single_element_from_xml(
            xml_elem=md_metadata,
            elem=".//" + GENERIC_NAMESPACE_TEMPLATE.format("identificationInfo"))
        title = xml_helper.try_get_text_from_xml_element(
            identification_elem,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("citation") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CI_Citation") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("title") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        md_data_entry["title"] = title

        language_code = xml_helper.try_get_attribute_from_xml_element(
            md_metadata,
            "codeListValue",
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("language") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("LanguageCode"))
        md_data_entry["language_code"] = language_code

        date_stamp = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            "./" + GENERIC_NAMESPACE_TEMPLATE.format("dateStamp") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("Date")
        ) or xml_helper.try_get_text_from_xml_element(
            md_metadata,
            "./" + GENERIC_NAMESPACE_TEMPLATE.format("dateStamp") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("DateTime"))
        try:
            md_data_entry["date_stamp"] = parse(date_stamp).replace(tzinfo=utc)
        except TypeError:
            md_data_entry["date_stamp"] = None

        abstract = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("abstract") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        md_data_entry["abstract"] = abstract

        digital_transfer_elements = xml_helper.try_get_element_from_xml(
            xml_elem=md_metadata,
            elem=".//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_DigitalTransferOptions"))
        links = []
        for elem in digital_transfer_elements:
            links_entry = {}
            resource_link = xml_helper.try_get_text_from_xml_element(
                elem,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("onLine") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CI_OnlineResource") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("linkage") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("URL"),
            )
            descr = xml_helper.try_get_text_from_xml_element(
                elem,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("onLine") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CI_OnlineResource") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("description") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            links_entry["link"] = resource_link
            links_entry["description"] = descr
            if resource_link is not None:
                # Check which type of online_resource we found -> could be GetCapabilities
                query_params = parse_qs(urlparse(resource_link.lower()).query)
                if OGCOperationEnum.GET_CAPABILITIES.value.lower() in query_params.get("request", []):
                    # Parse all possibly relevant data from the dict
                    version = query_params.get("version", [None])
                    service_type = query_params.get("service", [None])
                    md_data_entry["capabilities_original_url"] = resource_link
                    md_data_entry["service_type"] = service_type[0]
                    md_data_entry["version"] = version[0]
            links.append(links_entry)
        md_data_entry["links"] = links

        keywords = xml_helper.try_get_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("keyword") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"),
            md_metadata,
        ) or []
        keywords = [
            xml_helper.try_get_text_from_xml_element(kw) for kw in keywords
        ]
        md_data_entry["keywords"] = keywords

        access_constraints = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("otherConstraints") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        md_data_entry["access_constraints"] = access_constraints

        categories = xml_helper.try_get_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_TopicCategoryCode"),
            md_metadata,
        ) or []
        categories = [
            xml_helper.try_get_text_from_xml_element(cat) for cat in categories
        ]
        md_data_entry["categories"] = categories

        bbox_elem = xml_helper.try_get_single_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("EX_GeographicBoundingBox"),
            md_metadata)
        if bbox_elem is not None:
            extent = [
                xml_helper.try_get_text_from_xml_element(
                    bbox_elem,
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("westBoundLongitude") +
                    "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal")) or "0.0",
                xml_helper.try_get_text_from_xml_element(
                    bbox_elem,
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("southBoundLatitude") +
                    "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal")) or "0.0",
                xml_helper.try_get_text_from_xml_element(
                    bbox_elem,
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("eastBoundLongitude") +
                    "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal")) or "0.0",
                xml_helper.try_get_text_from_xml_element(
                    bbox_elem,
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("northBoundLatitude") +
                    "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal")) or "0.0",
            ]
            # There are metadata with wrong vertex notations like 50,3 instead of 50.3.
            # We should just drop them, since they are not compatible with the specifications, but here we make an
            # exception and replace the comma, since it's quite easy.
            extent = [vertex.replace(",", ".") for vertex in extent]
            try:
                bounding_geometry = GEOSGeometry(
                    Polygon.from_bbox(bbox=extent), srid=DEFAULT_SRS)
            except Exception:
                # Log the malicious extent!
                csw_logger.warning(
                    CSW_EXTENT_WARNING_LOG_TEMPLATE.format(
                        _id, self.metadata.title, extent))
                bounding_geometry = DEFAULT_SERVICE_BOUNDING_BOX_EMPTY
        else:
            bounding_geometry = DEFAULT_SERVICE_BOUNDING_BOX_EMPTY
        md_data_entry["bounding_geometry"] = bounding_geometry
        md_data_entry["contact"] = self._create_contact_from_md_metadata(md_metadata)
        md_data_entry["formats"] = self._create_formats_from_md_metadata(md_metadata)

        # Load non-metadata data
        # ToDo: Should harvesting persist non-metadata data?!
        # described_resource = None
        # metadata = None
        # if hierarchy_level == MetadataEnum.DATASET.value:
        #     described_resource = self._create_dataset_from_md_metadata(md_metadata, metadata)
        #     described_resource.metadata = metadata
        #     described_resource.is_active = True
        #     described_resource.save()
        ret_list.append(md_data_entry)
    return ret_list

def _process_harvest_response(self, next_response: bytes) -> int:
    """ Processes the harvest response content

    While the last response is being processed, the next one is already loaded to decrease run time

    Args:
        next_response (bytes): The response as bytes
    Returns:
        number_found_entries (int): The amount of found metadata records in this response
    """
    xml_response = xml_helper.parse_xml(next_response)
    if xml_response is None:
        csw_logger.error(
            "Response is no valid xml. catalogue: {}, startPosition: {}, maxRecords: {}".format(
                self.metadata.title, self.start_position, self.max_records_per_request))
        # Abort!
        self.start_position = 0
        return 0

    md_metadata_entries = xml_helper.try_get_element_from_xml(
        "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_Metadata"),
        xml_response) or []
    next_record_position = int(
        xml_helper.try_get_attribute_from_xml_element(
            xml_response,
            "nextRecord",
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
        ))
    self.start_position = next_record_position

    # Fetch found identifiers in the parent process, so self.deleted_metadata can be edited easily
    for md_identifier in md_metadata_entries:
        _id = xml_helper.try_get_text_from_xml_element(
            md_identifier,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("fileIdentifier") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        try:
            self.deleted_metadata.remove(_id)
        except KeyError:
            pass

    # Delete the response to free memory
    del xml_response

    # Process the response with multiple processes
    t_start = time()
    num_processes = int(cpu_count() / 2)
    num_processes = num_processes if num_processes >= 1 else 1
    index_step = int(len(md_metadata_entries) / num_processes)
    start_index = 0
    end_index = 0
    self.resource_list = md_metadata_entries
    process_list = []
    for i in range(0, num_processes):
        if index_step < 1:
            end_index = -1
        else:
            end_index += index_step
        p = Process(target=self._create_metadata_from_md_metadata,
                    args=(start_index, end_index))
        start_index += index_step
        process_list.append(p)
    # Close all connections to force each process to create a new one for itself
    connections.close_all()
    execute_threads(process_list)

    csw_logger.debug(
        "Harvesting '{}': runtime for parsing {} metadata entries: {}s ####".format(
            self.metadata.title, self.max_records_per_request, time() - t_start))
    return len(md_metadata_entries)

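# Partitioning sketch (a hedged illustration; the exact slice semantics depend
# on _create_metadata_from_md_metadata, which is not shown here): with 100
# fetched records and 4 worker processes, index_step is 25 and the
# (start_index, end_index) pairs handed to the workers are (0, 25), (25, 50),
# (50, 75) and (75, 100). With fewer records than processes, index_step is 0
# and every worker receives end_index = -1.
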
def harvest(self):
    """ Starts the harvesting procedure

    Returns:

    """
    absolute_url = f'<a href="{self.metadata.get_absolute_url()}">{self.metadata.title}</a>'
    # no trailing comma here - it would turn the dict into a one-element tuple
    service_json = {'id': self.metadata.pk, 'absolute_url': absolute_url}
    if current_task:
        current_task.update_state(state=states.STARTED,
                                  meta={
                                      'service': service_json,
                                      'phase': f"Connecting to {absolute_url}",
                                  })

    # Fill deleted_metadata with all persisted metadata, so we can eliminate each entry that is still provided by
    # the catalogue. In the end we will have a list which contains the metadata IDs that are no longer found in the catalogue.
    all_persisted_metadata_identifiers = self.metadata.get_related_metadatas(
        filters={
            'to_metadatas__relation_type': MetadataRelationEnum.HARVESTED_THROUGH.value
        }).values_list("identifier", flat=True)
    # Use a set instead of a list to speed up lookups afterwards
    self.deleted_metadata.update(all_persisted_metadata_identifiers)

    # Perform the initial "hits" request to get an overview of how much data will be fetched
    hits_response, status_code = self._get_harvest_response(result_type="hits")
    if status_code != 200:
        raise ConnectionError(
            _("Harvest failed: Code {}\n{}").format(status_code, hits_response))
    xml_response = xml_helper.parse_xml(hits_response)
    if xml_response is None:
        raise ConnectionError(
            _("Response is not a valid xml: \n{}".format(hits_response)))
    try:
        if current_task:
            current_task.update_state(state=states.STARTED,
                                      meta={
                                          'phase': "Calculating harvesting time",
                                      })
        total_number_to_harvest = int(
            xml_helper.try_get_attribute_from_xml_element(
                xml_response,
                "numberOfRecordsMatched",
                "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
            ))
    except TypeError:
        csw_logger.error("Malicious Harvest response: {}".format(hits_response))
        raise AttributeError(_("Harvest response is missing important data!"))

    if current_task:
        current_task.update_state(state=states.STARTED,
                                  meta={
                                      'service': service_json,
                                      'phase': "Start harvesting..."
                                  })

    self.progress_step_per_result = float(1 / total_number_to_harvest) * 100

    # There are wrongly configured CSWs which do not return nextRecord=0 on the last page but instead continue with
    # nextRecord=1. We need to prevent endless loops by checking whether we already worked on these positions and
    # simply end there!
    processed_start_positions = set()

    t_start = time()
    number_rest_to_harvest = total_number_to_harvest
    number_of_harvested = 0
    self.harvest_result.timestamp_start = timezone.now()
    self.harvest_result.save()
    page_cacher = PageCacher()

    # Run as long as we can fetch data and as long as the user does not abort the pending task!
    # The estimate stays unknown until the first batch has been processed
    estimated_time_for_all = 'unknown'
    while True:
        if current_task:
            current_task.update_state(
                state=states.STARTED,
                meta={
                    'phase': _("Harvesting next {} of {}. Time remaining: {}").format(
                        self.max_records_per_request,
                        total_number_to_harvest,
                        estimated_time_for_all),
                })
        processed_start_positions.add(self.start_position)
        # Get response
        next_response, status_code = self._get_harvest_response(result_type="results")

        if current_task:
            current_task.update_state(
                state=states.STARTED,
                meta={
                    'phase': _("Processing harvested results for the next {} of {}. Time remaining: {}").format(
                        self.max_records_per_request,
                        total_number_to_harvest,
                        estimated_time_for_all),
                })
        found_entries = self._process_harvest_response(next_response)

        # Calculate time since the loop started
        duration = time() - t_start
        number_rest_to_harvest -= self.max_records_per_request
        number_of_harvested += found_entries
        self.harvest_result.number_results = number_of_harvested
        self.harvest_result.save()

        # Remove cached pages of API and CSW
        page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
        page_cacher.remove_pages(CSW_CACHE_PREFIX)

        if self.start_position == 0 or self.start_position in processed_start_positions:
            # We are done!
            break
        else:
            seconds_for_rest = (number_rest_to_harvest * (duration / number_of_harvested))
            estimated_time_for_all = timezone.timedelta(seconds=seconds_for_rest)

    # Add HarvestResult infos
    self.harvest_result.timestamp_end = timezone.now()
    self.harvest_result.number_results = number_of_harvested
    self.harvest_result.save()

    # Delete metadata records which could not be found in the catalogue anymore.
    # This must only be done after the harvesting ran completely.
    deleted_metadatas = Metadata.objects.filter(
        identifier__in=self.deleted_metadata)
    deleted_metadatas.delete()

    # Remove cached pages of API and CSW
    page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
    page_cacher.remove_pages(CSW_CACHE_PREFIX)