    def _parse_xml_legal_reports(self, xml_obj: Element):
        """ Parses existing report elements (legal reports) from the DQ_DataQuality element

        Args:
            xml_obj (Element): The document xml element
        Returns:

        """
        data_quality_elem = xml_helper.try_get_single_element_from_xml(
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("DQ_DataQuality"),
            xml_obj)
        report_elems = xml_helper.try_get_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("report"),
            data_quality_elem) or []
        for report_elem in report_elems:
            report = LegalReport()
            report.title = xml_helper.try_get_text_from_xml_element(
                report_elem,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("title") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            report.explanation = xml_helper.try_get_text_from_xml_element(
                report_elem,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("explanation") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            legal_date = LegalDate()
            legal_date.date = xml_helper.try_get_text_from_xml_element(
                report_elem, ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Date"))
            legal_date.date_type_code = xml_helper.try_get_attribute_from_xml_element(
                report_elem, "codeListValue",
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
            legal_date.date_type_code_list_url = xml_helper.try_get_attribute_from_xml_element(
                report_elem, "codeList",
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
            report.date = legal_date
            self.legal_reports.append(report)
Example #2
    def parse_style(self, layer, layer_obj):
        style_xml = xml_helper.try_get_single_element_from_xml(
            "./" + GENERIC_NAMESPACE_TEMPLATE.format("Style"), layer)

        if style_xml is None:
            # no <Style> element found
            return

        style_obj = Style()

        style_obj.name = xml_helper.try_get_text_from_xml_element(
            style_xml, "./" + GENERIC_NAMESPACE_TEMPLATE.format("Name"))
        style_obj.title = xml_helper.try_get_text_from_xml_element(
            style_xml, "./" + GENERIC_NAMESPACE_TEMPLATE.format("Title"))
        legend_elem = xml_helper.try_get_single_element_from_xml(
            elem="./" + GENERIC_NAMESPACE_TEMPLATE.format("LegendURL") + "/" +
            GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
            xml_elem=style_xml)
        style_obj.legend_uri = xml_helper.get_href_attribute(legend_elem)
        style_obj.width = int(
            xml_helper.try_get_attribute_from_xml_element(
                style_xml, "width",
                "./" + GENERIC_NAMESPACE_TEMPLATE.format("LegendURL")) or 0)
        style_obj.height = int(
            xml_helper.try_get_attribute_from_xml_element(
                style_xml, "height",
                "./" + GENERIC_NAMESPACE_TEMPLATE.format("LegendURL")) or 0)
        style_obj.mime_type = MimeType.objects.filter(
            mime_type=xml_helper.try_get_text_from_xml_element(
                style_xml, "./" +
                GENERIC_NAMESPACE_TEMPLATE.format("LegendURL") + "/ " +
                GENERIC_NAMESPACE_TEMPLATE.format("Format"))).first()

        layer_obj.style = style_obj
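
    # Illustration (hedged): the kind of WMS capabilities <Style> fragment the
    # parser above expects, and the fields it would fill from it. The exact
    # behaviour of the xml_helper wrappers is assumed (namespace-agnostic
    # lookups by local name).
    #
    #   <Style xmlns:xlink="http://www.w3.org/1999/xlink">
    #     <Name>default</Name>
    #     <Title>Default style</Title>
    #     <LegendURL width="20" height="20">
    #       <Format>image/png</Format>
    #       <OnlineResource xlink:href="https://example.org/legend.png"/>
    #     </LegendURL>
    #   </Style>
    #
    # Expected mapping:
    #   style_obj.name       -> "default"
    #   style_obj.title      -> "Default style"
    #   style_obj.legend_uri -> "https://example.org/legend.png"
    #   style_obj.width      -> 20
    #   style_obj.height     -> 20
    #   style_obj.mime_type  -> MimeType row for "image/png" (if one exists)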
    def _parse_xml_legal_dates(self, xml_obj: Element):
        """ Parses existing CI_Date elements from the MD_DataIdentification element

        Args:
            xml_obj (Element): The document xml element
        Returns:

        """
        md_data_ident_elem = xml_helper.try_get_single_element_from_xml(
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_DataIdentification"),
            xml_obj)
        legal_date_elems = xml_helper.try_get_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_Date"),
            md_data_ident_elem)
        if legal_date_elems:
            for legal_date_elem in legal_date_elems:
                legal_date = LegalDate()
                legal_date.date = xml_helper.try_get_text_from_xml_element(
                    legal_date_elem,
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Date"))
                legal_date.date_type_code = xml_helper.try_get_attribute_from_xml_element(
                    legal_date_elem, "codeListValue", ".//" +
                    GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
                legal_date.date_type_code_list_url = xml_helper.try_get_attribute_from_xml_element(
                    legal_date_elem, "codeList", ".//" +
                    GENERIC_NAMESPACE_TEMPLATE.format("CI_DateTypeCode"))
                self.legal_dates.append(legal_date)
Example #4
    def _create_dataset_from_md_metadata(self, md_metadata: Element,
                                         metadata: Metadata) -> Dataset:
        """ Creates a Dataset record from xml data
        Args:
            md_metadata (Element): The xml element which holds the data
            metadata (Metadata): The related metadata element
        Returns:
            dataset (Dataset): The dataset record
        """
        dataset = Dataset()
        dataset.language_code = metadata.language_code
        dataset.language_code_list_url = xml_helper.try_get_attribute_from_xml_element(
            md_metadata, "codeList",
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("language") + "/" +
            GENERIC_NAMESPACE_TEMPLATE.format("LanguageCode"))
        dataset.character_set_code = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("characterSet") + "/" +
            GENERIC_NAMESPACE_TEMPLATE.format("MD_CharacterSetCode"))
        dataset.character_set_code_list_url = xml_helper.try_get_attribute_from_xml_element(
            md_metadata, "codeList",
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("characterSet") + "/" +
            GENERIC_NAMESPACE_TEMPLATE.format("MD_CharacterSetCode"))
        dataset.date_stamp = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("dateStamp") + "/" +
            GENERIC_NAMESPACE_TEMPLATE.format("Date"))
        dataset.metadata_standard_name = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("metadataStandardName") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        dataset.metadata_standard_version = xml_helper.try_get_text_from_xml_element(
            md_metadata, ".//" +
            GENERIC_NAMESPACE_TEMPLATE.format("metadataStandardVersion") +
            "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        dataset.update_frequency_code = xml_helper.try_get_text_from_xml_element(
            md_metadata, ".//" +
            GENERIC_NAMESPACE_TEMPLATE.format("MD_MaintenanceFrequencyCode"))
        dataset.update_frequency_code_list_url = xml_helper.try_get_attribute_from_xml_element(
            md_metadata, "codeList", ".//" +
            GENERIC_NAMESPACE_TEMPLATE.format("MD_MaintenanceFrequencyCode"))
        dataset.use_limitation = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("useLimitation") + "/" +
            GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        dataset.lineage_statement = xml_helper.try_get_text_from_xml_element(
            md_metadata,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("statement") + "/" +
            GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        return dataset
Example #5
def _overwrite_capabilities_iso_metadata_links(xml_obj: _Element,
                                               metadata: Metadata):
    """ Overwrites links in capabilities document

    Args:
        xml_obj (_Element): The xml_object of the document
        metadata (Metadata): The metadata object, holding the data
    Returns:

    """
    # get list of all iso md links that really exist (from the metadata object)
    iso_md_links = metadata.get_related_metadata_uris()

    # get list of all MetadataURL elements from the capabilities element
    xml_links = xml_helper.try_get_element_from_xml("./MetadataURL", xml_obj)
    for xml_link in xml_links:
        xml_online_resource_elem = xml_helper.try_get_element_from_xml(
            "./OnlineResource", xml_link)
        xml_link_attr = xml_helper.try_get_attribute_from_xml_element(
            xml_online_resource_elem, "xlink:href")
        if xml_link_attr in iso_md_links:
            # we still use this, so we are good
            # Remove this link from iso_md_links to get an overview of which links are left over in the end
            # These links must be new then!
            iso_md_links.remove(xml_link_attr)
            continue
        else:
            # this does not seem to exist anymore -> remove it from the xml
            xml_helper.remove_element(xml_link)
    # what is left over in iso_md_links are new links that must be added to the capabilities doc
    for new_link in iso_md_links:
        xml_helper.add_iso_md_element(xml_obj, new_link)
Example #6
    def _parse_parameter_metadata(self, upper_elem):
        """ Parses the <Parameter> elements inside of <OperationsMetadata>

        Args:
            upper_elem (Element): The upper xml element
        Returns:
            parameter_map (dict): Mapped parameters and values
        """
        parameter_objs = xml_helper.try_get_element_from_xml(
            "./" + GENERIC_NAMESPACE_TEMPLATE.format("Parameter"),
            upper_elem
        )
        parameter_map = {}
        for parameter in parameter_objs:
            param_name = xml_helper.try_get_attribute_from_xml_element(
                parameter,
                "name"
            )
            param_val = xml_helper.try_get_text_from_xml_element(
                parameter,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Value")
            )
            parameter_map[param_name] = param_val

        return parameter_map
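
# A minimal, self-contained sketch (plain lxml, no xml_helper) of what the
# parameter parsing above produces for a typical ows:OperationsMetadata
# fragment. The namespace URI and element names are illustrative assumptions.
from lxml import etree

OPERATION_SNIPPET = b"""
<ows:Operation xmlns:ows="http://www.opengis.net/ows" name="GetRecords">
  <ows:Parameter name="outputFormat">
    <ows:Value>application/xml</ows:Value>
  </ows:Parameter>
</ows:Operation>
"""

operation = etree.fromstring(OPERATION_SNIPPET)
parameter_map = {}
for parameter in operation.findall("{http://www.opengis.net/ows}Parameter"):
    # mirrors the name -> <Value> text mapping of _parse_parameter_metadata()
    parameter_map[parameter.get("name")] = parameter.findtext(
        "{http://www.opengis.net/ows}Value")

print(parameter_map)  # {'outputFormat': 'application/xml'}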
Example #7
    def test_get_records_by_id(self):
        """ Test for checking if the GetRecordsById is working fine or not.

        Returns:

        """
        get_records_param = {
            "service": "CSW",
            "version": "2.0.2",
            "request": "GetRecordById",
            "id": self.test_id,
            "elementsetname": "full",
        }

        response = self.client.get(reverse(CSW_PATH), data=get_records_param)
        status_code = response.status_code
        content = response.content
        content_xml = xml_helper.parse_xml(content)

        self.assertEqual(response.status_code, 200,
                         WRONG_STATUS_CODE_TEMPLATE.format(status_code))
        self.assertIsNotNone(content_xml, INVALID_XML_MSG)

        # Check that the results are correct in amount and quality
        num_returned_elems = int(
            xml_helper.try_get_attribute_from_xml_element(
                xml_elem=content_xml,
                attribute="numberOfRecordsMatched",
                elem="//" +
                GENERIC_NAMESPACE_TEMPLATE.format("SearchResults")))
        self.assertEqual(
            num_returned_elems, 1,
            "More than one element returned on GetRecordById for a single requested identifier!"
        )
        real_returned_elems = xml_helper.try_get_element_from_xml(
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("Record"), content_xml)
        num_real_returned_elems = len(real_returned_elems)
        self.assertEqual(
            num_real_returned_elems, num_returned_elems,
            "csw:SearchResults contains wrong numberOfRecordsMatched! {} stated but {} returned!"
            .format(num_returned_elems, num_real_returned_elems))

        identifiers = [
            xml_helper.try_get_text_from_xml_element(
                real_returned_elem,
                "//" + GENERIC_NAMESPACE_TEMPLATE.format("identifier"))
            for real_returned_elem in real_returned_elems
        ]
        identifiers_identical = [
            identifier == self.test_id for identifier in identifiers
        ]
        self.assertTrue(
            False not in identifiers_identical,
            "Elements with non-matching identifiers have been returned: {}".
            format(", ".join(identifiers)))
Example #8
    def _parse_operations_metadata(self, upper_elem):
        """ Parses the <Operation> elements inside of <OperationsMetadata>

        Args:
            upper_elem (Element): The upper xml element
        Returns:

        """
        operations_objs = xml_helper.try_get_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation"),
            upper_elem
        )

        attribute_map = {
            OGCOperationEnum.GET_CAPABILITIES.value: 'get_capabilities_uri',
            OGCOperationEnum.DESCRIBE_RECORD.value: 'describe_record_uri',
            OGCOperationEnum.GET_RECORDS.value: 'get_records_uri',
            OGCOperationEnum.GET_RECORD_BY_ID.value: 'get_record_by_id_uri',
        }

        for operation in operations_objs:
            operation_name = xml_helper.try_get_attribute_from_xml_element(
                operation,
                "name",
            )
            get_uri = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"),
                operation
            )
            get_uri = xml_helper.get_href_attribute(get_uri) if get_uri is not None else None

            post_uri = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"),
                operation
            )
            post_uri = xml_helper.get_href_attribute(post_uri) if post_uri is not None else None

            if attribute_map.get(operation_name):
                setattr(self, attribute_map.get(operation_name)+'_GET', get_uri)
                setattr(self, attribute_map.get(operation_name)+'_POST', post_uri)
            else:
                # the given operation is not supported for now
                pass

            parameters = self._parse_parameter_metadata(operation)
            output_format = parameters.get("outputFormat", None)
            if output_format is not None:
                self.formats_list.append(
                    MimeType.objects.get_or_create(
                        operation=operation_name,
                        mime_type=output_format,
                    )[0]
                )
Example #9
def _transform_constraint_to_cql_recursive(upper_elem: Element):
    constraints = []

    connector_tags = ["and", "or", "not"]
    # Prevent <ogc:Filter> from being used as upper_tag joiner in the end
    upper_tag = QName(upper_elem).localname.lower()
    upper_tag = upper_tag if upper_tag in connector_tags else ""
    elements = upper_elem.getchildren()

    for child in elements:
        child_tag = QName(child).localname
        if child_tag.lower() in connector_tags:
            constraints.append(_transform_constraint_to_cql_recursive(child))
        else:
            property_name = xml_helper.try_get_text_from_xml_element(
                elem="./" + GENERIC_NAMESPACE_TEMPLATE.format("PropertyName"),
                xml_elem=child)
            literal = xml_helper.try_get_text_from_xml_element(
                elem="./" + GENERIC_NAMESPACE_TEMPLATE.format("Literal"),
                xml_elem=child)
            expr = ""
            if child_tag == "PropertyIsLike":
                expr = "like"
                wild_card = xml_helper.try_get_attribute_from_xml_element(
                    child, "wildCard")
                literal = literal.replace(wild_card, "%")
            elif child_tag == "PropertyIsEqualTo":
                expr = "="
            elif child_tag == "PropertyIsNotEqualTo":
                expr = "!="
            elif child_tag == "PropertyIsGreaterThanOrEqualTo":
                expr = ">="
            elif child_tag == "PropertyIsGreaterThan":
                expr = ">"
            elif child_tag == "PropertyIsLessThanOrEqualTo":
                expr = "<="
            elif child_tag == "PropertyIsLessThan":
                expr = "<"
            else:
                raise ValueError("Unsupported {} found!".format(child_tag),
                                 "Filter")
            constraints.append("{} {} {}".format(property_name, expr, literal))
    constraint = " {} ".format(upper_tag).join(constraints)
    return constraint
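
# Illustration (hedged): the CQL string the recursion above is expected to
# yield for a simple <ogc:Filter>, assuming xml_helper resolves the
# namespace-agnostic XPaths by local name as its usage suggests.
from lxml import etree

FILTER_SNIPPET = b"""
<ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
  <ogc:And>
    <ogc:PropertyIsLike wildCard="*" singleChar="?" escapeChar="!">
      <ogc:PropertyName>Title</ogc:PropertyName>
      <ogc:Literal>*water*</ogc:Literal>
    </ogc:PropertyIsLike>
    <ogc:PropertyIsEqualTo>
      <ogc:PropertyName>Type</ogc:PropertyName>
      <ogc:Literal>dataset</ogc:Literal>
    </ogc:PropertyIsEqualTo>
  </ogc:And>
</ogc:Filter>
"""

filter_root = etree.fromstring(FILTER_SNIPPET)
# _transform_constraint_to_cql_recursive(filter_root) should then return:
#   "Title like %water% and Type = dataset"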
Example #10
    def parse_xml(self):
        """ Reads the needed data from the xml and writes to an ISOMetadata instance (self)

        Returns:
             nothing
        """
        xml = self.raw_metadata
        xml_obj = xml_helper.parse_xml(xml)
        self.file_identifier = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:fileIdentifier/gco:CharacterString")
        self.character_set_code = xml_helper.try_get_attribute_from_xml_element(
            xml_elem=xml_obj,
            attribute="codeListValue",
            elem="//gmd:MD_Metadata/gmd:characterSet/gmd:MD_CharacterSetCode")
        if self.file_identifier is None:
            self.file_identifier = uuid.uuid4()
        self.date_stamp = xml_helper.try_get_text_from_xml_element(
            xml_obj, "//gmd:MD_Metadata/gmd:dateStamp/gco:Date")
        self.last_change_date = xml_helper.try_get_text_from_xml_element(
            xml_obj, "//gmd:MD_Metadata/gmd:dateStamp/gco:Date")

        self.md_standard_name = xml_helper.try_get_text_from_xml_element(
            xml_obj, "//gmd:metadataStandardName/gco:CharacterString")
        self.md_standard_version = xml_helper.try_get_text_from_xml_element(
            xml_obj, "//gmd:metadataStandardVersion/gco:CharacterString")

        self._parse_xml_legal_dates(xml_obj)
        self._parse_xml_legal_reports(xml_obj)

        # try to transform the last_change_date into a datetime object
        try:
            self.last_change_date = parse(self.last_change_date,
                                          tzinfo=timezone.utc)
        except (ValueError, OverflowError, TypeError):
            # if this is not possible due to wrong input, just use the current time...
            self.last_change_date = timezone.now()

        self.hierarchy_level = xml_helper.try_get_attribute_from_xml_element(
            xml_obj, "codeListValue",
            "//gmd:MD_Metadata/gmd:hierarchyLevel/gmd:MD_ScopeCode")
        if self.hierarchy_level == "service":
            xpath_type = "srv:SV_ServiceIdentification"
        else:
            xpath_type = "gmd:MD_DataIdentification"
        self.title = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString"
            .format(xpath_type))
        self._parse_xml_dataset_id(xml_obj, xpath_type)
        self.abstract = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:abstract/gco:CharacterString"
            .format(xpath_type))
        keywords = xml_helper.try_get_element_from_xml(
            xml_elem=xml_obj,
            elem=
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString"
            .format(xpath_type))
        for keyword in keywords:
            kw_text = xml_helper.try_get_text_from_xml_element(keyword)
            if kw_text is not None and kw_text not in self.keywords:
                self.keywords.append(kw_text)

        language = xml_helper.try_get_single_element_from_xml(
            xml_elem=xml_obj,
            elem=
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:language/gmd:LanguageCode"
            .format(xpath_type))
        if language is not None and language.text is not None:
            self.language = xml_helper.try_get_text_from_xml_element(language)

        iso_categories = xml_helper.try_get_element_from_xml(
            xml_elem=xml_obj,
            elem=
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:topicCategory/gmd:MD_TopicCategoryCode"
            .format(xpath_type))
        if iso_categories:
            for iso_category in iso_categories:
                self.iso_categories.append(
                    xml_helper.try_get_text_from_xml_element(iso_category))

        # Get all values from <gmd:distributionInfo> which declares the distributionFormat
        formats = xml_helper.try_get_element_from_xml(
            xml_elem=xml_obj,
            elem="//" +
            GENERIC_NAMESPACE_TEMPLATE.format("distributionFormat"))
        if formats:
            for format_elem in formats:
                # get the character value per format
                name_elem = xml_helper.try_get_single_element_from_xml(
                    xml_elem=format_elem,
                    elem=".//" + GENERIC_NAMESPACE_TEMPLATE.format("name"))
                if name_elem is None:
                    continue
                val = xml_helper.try_get_text_from_xml_element(
                    xml_elem=name_elem,
                    elem=".//" +
                    GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
                self.formats.append(val)

        self.download_link = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:onLine/gmd:CI_OnlineResource[gmd:function/gmd:CI_OnLineFunctionCode/@codeListValue="download"]/gmd:linkage/gmd:URL'
        )
        self.transfer_size = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:transferSize/gco:Real'
        )
        self.preview_image = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:graphicOverview/gmd:MD_BrowseGraphic/gmd:fileName/gco:CharacterString"
            .format(xpath_type))
        try:
            self.bounding_box["min_x"] = float(
                xml_helper.try_get_text_from_xml_element(
                    xml_obj,
                    "//gmd:westBoundLongitude/gco:Decimal".format(xpath_type)))
            self.bounding_box["min_y"] = float(
                xml_helper.try_get_text_from_xml_element(
                    xml_obj,
                    "//gmd:southBoundLatitude/gco:Decimal".format(xpath_type)))
            self.bounding_box["max_x"] = float(
                xml_helper.try_get_text_from_xml_element(
                    xml_obj,
                    "//gmd:eastBoundLongitude/gco:Decimal".format(xpath_type)))
            self.bounding_box["max_y"] = float(
                xml_helper.try_get_text_from_xml_element(
                    xml_obj,
                    "//gmd:northBoundLatitude/gco:Decimal".format(xpath_type)))
        except TypeError:
            self.bounding_box = None

        self._parse_xml_polygons(xml_obj, xpath_type)

        self.tmp_extent_begin = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:beginPosition"
            .format(xpath_type))
        if self.tmp_extent_begin is None:
            self.tmp_extent_begin = "1900-01-01"

        self.tmp_extent_end = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:endPosition"
            .format(xpath_type))
        if self.tmp_extent_end is None:
            self.tmp_extent_end = "1900-01-01"

        equivalent_scale = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:spatialResolution/gmd:MD_Resolution/gmd:equivalentScale/gmd:MD_RepresentativeFraction/gmd:denominator/gco:Integer"
            .format(xpath_type))
        ground_res = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:spatialResolution/gmd:MD_Resolution/gmd:distance/gco:Distance"
            .format(xpath_type))
        if equivalent_scale is not None and int(equivalent_scale) > 0:
            self.spatial_res_val = equivalent_scale
            self.spatial_res_type = "scaleDenominator"
        elif ground_res is not None and len(ground_res) > 0:
            self.spatial_res_val = ground_res
            self.spatial_res_type = "groundDistance"

        self.ref_system = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:code/gco:CharacterString"
        )
        self.ref_system_version = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:version/gco:CharacterString"
        )
        self.ref_system_authority = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:authority/gmd:CI_Citation/gmd:title/gco:CharacterString"
        )
        epsg_api = EpsgApi()
        if self.ref_system is not None:
            self.ref_system = "EPSG:{}".format(
                epsg_api.get_subelements(self.ref_system).get("code"))

        # gmd:CI_OnLineFunctionCode
        dist_func_elem = xml_helper.try_get_single_element_from_xml(
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("CI_OnLineFunctionCode"),
            xml_obj)
        self.distribution_function = xml_helper.try_get_attribute_from_xml_element(
            dist_func_elem,
            "codeListValue",
        )
        del dist_func_elem

        # gmd:MD_RepresentativeFraction
        fraction_elem = xml_helper.try_get_single_element_from_xml(
            "//" +
            GENERIC_NAMESPACE_TEMPLATE.format("MD_RepresentativeFraction"),
            xml_obj)
        self.fraction_denominator = xml_helper.try_get_text_from_xml_element(
            fraction_elem,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Integer"))
        del fraction_elem

        # gmd:useLimitation
        limit_elem = xml_helper.try_get_single_element_from_xml(
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("useLimitation"), xml_obj)
        self.use_limitation = xml_helper.try_get_text_from_xml_element(
            limit_elem,
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
        del limit_elem

        self.lineage = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            "//gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:statement/gco:CharacterString"
        )

        restriction_code_attr_val = xml_helper.try_get_element_from_xml(
            xml_elem=xml_obj,
            elem=
            '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints/gmd:useConstraints/gmd:MD_RestrictionCode/@codeListValue'
            .format(xpath_type))
        if len(restriction_code_attr_val) >= 2:
            legal_constraints = ""
            if (restriction_code_attr_val[0] == 'license'
                    and restriction_code_attr_val[1] == 'otherRestrictions'):
                other_constraints = xml_helper.try_get_element_from_xml(
                    xml_elem=xml_obj,
                    elem=
                    '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints[gmd:useConstraints/gmd:MD_RestrictionCode/@codeListValue="otherRestrictions"]/gmd:otherConstraints/gco:CharacterString'
                    .format(xpath_type))
                for constraint in other_constraints:
                    tmp_constraint = xml_helper.try_get_text_from_xml_element(
                        xml_elem=constraint)
                    try:
                        constraint_json = json.loads(tmp_constraint)
                        self.license_source_note = constraint_json.get(
                            "quelle", None)
                        self.license_json = constraint_json
                    except ValueError:
                        # not a JSON constraint - treat it as plain text
                        legal_constraints += tmp_constraint + ";"
            self.fees = legal_constraints

        self.access_constraints = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceConstraints/gmd:MD_LegalConstraints[gmd:accessConstraints/gmd:MD_RestrictionCode/@codeListValue="otherRestrictions"]/gmd:otherConstraints/gco:CharacterString'
            .format(xpath_type))
        self.responsible_party = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:organisationName/gco:CharacterString'
            .format(xpath_type))
        self.contact_person = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:individualName/gco:CharacterString'
            .format(xpath_type))
        self.contact_phone = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:phone/gmd:CI_Telephone/gmd:voice/gco:CharacterString'
            .format(xpath_type))
        self.contact_email = xml_helper.try_get_text_from_xml_element(
            xml_obj,
            '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:pointOfContact/gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString'
            .format(xpath_type))
        update_frequency = xml_helper.try_get_attribute_from_xml_element(
            xml_elem=xml_obj,
            attribute="codeListValue",
            elem=
            '//gmd:MD_Metadata/gmd:identificationInfo/{}/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation/gmd:maintenanceAndUpdateFrequency/gmd:MD_MaintenanceFrequencyCode'
            .format(xpath_type))
        if update_frequency in self.valid_update_frequencies:
            self.update_frequency = update_frequency

        # INSPIRE regulations
        regulations = {"inspire_rules": []}
        with open(INSPIRE_LEGISLATION_FILE, "r", encoding="utf-8") as _file:
            regulations = json.load(_file)
        for regulation in regulations["inspire_rules"]:
            reg = {
                "name": regulation.get("name", None),
                "date": regulation.get("date", "1900-01-01"),
                "pass": None,
            }
            statement = xml_helper.try_get_text_from_xml_element(
                xml_obj,
                '//gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:report/gmd:DQ_DomainConsistency/gmd:result/gmd:DQ_ConformanceResult[gmd:specification/gmd:CI_Citation/gmd:title/gco:CharacterString="{}" and gmd:specification/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:Date="{}"]/gmd:pass/gco:Boolean'
                .format(reg["name"], reg["date"]))
            statement_val = utils.resolve_boolean_attribute_val(statement)
            if statement_val is None:
                reg["pass"] = "******"
                self.inspire_interoperability = False
            else:
                reg["pass"] = statement_val
                # if even one regulation is not fulfilled, we do not have interoperability
                if not statement_val:
                    self.inspire_interoperability = False
            self.interoperability_list.append(reg)
Example #11
    def _parse_operations_metadata(self, upper_elem):
        """ Parses the <Operation> elements inside of <OperationsMetadata>

        Args:
            upper_elem (Element): The upper xml element
        Returns:

        """
        operations_objs = xml_helper.try_get_element_from_xml(
            ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation"), upper_elem)

        attribute_map = {
            OGCOperationEnum.GET_CAPABILITIES.value: 'get_capabilities_uri',
            OGCOperationEnum.DESCRIBE_RECORD.value: 'describe_record_uri',
            OGCOperationEnum.GET_RECORDS.value: 'get_records_uri',
            OGCOperationEnum.GET_RECORD_BY_ID.value: 'get_record_by_id_uri',
        }

        for operation in operations_objs:
            operation_name = xml_helper.try_get_attribute_from_xml_element(
                operation,
                "name",
            )
            get_uri = xml_helper.try_get_single_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation)
            csw_logger.error("Type of returned object of get_uri: {}".format(
                type(get_uri)))

            get_uri = xml_helper.get_href_attribute(
                get_uri) if get_uri is not None else None
            post_uris = xml_helper.try_get_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation)
            number_of_post_endpoints = len(post_uris)
            if number_of_post_endpoints > 1:
                post_uri = xml_helper.try_get_single_element_from_xml(
                    ".//*[local-name()='Post'][.//ows:Constraint/ows:Value='XML']",
                    operation)
            else:
                post_uri = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"),
                    operation)
            csw_logger.error(
                "Number of Entries of Post endpoints: {} for operation {}".
                format(number_of_post_endpoints, operation_name))
            csw_logger.error("Type of returned object of post_uri: {}".format(
                type(post_uri)))
            post_uri = xml_helper.get_href_attribute(
                post_uri) if post_uri is not None else None

            if attribute_map.get(operation_name):
                setattr(self,
                        attribute_map.get(operation_name) + '_GET', get_uri)
                setattr(self,
                        attribute_map.get(operation_name) + '_POST', post_uri)
            else:
                # the given operation is not supported for now
                pass

            parameters = self._parse_parameter_metadata(operation)
            output_format = parameters.get("outputFormat", None)
            if output_format is not None:
                self.formats_list.append(
                    MimeType.objects.get_or_create(
                        operation=operation_name,
                        mime_type=output_format,
                    )[0])
Example #12
    def test_proxy_setting(self):
        """ Tests whether the proxy can be set properly.

        Returns:
        """
        # This test is currently disabled
        return
        metadata = self.service_wms.metadata

        # To avoid running celery in a separate test instance, we do not call the route. Instead we call the logic
        # that processes the access settings directly.
        async_process_securing_access(
            metadata.id,
            use_proxy=True,
            log_proxy=True,
            restrict_access=False,
        )

        self.cap_doc_wms.refresh_from_db()
        doc_unsecured = self.cap_doc_wms.content
        doc_secured = Document.objects.get(
            metadata=metadata,
            document_type=DocumentEnum.CAPABILITY.value,
            is_original=False,
        ).content

        # Check for all operations whether the uris have been changed!
        # Do not check GetCapabilities, since we always change this uri during registration!
        # Make sure all versions can be matched by the code - the xml structure differs a lot from version to version
        service_version = metadata.get_service_version()

        if metadata.is_service_type(OGCServiceEnum.WMS):
            operations = [
                OGCOperationEnum.GET_MAP.value,
                OGCOperationEnum.GET_FEATURE_INFO.value,
                OGCOperationEnum.DESCRIBE_LAYER.value,
                OGCOperationEnum.GET_LEGEND_GRAPHIC.value,
                OGCOperationEnum.GET_STYLES.value,
                OGCOperationEnum.PUT_STYLES.value,
            ]
        elif metadata.is_service_type(OGCServiceEnum.WFS):
            operations = [
                OGCOperationEnum.GET_FEATURE.value,
                OGCOperationEnum.TRANSACTION.value,
                OGCOperationEnum.LOCK_FEATURE.value,
                OGCOperationEnum.DESCRIBE_FEATURE_TYPE.value,
            ]
        else:
            operations = []

        # create xml documents from string documents and fetch only the relevant <Request> element for each
        xml_unsecured = xml_helper.parse_xml(doc_unsecured)
        request_unsecured = xml_helper.try_get_single_element_from_xml(elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("Request"), xml_elem=xml_unsecured)
        xml_secured = xml_helper.parse_xml(doc_secured)
        request_secured = xml_helper.try_get_single_element_from_xml(elem="//" + GENERIC_NAMESPACE_TEMPLATE.format("Request"), xml_elem=xml_secured)

        for operation in operations:
            # Get <OPERATION> element
            operation_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_unsecured)
            operation_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_secured)

            if service_version == OGCServiceVersionEnum.V_1_0_0:
                if metadata.is_service_type(OGCServiceEnum.WMS):
                    # The WMS 1.0.0 specification uses <OPERATION> instead of <GetOPERATION> for any operation element.
                    operation = operation.replace("Get", "")

                    # Get <OPERATION> element again
                    operation_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_unsecured)
                    operation_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format(operation), request_secured)

                # Version 1.0.0 holds the uris in the "onlineResource" attribute of <Get> and <Post>
                get_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_unsecured)
                get_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_secured)
                post_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_unsecured)
                post_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_secured)

                online_res = "onlineResource"
                get_unsecured = xml_helper.try_get_attribute_from_xml_element(get_unsecured, online_res)
                get_secured = xml_helper.try_get_attribute_from_xml_element(get_secured, online_res)
                post_unsecured = xml_helper.try_get_attribute_from_xml_element(post_unsecured, online_res)
                post_secured = xml_helper.try_get_attribute_from_xml_element(post_secured, online_res)

                # Assert that all get/post elements are not None
                self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))
                self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))

                # Assert that the secured version is different from the unsecured one
                self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))
                self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))

                # Assert that the HOST_NAME constant appears in the secured uri
                self.assertIn(HOST_NAME, get_secured)
                self.assertIn(HOST_NAME, post_secured)

            elif service_version == OGCServiceVersionEnum.V_1_1_0 \
                    or service_version == OGCServiceVersionEnum.V_2_0_0 \
                    or service_version == OGCServiceVersionEnum.V_2_0_2:
                # Only WFS
                # Get <OPERATION> element again, since the operation is now identified using an attribute, not an element tag
                operation_unsecured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation") + "[@name='" + operation + "']",
                    request_unsecured
                )
                operation_secured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Operation") + "[@name='" + operation + "']",
                    request_secured
                )

                # Version 1.1.0 holds the uris in the href attribute of <Get> and <Post>
                get_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_unsecured)
                get_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get"), operation_secured)
                post_unsecured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_unsecured)
                post_secured = xml_helper.try_get_single_element_from_xml(".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post"), operation_secured)

                get_unsecured = xml_helper.get_href_attribute(get_unsecured)
                get_secured = xml_helper.get_href_attribute(get_secured)
                post_unsecured = xml_helper.get_href_attribute(post_unsecured)
                post_secured = xml_helper.get_href_attribute(post_secured)

                # Assert that all get/post elements are not None
                self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))
                self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))

                # Assert that the secured version is different from the unsecured one
                self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))
                self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))

                # Assert that the HOST_NAME constant appears in the secured uri
                self.assertIn(HOST_NAME, get_secured)
                self.assertIn(HOST_NAME, post_secured)

            elif service_version == OGCServiceVersionEnum.V_1_1_1 or service_version == OGCServiceVersionEnum.V_1_3_0:
                # Version 1.1.1 holds the uris in the <OnlineResource> element inside <Get> and <Post>
                get_unsecured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get")
                    + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                    operation_unsecured
                )
                get_secured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Get")
                    + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                    operation_secured
                )
                post_unsecured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post")
                    + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                    operation_unsecured
                )
                post_secured = xml_helper.try_get_single_element_from_xml(
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("Post")
                    + "/" + GENERIC_NAMESPACE_TEMPLATE.format("OnlineResource"),
                    operation_secured
                )

                get_unsecured = xml_helper.get_href_attribute(get_unsecured)
                get_secured = xml_helper.get_href_attribute(get_secured)
                post_unsecured = xml_helper.get_href_attribute(post_unsecured)
                post_secured = xml_helper.get_href_attribute(post_secured)

                # Either both (secured/unsecured) uris exist or neither does;
                # operations that are not supported by the service may have no uri at all
                if get_secured is not None and get_unsecured is not None:
                    self.assertIsNotNone(get_secured, msg="The secured uri of '{}' is None!".format(operation))

                    # Assert that the secured version is different from the unsecured one
                    self.assertNotEqual(get_unsecured, get_secured, msg="The uri of '{}' has not been secured!".format(operation))

                    # Assert that the HOST_NAME constant appears in the secured uri
                    self.assertTrue(HOST_NAME in get_secured)

                if post_secured is not None and post_unsecured is not None:
                    self.assertIsNotNone(post_secured, msg="The secured uri of '{}' is None!".format(operation))
                    self.assertNotEqual(post_unsecured, post_secured, msg="The uri of '{}' has not been secured!".format(operation))
                    self.assertTrue(HOST_NAME in post_secured)
            else:
                pass
Example #13
    def harvest(self, task_id: str = None):
        """ Starts harvesting procedure

        Returns:

        """
        # Create a pending task record for the database first!
        task_exists = PendingTask.objects.filter(
            description__icontains=self.metadata.title).exists()
        if task_exists:
            raise ProcessLookupError(_("Harvesting is currently performed"))
        else:
            async_task_id = task_id or self.metadata.id
            self.pending_task = PendingTask.objects.create(
                task_id=async_task_id,
                description=json.dumps({
                    "service": self.metadata.title,
                    "phase": "Connecting...",
                }),
                progress=0,
                remaining_time=None,
                created_by=self.harvesting_group)

        # Fill deleted_metadata with all persisted metadata identifiers, then eliminate each entry that is still
        # provided by the catalogue. What remains in the end are the metadata IDs that can no longer be found in the catalogue.

        all_persisted_metadata_identifiers = self.metadata.get_related_metadatas(
            filters={
                'to_metadatas__relation_type':
                MetadataRelationEnum.HARVESTED_THROUGH.value
            }).values_list("identifier", flat=True)
        # Use a set instead of a list to speed up lookups afterwards
        self.deleted_metadata.update(all_persisted_metadata_identifiers)

        # Perform the initial "hits" request to get an overview of how many records will be fetched
        hits_response, status_code = self._get_harvest_response(
            result_type="hits")
        descr = json.loads(self.pending_task.description)
        if status_code != 200:
            descr["phase"] = "Harvest failed: HTTP Code {}"
            self.pending_task.description = json.dumps(descr)
            self.pending_task.save()
            raise ConnectionError(
                _("Harvest failed: Code {}\n{}").format(
                    status_code, hits_response))
        xml_response = xml_helper.parse_xml(hits_response)
        if xml_response is None:
            descr["phase"] = "Response is not a valid xml"
            self.pending_task.description = json.dumps(descr)
            self.pending_task.save()
            raise ConnectionError(
                _("Response is not a valid xml: \n{}".format(hits_response)))

        try:
            total_number_to_harvest = int(
                xml_helper.try_get_attribute_from_xml_element(
                    xml_response,
                    "numberOfRecordsMatched",
                    "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
                ))
        except TypeError:
            csw_logger.error(
                "Malformed harvest response: {}".format(hits_response))
            descr["phase"] = "Harvest response incorrect. Inform an administrator!"
            self.pending_task.description = json.dumps(descr)
            self.pending_task.save()
            raise AttributeError(
                _("Harvest response is missing important data!"))

        descr["phase"] = "Start harvesting..."
        self.pending_task.description = json.dumps(descr)
        self.pending_task.save()
        progress_step_per_request = float(
            self.max_records_per_request / total_number_to_harvest) * 100

        # There are wrongly configured CSW instances which do not return nextRecord=0 on the last page but instead
        # continue with nextRecord=1. We need to prevent endless loops by checking whether we already worked on a
        # start position and simply end there!
        processed_start_positions = set()

        t_start = time()
        number_rest_to_harvest = total_number_to_harvest
        number_of_harvested = 0
        self.harvest_result.timestamp_start = timezone.now()
        self.harvest_result.save()

        page_cacher = PageCacher()

        # Run as long as we can fetch data and as long as the user does not abort the pending task!
        while self.pending_task is not None:
            processed_start_positions.add(self.start_position)
            # Get response
            next_response, status_code = self._get_harvest_response(
                result_type="results")

            found_entries = self._process_harvest_response(next_response)

            # Calculate time since loop started
            duration = time() - t_start
            number_rest_to_harvest -= self.max_records_per_request
            number_of_harvested += found_entries
            self.harvest_result.number_results = number_of_harvested
            self.harvest_result.save()

            # Remove cached pages of API and CSW
            page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
            page_cacher.remove_pages(CSW_CACHE_PREFIX)
            if self.start_position == 0 or self.start_position in processed_start_positions:
                # We are done!
                estimated_time_for_all = timezone.timedelta(seconds=0)
                break
            else:
                seconds_for_rest = (number_rest_to_harvest *
                                    (duration / number_of_harvested))
                estimated_time_for_all = timezone.timedelta(
                    seconds=seconds_for_rest)

            self._update_pending_task(self.start_position,
                                      total_number_to_harvest,
                                      progress_step_per_request,
                                      estimated_time_for_all)

        # Add HarvestResult infos
        self.harvest_result.timestamp_end = timezone.now()
        self.harvest_result.number_results = number_of_harvested
        self.harvest_result.save()

        # Delete metadata records which could not be found in the catalogue anymore
        # This is only done if the harvesting ran to completion. Skip this part if the user aborted the harvest!
        if self.pending_task is not None:
            deleted_metadatas = Metadata.objects.filter(
                identifier__in=self.deleted_metadata)
            deleted_metadatas.delete()
            self.pending_task.delete()

        # Remove cached pages of API and CSW
        page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
        page_cacher.remove_pages(CSW_CACHE_PREFIX)
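
# Toy illustration of the endless-loop guard used in harvest() above: stop as
# soon as a start position repeats (or the server signals nextRecord=0). The
# generator below is a standalone sketch, not part of the harvester class.
def _next_start_positions(server_next_records):
    processed = set()
    start = 1
    for next_record in server_next_records:
        processed.add(start)
        yield start
        if next_record == 0 or next_record in processed:
            break
        start = next_record

# A misbehaving CSW that loops back to nextRecord=1 instead of ending with 0:
print(list(_next_start_positions([11, 21, 1])))  # [1, 11, 21]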
Example #14
    def _md_metadata_parse_to_dict(self, md_metadata_entries: list) -> list:
        """ Read most important data from MD_Metadata xml element

        Args:
            md_metadata_entries (list): The xml MD_Metadata elements
        Returns:
             ret_list (list): The list containing dicts
        """
        ret_list = []
        for md_metadata in md_metadata_entries:
            md_data_entry = {}

            # Check before anything else, whether this metadata type can be skipped!
            hierarchy_level = xml_helper.try_get_attribute_from_xml_element(
                md_metadata, "codeListValue",
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("hierarchyLevel") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("MD_ScopeCode"))
            metadata_type = hierarchy_level
            md_data_entry["metadata_type"] = metadata_type
            if not HARVEST_METADATA_TYPES.get(metadata_type, False):
                continue

            _id = xml_helper.try_get_text_from_xml_element(
                md_metadata,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("fileIdentifier") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            md_data_entry["id"] = _id

            parent_id = xml_helper.try_get_text_from_xml_element(
                md_metadata,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("parentIdentifier") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            md_data_entry["parent_id"] = parent_id

            # A workaround, so we do not need to check whether SV_ServiceIdentification or MD_DataIdentification is present
            # in this metadata: Simply take the direct parent and perform a deeper nested search on the inside of this element.
            # Yes, we could simply decide based on the hierarchyLevel attribute whether to search for SV_xxx or MD_yyy.
            # No, there are metadata entries which do not follow these guidelines and have "service" with MD_yyy
            # Yes, they are important since they can be found in the INSPIRE catalogue (07/2020)
            identification_elem = xml_helper.try_get_single_element_from_xml(
                xml_elem=md_metadata,
                elem=".//" +
                GENERIC_NAMESPACE_TEMPLATE.format("identificationInfo"))
            title = xml_helper.try_get_text_from_xml_element(
                identification_elem,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("citation") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("CI_Citation") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("title") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            md_data_entry["title"] = title

            language_code = xml_helper.try_get_attribute_from_xml_element(
                md_metadata, "codeListValue",
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("language") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("LanguageCode"))
            md_data_entry["language_code"] = language_code

            date_stamp = xml_helper.try_get_text_from_xml_element(
                md_metadata,
                "./" + GENERIC_NAMESPACE_TEMPLATE.format("dateStamp") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("Date")
            ) or xml_helper.try_get_text_from_xml_element(
                md_metadata,
                "./" + GENERIC_NAMESPACE_TEMPLATE.format("dateStamp") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("DateTime"))
            try:
                md_data_entry["date_stamp"] = parse(date_stamp).replace(
                    tzinfo=utc)
            except TypeError:
                md_data_entry["date_stamp"] = None

            abstract = xml_helper.try_get_text_from_xml_element(
                md_metadata,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("abstract") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            md_data_entry["abstract"] = abstract

            digital_transfer_elements = xml_helper.try_get_element_from_xml(
                xml_elem=md_metadata,
                elem=".//" +
                GENERIC_NAMESPACE_TEMPLATE.format("MD_DigitalTransferOptions"))
            links = []
            for elem in digital_transfer_elements:
                links_entry = {}
                resource_link = xml_helper.try_get_text_from_xml_element(
                    elem,
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("onLine") + "/" +
                    GENERIC_NAMESPACE_TEMPLATE.format("CI_OnlineResource") +
                    "/" + GENERIC_NAMESPACE_TEMPLATE.format("linkage") + "/" +
                    GENERIC_NAMESPACE_TEMPLATE.format("URL"),
                )
                descr = xml_helper.try_get_text_from_xml_element(
                    elem,
                    ".//" + GENERIC_NAMESPACE_TEMPLATE.format("onLine") + "/" +
                    GENERIC_NAMESPACE_TEMPLATE.format("CI_OnlineResource") +
                    "/" + GENERIC_NAMESPACE_TEMPLATE.format("description") +
                    "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
                links_entry["link"] = resource_link
                links_entry["description"] = descr

                if resource_link is not None:
                    # Check the type of online resource we found -> it could be a GetCapabilities URL
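                    # Illustrative example: for a link such as
                    #   https://example.org/ows?SERVICE=WMS&REQUEST=GetCapabilities&VERSION=1.3.0
                    # parse_qs on the lower-cased URL yields
                    #   {"service": ["wms"], "request": ["getcapabilities"], "version": ["1.3.0"]},
                    # which is exactly what the checks below rely on.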
                    query_params = parse_qs(
                        urlparse(resource_link.lower()).query)
                    if OGCOperationEnum.GET_CAPABILITIES.value.lower(
                    ) in query_params.get("request", []):
                        # Parse all possibly relevant data from the dict
                        version = query_params.get("version", [None])
                        service_type = query_params.get("service", [None])
                        md_data_entry[
                            "capabilities_original_url"] = resource_link
                        md_data_entry["service_type"] = service_type[0]
                        md_data_entry["version"] = version[0]
                links.append(links_entry)

            md_data_entry["links"] = links

            keywords = xml_helper.try_get_element_from_xml(
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("keyword") + "/" +
                GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"),
                md_metadata,
            ) or []
            keywords = [
                xml_helper.try_get_text_from_xml_element(kw) for kw in keywords
            ]
            md_data_entry["keywords"] = keywords

            access_constraints = xml_helper.try_get_text_from_xml_element(
                md_metadata,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("otherConstraints") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            md_data_entry["access_constraints"] = access_constraints

            categories = xml_helper.try_get_element_from_xml(
                ".//" +
                GENERIC_NAMESPACE_TEMPLATE.format("MD_TopicCategoryCode"),
                md_metadata,
            ) or []
            categories = [
                xml_helper.try_get_text_from_xml_element(cat)
                for cat in categories
            ]
            md_data_entry["categories"] = categories

            bbox_elem = xml_helper.try_get_single_element_from_xml(
                ".//" +
                GENERIC_NAMESPACE_TEMPLATE.format("EX_GeographicBoundingBox"),
                md_metadata)
            if bbox_elem is not None:
                extent = [
                    xml_helper.try_get_text_from_xml_element(
                        bbox_elem, ".//" +
                        GENERIC_NAMESPACE_TEMPLATE.format("westBoundLongitude")
                        + "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal"))
                    or "0.0",
                    xml_helper.try_get_text_from_xml_element(
                        bbox_elem, ".//" +
                        GENERIC_NAMESPACE_TEMPLATE.format("southBoundLatitude")
                        + "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal"))
                    or "0.0",
                    xml_helper.try_get_text_from_xml_element(
                        bbox_elem, ".//" +
                        GENERIC_NAMESPACE_TEMPLATE.format("eastBoundLongitude")
                        + "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal"))
                    or "0.0",
                    xml_helper.try_get_text_from_xml_element(
                        bbox_elem, ".//" +
                        GENERIC_NAMESPACE_TEMPLATE.format("northBoundLatitude")
                        + "/" + GENERIC_NAMESPACE_TEMPLATE.format("Decimal"))
                    or "0.0",
                ]
                # Some metadata contain decimal-comma vertex notation like 50,3 instead of 50.3.
                # Strictly speaking such records violate the specification and should be dropped, but the
                # fix is trivial, so we make an exception here and replace the comma with a dot.
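                # e.g. ["6,1", "49,1", "7,5", "50,9"] becomes ["6.1", "49.1", "7.5", "50.9"]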
                extent = [vertex.replace(",", ".") for vertex in extent]
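                # The extent order matches Polygon.from_bbox: (min x/lon, min y/lat, max x/lon, max y/lat)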
                try:
                    bounding_geometry = GEOSGeometry(
                        Polygon.from_bbox(bbox=extent), srid=DEFAULT_SRS)
                except Exception:
                    # Log the malformed extent!
                    csw_logger.warning(
                        CSW_EXTENT_WARNING_LOG_TEMPLATE.format(
                            _id, self.metadata.title, extent))
                    bounding_geometry = DEFAULT_SERVICE_BOUNDING_BOX_EMPTY
            else:
                bounding_geometry = DEFAULT_SERVICE_BOUNDING_BOX_EMPTY

            md_data_entry["bounding_geometry"] = bounding_geometry
            md_data_entry["contact"] = self._create_contact_from_md_metadata(
                md_metadata)
            md_data_entry["formats"] = self._create_formats_from_md_metadata(
                md_metadata)

            # Load non-metadata data
            # ToDo: Should harvesting persist non-metadata data?!
            #described_resource = None
            #metadata = None
            #if hierarchy_level == MetadataEnum.DATASET.value:
            #    described_resource = self._create_dataset_from_md_metadata(md_metadata, metadata)
            #    described_resource.metadata = metadata
            #    described_resource.is_active = True
            #    described_resource.save()

            ret_list.append(md_data_entry)
        return ret_list
    def _process_harvest_response(self, next_response: bytes) -> int:
        """ Processes the harvest response content

        While the last response is being processed, the next one is already loaded to decrease run time

        Args:
            next_response (bytes): The response as bytes
        Returns:
             number_found_entries (int): The number of metadata records found in this response
        """
        xml_response = xml_helper.parse_xml(next_response)
        if xml_response is None:
            csw_logger.error(
                "Response is no valid xml. catalogue: {}, startPosition: {}, maxRecords: {}"
                .format(self.metadata.title, self.start_position,
                        self.max_records_per_request))
            # Abort and report zero processed records, so the caller can keep counting
            self.start_position = 0
            return 0

        md_metadata_entries = xml_helper.try_get_element_from_xml(
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_Metadata"),
            xml_response) or []
        next_record_position = int(
            xml_helper.try_get_attribute_from_xml_element(
                xml_response,
                "nextRecord",
                "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
            ) or 0)
        self.start_position = next_record_position
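        # A nextRecord value of 0 conventionally marks the last page of a GetRecords response; harvest()
        # uses this (together with the set of already processed start positions) as its stop condition.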

        # Fetch found identifiers in parent process, so self.deleted_metadata can be edited easily
        for md_metadata in md_metadata_entries:
            _id = xml_helper.try_get_text_from_xml_element(
                md_metadata,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("fileIdentifier") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            try:
                self.deleted_metadata.remove(_id)
            except KeyError:
                pass

        # Delete response to free memory
        del xml_response

        # Process response via multiple processes
        t_start = time()
        num_processes = int(cpu_count() / 2)
        num_processes = num_processes if num_processes >= 1 else 1
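        # Use half of the available cores (but at least one), presumably so harvesting does not
        # saturate the host; each worker process gets a contiguous slice of the parsed entries below.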
        index_step = int(len(md_metadata_entries) / num_processes)
        start_index = 0
        end_index = 0
        self.resource_list = md_metadata_entries
        process_list = []
        for i in range(0, num_processes):
            if index_step < 1:
                end_index = -1
            elif i == num_processes - 1:
                # The last slice takes the remainder, so no entries are dropped when the
                # number of entries is not evenly divisible by num_processes
                end_index = len(md_metadata_entries)
            else:
                end_index += index_step
            p = Process(target=self._create_metadata_from_md_metadata,
                        args=(start_index, end_index))
            start_index += index_step
            process_list.append(p)
        # Close all connections to force each process to create a new one for itself
        connections.close_all()
        execute_threads(process_list)

        csw_logger.debug(
            "Harvesting '{}': runtime for {} metadata parsing: {}s ####".
            format(self.metadata.title, self.max_records_per_request,
                   time() - t_start))
        return len(md_metadata_entries)
    def harvest(self):
        """ Starts harvesting procedure

        Returns:

        """
        absolute_url = f'<a href="{self.metadata.get_absolute_url()}">{self.metadata.title}</a>'
        # Note: no trailing comma here - it would turn service_json into a one-element tuple
        service_json = {'id': self.metadata.pk, 'absolute_url': absolute_url}
        if current_task:
            current_task.update_state(state=states.STARTED,
                                      meta={
                                          'service': service_json,
                                          'phase':
                                          f"Connecting to {absolute_url}",
                                      })

        # Fill deleted_metadata with the identifiers of all persisted metadata, then remove every entry that is still
        # provided by the catalogue. What remains in the end is the set of identifiers that can no longer be found in the catalogue.

        all_persisted_metadata_identifiers = self.metadata.get_related_metadatas(
            filters={
                'to_metadatas__relation_type':
                MetadataRelationEnum.HARVESTED_THROUGH.value
            }).values_list("identifier", flat=True)
        # Use a set instead of a list to speed up the membership lookups afterwards
        self.deleted_metadata.update(all_persisted_metadata_identifiers)

        # Perform the initial "hits" request to get an overview of how many records will be fetched
        hits_response, status_code = self._get_harvest_response(
            result_type="hits")

        if status_code != 200:
            raise ConnectionError(
                _("Harvest failed: Code {}\n{}").format(
                    status_code, hits_response))
        xml_response = xml_helper.parse_xml(hits_response)
        if xml_response is None:
            raise ConnectionError(
                _("Response is not a valid xml: \n{}").format(hits_response))

        try:
            if current_task:
                current_task.update_state(state=states.STARTED,
                                          meta={
                                              'phase':
                                              f"calculating harvesting time",
                                          })
            total_number_to_harvest = int(
                xml_helper.try_get_attribute_from_xml_element(
                    xml_response,
                    "numberOfRecordsMatched",
                    "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
                ))
        except TypeError:
            csw_logger.error(
                "Malformed harvest response: {}".format(hits_response))
            raise AttributeError(
                _("Harvest response is missing important data!"))
        if current_task:
            current_task.update_state(state=states.STARTED,
                                      meta={
                                          'service': service_json,
                                          'phase': "Start harvesting..."
                                      })

        # Guard against a catalogue that reports zero matching records
        self.progress_step_per_result = float(
            1 / total_number_to_harvest) * 100 if total_number_to_harvest else 0.0
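        # Example: with 2000 matched records, each harvested record advances progress by 0.05 %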

        # There are wrongly configured CSWs which do not return nextRecord=0 on the last page but instead continue with
        # nextRecord=1. To prevent endless loops we remember which start positions have already been processed and
        # simply stop as soon as one repeats!
        processed_start_positions = set()

        t_start = time()
        number_rest_to_harvest = total_number_to_harvest
        number_of_harvested = 0
        self.harvest_result.timestamp_start = timezone.now()
        self.harvest_result.save()

        page_cacher = PageCacher()
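        # PageCacher is assumed to be a small helper around the page cache; it is used below to invalidate
        # cached API and CSW pages after every processed batch, so clients are served fresh records.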

        # Initialize the ETA outside the loop, otherwise the value computed at the end of an
        # iteration would be reset to 'unknown' at the start of the next one
        estimated_time_for_all = 'unknown'

        # Run as long as we can fetch data and as long as the user does not abort the pending task!
        while True:
            if current_task:
                current_task.update_state(
                    state=states.STARTED,
                    meta={
                        'phase':
                        _("Harvesting the next {} of {} records. Time remaining: {}").
                        format(self.max_records_per_request,
                               total_number_to_harvest,
                               estimated_time_for_all),
                    })
            processed_start_positions.add(self.start_position)
            # Get response
            next_response, status_code = self._get_harvest_response(
                result_type="results")

            if current_task:
                current_task.update_state(
                    state=states.STARTED,
                    meta={
                        'phase':
                        _("Processing the harvested results ({} of {} records). Time remaining: {}"
                          ).format(self.max_records_per_request,
                                   total_number_to_harvest,
                                   estimated_time_for_all),
                    })
            found_entries = self._process_harvest_response(next_response)

            # Calculate time since loop started
            duration = time() - t_start
            number_rest_to_harvest -= self.max_records_per_request
            number_of_harvested += found_entries
            self.harvest_result.number_results = number_of_harvested
            self.harvest_result.save()

            # Remove cached pages of API and CSW
            page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
            page_cacher.remove_pages(CSW_CACHE_PREFIX)
            if self.start_position == 0 or self.start_position in processed_start_positions:
                # We are done: either the catalogue signalled the last page (nextRecord == 0) or a start position repeated
                break
            else:
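                # Linear estimate: average time per record harvested so far multiplied by the number of records still to fetch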
                seconds_for_rest = (number_rest_to_harvest *
                                    (duration / number_of_harvested))
                estimated_time_for_all = timezone.timedelta(
                    seconds=seconds_for_rest)

        # Add HarvestResult infos
        self.harvest_result.timestamp_end = timezone.now()
        self.harvest_result.number_results = number_of_harvested
        self.harvest_result.save()

        # Delete Metadata records which can no longer be found in the catalogue.
        # This may only be done after a complete harvesting run. Skip this part if the user aborted the harvest!
        deleted_metadatas = Metadata.objects.filter(
            identifier__in=self.deleted_metadata)
        deleted_metadatas.delete()

        # Remove cached pages of API and CSW
        page_cacher.remove_pages(API_CACHE_KEY_PREFIX)
        page_cacher.remove_pages(CSW_CACHE_PREFIX)