def lookup_re3data(self):
     """Look up the re3data record of the repository behind the DataCite client.

     The client id is mapped to a re3data DOI through
     ``RepositoryHelper.DATACITE_REPOSITORIES``. The re3data API is then
     queried with the normalised DOI, the ``href`` of the first ``<link>``
     element of the answer is requested, and its content is stored in
     ``self.re3metadata_raw`` before ``self.parseRepositoryMetadata()`` runs.

     Nothing is done unless both ``self.client_id`` and ``self.pid_scheme``
     are set. An unusable API answer (no response, invalid XML, no link) is
     logged as a warning instead of being raised, matching the handling of
     the other ``lookup_re3data`` variant in this file.
     """
     if not (self.client_id and self.pid_scheme):
         return
     re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
         self.client_id)  # {client_id,re3doi}
     if not re3doi:
         self.logger.warning(
             'No DOI of client id is available from datacite api')
         return
     # pid -> clientId -> repo doi-> re3id, and query repository metadata from re3api
     # Normalise only once a DOI is known to exist.
     short_re3doi = idutils.normalize_pid(
         re3doi, scheme='doi')  #https://doi.org/10.17616/R3XS37
     self.logger.info('Found match re3data (DOI-based) record')
     query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi  # https://re3data.org/api/beta/repositories?query=
     q = RequestHelper(url=query_url)
     q.setAcceptType(AcceptTypes.xml)
     re_source, xml = q.content_negotiate(metric_id='RE3DATA')
     try:
         root = etree.fromstring(xml.content)
         #<link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
         links = root.xpath('//link')
         # An answer without any <link> means no repository matched the DOI.
         re3link = links[0].attrib.get('href') if links else None
         if re3link is None:
             self.logger.warning(
                 'No re3data metadata record link found for ' +
                 str(short_re3doi))
             return
         self.logger.info('Found match re3data metadata record')
         # query repository metadata
         q2 = RequestHelper(url=re3link)
         q2.setAcceptType(AcceptTypes.xml)
         re3_source, re3_response = q2.content_negotiate(
             metric_id='RE3DATA')
         self.re3metadata_raw = re3_response.content
         self.parseRepositoryMetadata()
     except Exception as e:
         # Best effort: a broken re3data answer must not abort the assessment.
         self.logger.warning(
             'Malformed re3data (DOI-based) record received: ' + str(e))
 def getMetadataStandards(self):
     """List the metadata formats offered by the OAI-PMH endpoint.

     Sends a ``ListMetadataFormats`` request to ``self.endpoint`` (any query
     string is dropped first) and reads every ``oai:metadataFormat`` node.
     Each schema URI is appended to ``self.namespaces``. Formats whose
     schema URI contains one of the domain-agnostic markers are logged and
     left out of the result.

     Returns:
         dict mapping a metadata prefix to a one-element list holding its
         schema URI. Empty when the endpoint gives no parseable answer.
     """
     # Renamed from `filter` so the builtin is not shadowed.
     agnostic_markers = ('datacite.org', 'openarchives.org',
                         'purl.org/dc/')  # TODO expand filters
     schemas = {}
     #http://ws.pangaea.de/oai/provider?verb=ListMetadataFormats
     oai_endpoint = self.endpoint.split('?')[0]
     oai_listmetadata_url = oai_endpoint + '?verb=ListMetadataFormats'
     requestHelper = RequestHelper(url=oai_listmetadata_url,
                                   logInst=self.logger)
     requestHelper.setAcceptType(AcceptTypes.xml)
     response_type, xml = requestHelper.content_negotiate(self.metric_id)
     # No response object, or one without a body, cannot be parsed.
     content = getattr(xml, 'content', None)
     if not content:
         self.logger.info(
             '{0} : No XML response retrieved from OAI-PMH endpoint'.format(
                 self.metric_id))
         return schemas
     try:
         root = etree.fromstring(content)
     except (etree.XMLSyntaxError, ValueError):
         self.logger.info(
             '{0} : Could not parse XML response retrieved from OAI-PMH endpoint'
             .format(self.metric_id))
         return schemas
     metadata_nodes = root.xpath(
         '//oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat',
         namespaces=OAIMetadataProvider.oai_namespaces)
     for node in metadata_nodes:
         ele = etree.XPathEvaluator(
             node, namespaces=OAIMetadataProvider.oai_namespaces).evaluate
         # <metadataPrefix>oai_dc</metadataPrefix>
         metadata_prefix = ele('string(oai:metadataPrefix/text())')
         #<schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
         metadata_schema = ele('string(oai:schema/text())').strip()
         # Recorded for every format, including the ones skipped below.
         self.namespaces.append(metadata_schema)
         # TODO there can be more than one OAI-PMH endpoint, https://www.re3data.org/repository/r3d100011221
         if any(marker in metadata_schema for marker in agnostic_markers):
             self.logger.info(
                 '{0} : Skipping domain-agnostic standard listed in OAI-PMH - {1}'
                 .format(self.metric_id, metadata_prefix))
         else:
             schemas[metadata_prefix] = [metadata_schema]
     return schemas
Beispiel #3
0
    def parse_metadata(self):
        """Request the target URL as XML and collect its schema locations.

        The source name is chosen from ``self.link_type``. When content
        negotiation yields XML, every whitespace-separated token of every
        ``xsi:schemaLocation`` attribute in the document is added to
        ``self.namespaces``.

        Returns:
            tuple ``(source_name, None)``; no metadata mapping is implemented
            yet, so the second element is always ``None``.
        """
        XSI = "http://www.w3.org/2001/XMLSchema-instance"
        if self.link_type == 'embedded':
            source_name = self.getEnumSourceNames().LINKED_DATA.value
        elif self.link_type == 'guessed':
            source_name = self.getEnumSourceNames().GUESSED_XML.value
        elif self.link_type == 'negotiated':
            source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
        else:
            source_name = self.getEnumSourceNames().TYPED_LINK.value
        dc_core_metadata = None
        requestHelper = RequestHelper(self.target_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        neg_source, xml_response = requestHelper.content_negotiate(
            'FsF-F2-01M')
        if requestHelper.getHTTPResponse() is not None:
            self.logger.info(
                'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.
                format(source_name))
            if neg_source != 'xml':
                self.logger.info(
                    'FsF-F2-01M : Expected XML but content negotiation responded -: '
                    + str(neg_source))
            else:
                tree = lxml.etree.XML(xml_response)
                schema_locations = set(
                    tree.xpath("//*/@xsi:schemaLocation",
                               namespaces={'xsi': XSI}))
                # Accumulate instead of overwriting: assigning inside the loop
                # kept only one arbitrary attribute of the set and dropped any
                # namespaces collected before. str.split() also avoids the
                # empty tokens a single-character '\s' split produces.
                for schema_location in sorted(schema_locations):
                    self.namespaces.extend(schema_location.split())
                #TODO: implement some XSLT to handle the XML..

        return source_name, dc_core_metadata
Beispiel #4
0
    def getMetadataStandards(self):
        """List the output schemas advertised by the OGC CSW endpoint.

        Sends a ``GetCapabilities`` request to ``self.endpoint`` (any query
        string is dropped first) and reads the values of the ``outputSchema``
        parameter. Every schema URI not yet in ``self.namespaces`` is appended
        there and added to the result.

        Returns:
            dict mapping each newly seen schema URI to itself. Empty when
            there is no response or it cannot be parsed.
        """
        csw_endpoint = self.endpoint.split('?')[0]
        csw_listmetadata_url = csw_endpoint + '?service=CSW&request=GetCapabilities'
        requestHelper = RequestHelper(url=csw_listmetadata_url,
                                      logInst=self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        response_type, xml = requestHelper.content_negotiate(self.metric_id)
        schemas = {}
        if xml:
            try:
                root = etree.fromstring(requestHelper.response_content)
                metadata_nodes = root.xpath(
                    '//ows:Parameter[@name="outputSchema"]/ows:Value',
                    namespaces=OGCCSWMetadataProvider.csw_namespaces)
                for node in metadata_nodes:
                    if node.text and node.text not in self.namespaces:
                        schema_uri = str(node.text)
                        self.namespaces.append(schema_uri)
                        schemas[schema_uri] = schema_uri
            except Exception:
                # Still best effort, but no longer a bare except, so
                # KeyboardInterrupt / SystemExit propagate.
                self.logger.info(
                    '{0} : Could not parse XML response retrieved from OGC CSW endpoint'
                    .format(self.metric_id))

        return schemas
    def lookup_re3data(self):
        """Fetch the re3data repository record belonging to the DataCite client.

        Requires ``self.client_id`` and ``self.pid_scheme``. The client id is
        translated to a re3data DOI; if that value is a valid DOI the re3data
        API is searched with it, the first ``<link>`` of the answer is
        followed, and the response is stored in ``self.re3metadata_raw``
        before ``self.parseRepositoryMetadata()`` is called. Any error while
        handling the API answer is logged as a warning.
        """
        if not (self.client_id and self.pid_scheme):
            return

        # {client_id,re3doi}
        re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(self.client_id)
        if re3doi:
            if idutils.is_doi(re3doi):
                #https://doi.org/10.17616/R3XS37
                short_re3doi = idutils.normalize_pid(re3doi, scheme='doi')
            else:
                # not a DOI, treat it like a missing entry
                re3doi = None

        if not re3doi:
            self.logger.warning(
                'FsF-R1.3-01M : No DOI of client id is available from datacite api'
            )
            return

        # pid -> clientId -> repo doi-> re3id, and query repository metadata from re3api
        self.logger.info(
            'FsF-R1.3-01M : Found match re3data (DOI-based) record')
        # https://re3data.org/api/beta/repositories?query=
        search_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi
        search_request = RequestHelper(url=search_url)
        search_request.setAcceptType(AcceptTypes.xml)
        re_source, xml = search_request.content_negotiate(metric_id='RE3DATA')
        try:
            if isinstance(xml, bytes):
                xml = xml.decode().encode()
            root = etree.fromstring(xml)
            #<link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
            first_link = root.xpath('//link')[0]
            re3link = first_link.attrib['href']
            if re3link is not None:
                self.logger.info(
                    'FsF-R1.3-01M : Found match re3data metadata record -: ' +
                    str(re3link))
                # query repository metadata
                record_request = RequestHelper(url=re3link)
                record_request.setAcceptType(AcceptTypes.xml)
                re3_source, re3_response = record_request.content_negotiate(
                    metric_id='RE3DATA')
                self.re3metadata_raw = re3_response
                self.parseRepositoryMetadata()
        except Exception as e:
            self.logger.warning(
                'FsF-R1.3-01M : Malformed re3data (DOI-based) record received: '
                + str(e))
    def parse_metadata(self):
        """Retrieve DataCite JSON for ``self.pid_url`` and map it.

        The negotiated document is run through the jmespath expression in
        ``self.metadata_mapping.value``. On a non-empty result the DataCite
        namespace is recorded, missing creators are assembled from the
        first/last name lists, related resources are stripped of ``None``
        values, and list values are joined into comma-separated strings
        except for keys in ``self.exclude_conversion``.

        Returns:
            tuple ``(source_name, dcite_metadata)``. ``source_name`` stays
            ``None`` unless the mapping produced a non-empty result.
        """
        source_name = None
        dcite_metadata = {}
        self.logger.info('FsF-F2-01M : Trying to retrieve datacite metadata')
        requestHelper = RequestHelper(self.pid_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.datacite_json)
        neg_source, ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
        if not ext_meta:
            return source_name, dcite_metadata

        try:
            dcite_metadata = jmespath.search(self.metadata_mapping.value,
                                             ext_meta)
            if dcite_metadata:
                self.namespaces.append('http://datacite.org/schema/')
                source_name = self.getEnumSourceNames().DATACITE_JSON.value

                if dcite_metadata['creator'] is None:
                    given_names = dcite_metadata['creator_first']
                    family_names = dcite_metadata['creator_last']
                    # default type of creator is []
                    both_lists = isinstance(given_names, list) and isinstance(
                        family_names, list)
                    if both_lists and len(given_names) == len(family_names):
                        dcite_metadata['creator'] = [
                            given + " " + family
                            for given, family in zip(given_names, family_names)
                        ]

                related = dcite_metadata.get('related_resources')
                if related:
                    self.logger.info(
                        'FsF-I3-01M : {0} related resource(s) extracted from -: {1}'
                        .format(len(related), source_name))
                    cleaned_relations = []
                    for relation in related:
                        scheme_uri = relation.get('scheme_uri')
                        if scheme_uri:
                            self.namespaces.append(scheme_uri)
                        # keep only the properties that carry a value
                        cleaned_relations.append({
                            prop: val
                            for prop, val in relation.items()
                            if val is not None
                        })
                    dcite_metadata['related_resources'] = cleaned_relations
                else:
                    self.logger.info(
                        'FsF-I3-01M : No related resource(s) found in Datacite metadata'
                    )

                # convert all values (list type) into string except 'creator','license','related_resources'
                for field, content in dcite_metadata.items():
                    if not isinstance(content, list):
                        continue
                    if field in self.exclude_conversion:
                        continue
                    dcite_metadata[field] = ', '.join(
                        str(item) for item in content)
        except Exception as e:
            self.logger.exception(
                'Failed to extract Datacite Json -: {}'.format(e))
        return source_name, dcite_metadata
    def evaluate(self):
        """Evaluate identifier persistence (metric FsF-F1-02D).

        Resolves the identifier (``self.fuji.pid_url``, or ``self.fuji.id``
        for plain URLs) and records the landing page URL, origin, HTML and
        content type on ``self.fuji``. Signposting links from the ``Link``
        response header are collected, and a ``cite-as`` link may supply the
        PID scheme when none is known yet. If a PID scheme is known the first
        test passes; the second one passes when the landing page returned
        HTTP 200.

        Results are written to ``self.result``, ``self.output``,
        ``self.score`` and ``self.maturity``; nothing is returned.
        """
        self.result = Persistence(id=self.metric_number,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))
        check_url = None
        signposting_pid = None
        if self.fuji.id_scheme is not None:
            check_url = self.fuji.pid_url
        if self.fuji.id_scheme == 'url':
            self.fuji.origin_url = self.fuji.id
            check_url = self.fuji.id
        if check_url:
            # ======= RETRIEVE METADATA FROM LANDING PAGE =======
            requestHelper = RequestHelper(check_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.html_xml)  # request
            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
                'FsF-F1-02D', ignore_html=False)
            if 'html' not in str(requestHelper.content_type):
                self.logger.info(
                    'FsF-F2-01M :Content type is ' +
                    str(requestHelper.content_type) +
                    ', therefore skipping embedded metadata (microdata, RDFa) tests'
                )
                self.fuji.extruct_result = {}
            if not isinstance(self.fuji.extruct_result, dict):
                self.fuji.extruct_result = {}
            r = requestHelper.getHTTPResponse()
            response_status = requestHelper.response_status

            if r:
                self.fuji.landing_url = requestHelper.redirect_url
                # in case the test has been repeated because a PID has been found in metadata
                if self.fuji.repeat_pid_check == True:
                    if self.fuji.landing_url != self.fuji.input_id:
                        self.logger.warning(
                            'FsF-F1-02D : Landing page URL resolved from PID found in metadata does not match with input URL'
                        )
                        self.logger.warning(
                            'FsF-F2-01M : Seems to be a catalogue entry or alternative representation of the data set, landing page URL resolved from PID found in metadata does not match with input URL'
                        )

                if self.fuji.landing_url not in [
                        'https://datacite.org/invalid.html'
                ]:

                    if response_status == 200:
                        # identify signposting links in header
                        header_link_string = requestHelper.getHTTPResponse(
                        ).getheader('Link')
                        if header_link_string is not None:
                            self.logger.info(
                                'FsF-F1-02D : Found signposting links in response header of landingpage'
                            )

                            for preparsed_link in header_link_string.split(
                                    ','):
                                found_type, type_match = None, None
                                found_rel, rel_match = None, None
                                found_formats, formats_match = None, None
                                parsed_link = preparsed_link.strip().split(';')
                                found_link = parsed_link[0].strip()
                                for link_prop in parsed_link[1:]:
                                    # Parameters normally follow "; ", so the
                                    # blank must go before the prefix checks;
                                    # unstripped they never matched.
                                    link_prop = str(link_prop).strip()
                                    if link_prop.startswith('rel="'):
                                        rel_match = re.search(
                                            'rel="(.*?)"', link_prop)
                                    elif link_prop.startswith('type="'):
                                        type_match = re.search(
                                            'type="(.*?)"', link_prop)
                                    elif link_prop.startswith('formats="'):
                                        formats_match = re.search(
                                            'formats="(.*?)"', link_prop)
                                if type_match:
                                    found_type = type_match[1]
                                if rel_match:
                                    found_rel = rel_match[1]
                                if formats_match:
                                    found_formats = formats_match[1]
                                signposting_link_dict = {
                                    # drop the enclosing angle brackets
                                    'url': found_link[1:-1],
                                    'type': found_type,
                                    'rel': found_rel,
                                    'profile': found_formats
                                }
                                if found_link:
                                    self.fuji.signposting_header_links.append(
                                        signposting_link_dict)

                        # check if there is a cite-as signposting link
                        if self.fuji.pid_scheme is None:
                            signposting_pid_link = self.fuji.get_signposting_links(
                                'cite-as')
                            if signposting_pid_link:
                                signposting_pid = signposting_pid_link[0].get(
                                    'url')
                            if signposting_pid:
                                # Instantiate the helper for the cite-as URL;
                                # the bare class was inspected before, which
                                # says nothing about this identifier.
                                signidhelper = IdentifierHelper(
                                    signposting_pid)
                                found_id = signidhelper.preferred_schema
                                if signidhelper.is_persistent:
                                    self.logger.info(
                                        'FsF-F1-02D : Found object identifier in signposting header links'
                                    )
                                    self.fuji.pid_scheme = found_id

                        up = urlparse(self.fuji.landing_url)
                        self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                            uri=up)
                        self.fuji.landing_html = requestHelper.getResponseContent(
                        )
                        self.fuji.landing_content_type = requestHelper.content_type

                        self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
                        self.output.resolvable_status = True
                        self.logger.info(
                            'FsF-F1-02D : Object identifier active (status code = 200)'
                        )
                        self.fuji.isMetadataAccessible = True
                    else:
                        # Any other status (401/402/403 included) is handled
                        # the same way: the landing page is not usable.
                        self.fuji.isMetadataAccessible = False
                        self.logger.warning(
                            "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                            .format(code=response_status))
                else:
                    self.logger.warning(
                        "FsF-F1-02D : Invalid DOI, identifier resolved to -: {code}"
                        .format(code=self.fuji.landing_url))

            else:
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "FsF-F1-02D :Resource inaccessible, no response received from -: {}"
                    .format(check_url))
                if response_status in [401, 402, 403]:
                    self.logger.warning(
                        "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                        .format(code=response_status))
        else:
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identfier -: {}"
                .format(self.fuji.id))

        if self.fuji.pid_scheme is not None:
            if signposting_pid is None:
                idhelper = IdentifierHelper(self.fuji.id)
                self.fuji.pid_url = idhelper.identifier_url
            else:
                # signposting_pid is the cite-as URL string itself; indexing
                # it with [0] only yielded its first character.
                self.fuji.pid_url = signposting_pid
            self.output.pid_scheme = self.fuji.pid_scheme

            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0.5, 'pass')
            self.score.earned = 0.5
            self.maturity = 1
            if self.fuji.isMetadataAccessible:
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 0.5, 'pass')
                self.maturity = 3
                self.result.test_status = 'pass'
                self.score.earned = self.total_score  # identifier should be based on a persistence scheme and resolvable

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme -: {}'.format(
                    self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme -: {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.maturity = self.maturity
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
Beispiel #8
0
    def evaluate(self):
        """Evaluate identifier persistence (metric FsF-F1-02D).

        Resolves the identifier (through its PID scheme, or directly for plain
        URLs), records landing page details on ``self.fuji``, collects
        signposting links from the ``Link`` response header and lets a
        ``cite-as`` link supply the PID scheme when none is known yet. The
        metric passes when a PID scheme is known; the full score is earned
        when the landing page additionally returned HTTP 200.

        Results are written to ``self.result``, ``self.output`` and
        ``self.score``; nothing is returned.
        """
        self.result = Persistence(id=self.fuji.count,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))

        # Stays None when the identifier is neither a PID nor a URL; it was
        # unbound in that case and raised further down.
        check_url = None
        if self.fuji.pid_scheme is not None:
            check_url = idutils.to_url(self.fuji.id,
                                       scheme=self.fuji.pid_scheme)
        elif self.fuji.id_scheme == 'url':
            check_url = self.fuji.id

        # ======= RETRIEVE METADATA FROM LANDING PAGE =======
        r = None
        signposting_pid = None
        if check_url:
            requestHelper = RequestHelper(check_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.html)  # request
            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
                'FsF-F1-02D')
            r = requestHelper.getHTTPResponse()
        if r:
            self.fuji.landing_url = requestHelper.redirect_url
            if r.status == 200:
                # identify signposting links in header
                header_link_string = requestHelper.getHTTPResponse().getheader(
                    'Link')
                if header_link_string is not None:
                    self.logger.info(
                        'FsF-F1-02D : Found signposting links in response header of landingpage'
                    )

                    for preparsed_link in header_link_string.split(','):
                        found_type, type_match = None, None
                        found_rel, rel_match = None, None
                        parsed_link = preparsed_link.strip().split(';')
                        found_link = parsed_link[0].strip()
                        for link_prop in parsed_link[1:]:
                            # Parameters normally follow "; ", so strip the
                            # blank before the prefix checks.
                            link_prop = str(link_prop).strip()
                            if link_prop.startswith('rel="'):
                                rel_match = re.search('rel="(.*?)"',
                                                      link_prop)
                            elif link_prop.startswith('type="'):
                                type_match = re.search('type="(.*?)"',
                                                       link_prop)
                        if type_match:
                            found_type = type_match[1]
                        if rel_match:
                            found_rel = rel_match[1]
                        signposting_link_dict = {
                            # drop the enclosing angle brackets
                            'url': found_link[1:-1],
                            'type': found_type,
                            'rel': found_rel
                        }
                        if found_link:
                            self.fuji.signposting_header_links.append(
                                signposting_link_dict)

                # check if there is a cite-as signposting link
                if self.fuji.pid_scheme is None:
                    signposting_pid_link = self.fuji.get_signposting_links(
                        'cite-as')
                    if signposting_pid_link:
                        signposting_pid = signposting_pid_link[0].get('url')
                    if signposting_pid:
                        # Pass the whole URL; [0] only passed its first
                        # character to the scheme detection.
                        found_ids = idutils.detect_identifier_schemes(
                            signposting_pid)
                        if len(found_ids) > 1:
                            if 'url' in found_ids:
                                found_ids.remove('url')
                            found_id = found_ids[0]
                            if found_id in Mapper.VALID_PIDS.value:
                                self.logger.info(
                                    'FsF-F1-02D : Found object identifier in signposting header links'
                                )
                                self.fuji.pid_scheme = found_id

                up = urlparse(self.fuji.landing_url)
                self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                    uri=up)
                self.fuji.landing_html = requestHelper.getResponseContent()

                self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
                self.output.resolvable_status = True
                self.logger.info(
                    'FsF-F1-02D : Object identifier active (status code = 200)'
                )
                self.fuji.isMetadataAccessible = True
            else:
                # The 200 check above reads `status`, so the same attribute is
                # used here; `status_code` was inconsistent with it.
                # 401/402/403 and every other status are treated alike.
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "Resource inaccessible, identifier returned http status code: {code}"
                    .format(code=r.status))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, no response received from: {}"
                .format(check_url))

        if self.fuji.pid_scheme is not None:
            if signposting_pid is None:
                self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                                   scheme=self.fuji.pid_scheme)
            else:
                # the cite-as URL itself, not its first character
                self.fuji.pid_url = signposting_pid
            self.output.pid_scheme = self.fuji.pid_scheme
            self.result.test_status = 'pass'
            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
            if self.fuji.isMetadataAccessible:
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
                self.score.earned = self.total_score  # identifier should be based on a persistence scheme and resolvable

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme - {}'.format(
                    self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
Beispiel #9
0
    def parse_metadata(self):
        """Request the target URL as XML and map it to metadata.

        Known envelopes (OAI-PMH, METS, OGC CSW GetRecords /
        GetRecordById) are unpacked to the contained metadata element first.
        The element's namespace and all ``xsi:schemaLocation`` tokens are
        added to ``self.namespaces``. The mapping is then chosen from the
        root tag (DDI codeBook, Dublin Core, MODS, EML, ISO 19115) or, failing
        that, from a DataCite namespace. Values mapped from a METS envelope
        fill keys the inner metadata did not provide.

        Returns:
            tuple ``(source_name, xml_metadata)``; ``xml_metadata`` is
            ``None`` when neither a mapping nor envelope metadata applied.
        """
        xml_metadata = None
        xml_mapping = None
        metatree = None
        envelope_metadata = {}
        XSI = "http://www.w3.org/2001/XMLSchema-instance"
        if self.link_type == 'embedded':
            source_name = self.getEnumSourceNames().LINKED_DATA.value
        elif self.link_type == 'guessed':
            source_name = self.getEnumSourceNames().GUESSED_XML.value
        elif self.link_type == 'negotiated':
            source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
        else:
            # 'linked' and every unknown link type
            source_name = self.getEnumSourceNames().TYPED_LINK.value
        requestHelper = RequestHelper(self.target_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        neg_source, xml_response = requestHelper.content_negotiate(
            'FsF-F2-01M')
        if requestHelper.getHTTPResponse() is not None:
            self.logger.info(
                'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.
                format(source_name))
            if neg_source != 'xml':
                self.logger.info(
                    'FsF-F2-01M : Expected XML but content negotiation responded -: '
                    + str(neg_source))
            else:
                parser = lxml.etree.XMLParser(strip_cdata=False)
                tree = lxml.etree.XML(xml_response, parser)
                root_element = tree.tag
                if root_element.endswith('}OAI-PMH'):
                    self.logger.info(
                        'FsF-F2-01M : Found OAI-PMH type XML envelope, unpacking \'metadata\' element for further processing'
                    )
                    metatree = tree.find('.//{*}metadata/*')
                elif root_element.endswith('}mets'):
                    self.logger.info(
                        'FsF-F2-01M : Found METS type XML envelope, unpacking all \'mods\' elements for further processing'
                    )
                    envelope_metadata = self.get_mapped_xml_metadata(
                        tree, Mapper.XML_MAPPING_METS.value)
                    metatree = tree.find('.//{*}dmdSec/{*}mdWrap/{*}xmlData/*')
                elif root_element.endswith('}GetRecordsResponse'):
                    self.logger.info(
                        'FsF-F2-01M : Found OGC CSW GetRecords type XML envelope, unpacking \'SearchResults\' element for further processing'
                    )
                    metatree = tree.find('.//{*}SearchResults/*')
                elif root_element.endswith('}GetRecordByIdResponse'):
                    self.logger.info(
                        'FsF-F2-01M : Found OGC CSW GetRecordByIdResponse type XML envelope, unpacking metadata element for further processing'
                    )
                    metatree = tree.find('.//*')
                else:
                    metatree = tree
                if metatree is not None:
                    root_namespace = None
                    nsmatch = re.match(r'^\{(.+)\}(.+)$', metatree.tag)
                    schema_locations = set(
                        metatree.xpath("//*/@xsi:schemaLocation",
                                       namespaces={'xsi': XSI}))
                    # Accumulate instead of overwriting: assigning inside the
                    # loop kept only one arbitrary attribute of the set and
                    # dropped earlier namespaces. str.split() also avoids
                    # empty tokens.
                    for schema_location in sorted(schema_locations):
                        self.namespaces.extend(schema_location.split())
                    if nsmatch:
                        root_namespace = nsmatch[1]
                        root_element = nsmatch[2]
                        self.namespaces.append(root_namespace)
                    if root_element == 'codeBook':
                        xml_mapping = Mapper.XML_MAPPING_DDI_CODEBOOK.value
                        self.logger.info(
                            'FsF-F2-01M : Identified DDI codeBook XML based on root tag'
                        )
                    elif root_element == 'dc':
                        xml_mapping = Mapper.XML_MAPPING_DUBLIN_CORE.value
                        self.logger.info(
                            'FsF-F2-01M : Identified Dublin Core XML based on root tag'
                        )
                    elif root_element == 'mods':
                        xml_mapping = Mapper.XML_MAPPING_MODS.value
                        self.logger.info(
                            'FsF-F2-01M : Identified MODS XML based on root tag'
                        )
                    elif root_element == 'eml':
                        xml_mapping = Mapper.XML_MAPPING_EML.value
                        self.logger.info(
                            'FsF-F2-01M : Identified EML XML based on root tag'
                        )
                    elif root_element == 'MD_Metadata':
                        xml_mapping = Mapper.XML_MAPPING_GCMD_ISO.value
                        self.logger.info(
                            'FsF-F2-01M : Identified ISO 19115 XML based on root tag'
                        )
                    elif root_namespace:
                        if 'datacite.org/schema' in root_namespace:
                            xml_mapping = Mapper.XML_MAPPING_DATACITE.value
                            self.logger.info(
                                'FsF-F2-01M : Identified DataCite XML based on namespace'
                            )

        if xml_mapping and metatree is not None:
            xml_metadata = self.get_mapped_xml_metadata(metatree, xml_mapping)

        if envelope_metadata:
            # Envelope values may exist without a recognised inner format;
            # the membership test below would fail on None.
            if xml_metadata is None:
                xml_metadata = {}
            for envelope_key, envelope_values in envelope_metadata.items():
                if envelope_key not in xml_metadata:
                    xml_metadata[envelope_key] = envelope_values
        return source_name, xml_metadata