Example #1
0
def test_extracting_identifiers_from_urls():
    """Exercise dataone.extractIdentifierFromFullURL on bad and good inputs."""
    extract = dataone.extractIdentifierFromFullURL

    # Inputs for which no identifier should come back
    rejected_inputs = ['asdf', 1, '1', 'http://google.com']

    for rejected in rejected_inputs:
        assert extract(rejected) is None

    # Full URLs paired with the identifier expected to be pulled out of them
    url_to_identifier = [
        ('https://cn.dataone.org/cn/v1/meta/some_pid', 'some_pid'),
        ('https://cn.dataone.org/cn/v1/meta/kgordon.23.30', 'kgordon.23.30'),
        ('https://cn.dataone.org/cn/v1/resolve/kgordon.23.30', 'kgordon.23.30'),
        ('https://cn.dataone.org/cn/v1/object/kgordon.23.30', 'kgordon.23.30'),
        ('https://cn.dataone.org/cn/v2/object/kgordon.23.30', 'kgordon.23.30'),
    ]

    for full_url, expected_identifier in url_to_identifier:
        assert extract(full_url) == expected_identifier
Example #2
0
    def addDatasetTriples(self, dataset_node, doc):
        """Add the triples describing a dataset, using fields read from doc.

        For each field found in `doc`, a triple is added via `self.add` with
        `dataset_node` as the subject: rdf:type, rdfs:label (title),
        geolink:description (abstract), geolink:hasGeometryAsWktLiteral
        (bounding coordinates), geolink:hasStartDate / geolink:hasEndDate,
        prov:wasRevisionOf (obsoletes) and geolink:hasLandingPage. Identifier
        triples are added through `self.addIdentifierTriples` and digital
        objects through `self.addDigitalObject`.

        Arguments:
            dataset_node: the node used as the subject of the added triples.
            doc: an element queried with find()/findall() and paths such as
                "./str[@name='title']" (presumably a Solr result document;
                confirm against the caller).

        Raises:
            Exception: if self.model is None.
        """

        if self.model is None:
            raise Exception("Model not found.")

        identifier = dataone.extractDocumentIdentifier(doc)
        identifier_esc = urllib.quote_plus(identifier)

        # type Dataset
        self.add(dataset_node, 'rdf:type', 'geolink:Dataset')

        # Title
        title_element = doc.find("./str[@name='title']")

        if title_element is not None:
            self.add(dataset_node, 'rdfs:label', RDF.Node(title_element.text))

        # Add geolink:Identifier
        self.addIdentifierTriples(dataset_node, identifier)

        # Abstract
        abstract_element = doc.find("./str[@name='abstract']")

        if abstract_element is not None:
            self.add(dataset_node, 'geolink:description', RDF.Node(abstract_element.text))

        # Spatial Coverage
        bound_north = doc.find("./float[@name='northBoundCoord']")
        bound_east = doc.find("./float[@name='eastBoundCoord']")
        bound_south = doc.find("./float[@name='southBoundCoord']")
        bound_west = doc.find("./float[@name='westBoundCoord']")

        # Geometry is only written when all four bounds are present
        if all(ele is not None for ele in [bound_north, bound_east, bound_south, bound_west]):
            if bound_north.text == bound_south.text and bound_west.text == bound_east.text:
                # Degenerate bounding box: north == south and west == east
                # NOTE(review): this writes (north east) while the polygon
                # below writes its pairs as (west/east north/south); confirm
                # which axis order is intended.
                wktliteral = "POINT (%s %s)" % (bound_north.text, bound_east.text)
            else:
                # Corners in order: NW, NE, SE, SW. Each pair is separated by
                # a space; only the pairs themselves are comma-separated.
                # NOTE(review): the ring is not closed (the first point is
                # not repeated at the end); confirm whether consumers of this
                # literal require a closed ring.
                wktliteral = "POLYGON ((%s %s, %s %s, %s %s, %s %s))" % (
                    bound_west.text, bound_north.text,
                    bound_east.text, bound_north.text,
                    bound_east.text, bound_south.text,
                    bound_west.text, bound_south.text)

            self.add(dataset_node, 'geolink:hasGeometryAsWktLiteral', RDF.Node(wktliteral))

        # Temporal Coverage
        begin_date = doc.find("./date[@name='beginDate']")
        end_date = doc.find("./date[@name='endDate']")

        if begin_date is not None:
            self.add(dataset_node, 'geolink:hasStartDate', RDF.Node(begin_date.text))

        if end_date is not None:
            self.add(dataset_node, 'geolink:hasEndDate', RDF.Node(end_date.text))

        # Obsoletes as PROV#wasRevisionOf
        obsoletes_node = doc.find("./str[@name='obsoletes']")

        if obsoletes_node is not None:
            other_document_esc = urllib.quote_plus(obsoletes_node.text)
            self.add(dataset_node, 'prov:wasRevisionOf', RDF.Uri(self.repository.ns['d1dataset'] + other_document_esc))

        # Landing page
        self.add(dataset_node, 'geolink:hasLandingPage', RDF.Uri("https://search.dataone.org/#view/" + identifier_esc))

        # Digital Objects
        # If this document has a resource map, get digital objects from there
        # Otherwise, fall back to the dataUrl field

        resource_map_identifiers = doc.findall("./arr[@name='resourceMap']/str")

        if resource_map_identifiers:
            for resource_map_node in resource_map_identifiers:
                resource_map_identifier = resource_map_node.text

                digital_objects = dataone.getAggregatedIdentifiers(resource_map_identifier)

                for digital_object in digital_objects:
                    digital_object_identifier = urllib.unquote_plus(digital_object)
                    self.addDigitalObject(identifier, digital_object_identifier)
        else:
            # If no resourceMap or documents field, at least add the metadata
            # file as a digital object
            # dataUrl e.g. https://cn.dataone.org/cn/v1/resolve/doi%3A10.6073%2FAA%2Fknb-lter-cdr.70061.123

            data_url_node = doc.find("./str[@name='dataUrl']")

            if data_url_node is not None:
                data_url = data_url_node.text
                digital_object = dataone.extractIdentifierFromFullURL(data_url)

                # extractIdentifierFromFullURL gives back None when it cannot
                # pull an identifier out of the URL; unquoting None would
                # raise, so skip the digital object in that case.
                if digital_object is not None:
                    digital_object = urllib.unquote_plus(digital_object)

                    self.addDigitalObject(identifier, digital_object)