Esempio n. 1
0
    for store_name in stores:
        stores[store_name].delete_all()

    stores = multi_store.MultiStore(stores, namespaces)
    vld = validator.Validator()

    page_xml = util.getXML(query)
    documents = page_xml.findall(".//doc")

    for doc in documents:
        identifier = doc.find(".//str[@name='identifier']").text

        print identifier

        scimeta = dataone.getScientificMetadata(identifier, cache=True)

        if scimeta is None:
            continue

        records = processing.extractCreators(identifier, scimeta)

        # Add records and organizations
        people = [p for p in records if 'type' in p and p['type'] == 'person']
        organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

        # Always do organizations first, so peoples' organization URIs exist
        for organization in organizations:
            organization = vld.validate(organization)
            stores.addOrganization(organization)
Esempio n. 2
0
    def addDataset(self, identifier, doc=None):
        """Adds a dataset to the repository.

        Parameters:
        -----------
        identifier : str
            Non-urlencoded DataOne identifier

        doc : XML Element
            An XML element containing a result from the Solr index which
            contains a number of fields relating to a dataset.

        """

        if self.model is not None:
            raise Exception("Model existed when addDataset was called. This means the last Model wasn't cleaned up after finishing.")

        self.createModel()

        # Get Solr fields if they weren't passed in
        if doc is None:
            doc = dataone.getSolrIndexFields(identifier)

        identifier = dataone.extractDocumentIdentifier(doc)
        identifier_esc = urllib.quote_plus(identifier)

        dataset_node = RDF.Uri(self.repository.ns['d1dataset'] + identifier_esc)

        self.add(dataset_node, 'rdf:type', 'geolink:Dataset')

        # Delete if dataset is already in graph
        if self.datasetExists(identifier):
            logging.info("Dataset with identifier %s already exists. Deleting then re-adding.", identifier)
            self.deleteDataset(identifier)

        scimeta = dataone.getScientificMetadata(identifier)
        records = processing.extractCreators(identifier, scimeta)

        vld = validator.Validator()

        # Add Dataset triples first, we'll use them when we add people
        # to match to existing people by the current dataset's 'obsoletes' field

        self.addDatasetTriples(dataset_node, doc)

        # Add people and organizations
        people = [p for p in records if 'type' in p and p['type'] == 'person']
        organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

        # Always do organizations first, so peoples' organization URIs exist
        for organization in organizations:
            organization = vld.validate(organization)
            self.addOrganization(organization)

        for person in people:
            person = vld.validate(person)
            self.addPerson(person)

        # Commit or reject the model here
        if self.model is None:
            raise Exception("Model was None. It should have been an RDF.Model.")

        self.insertModel()
        self.model = None  # Remove the model since we're done
Esempio n. 3
0
def main():
    # Settings
    config = util.loadJSONFile('settings.json')

    if 'last_run' not in config:
        print "Last run datetime not found in settings.json. Exiting."
        sys.exit()

    # Create from and to strings
    # from_string = config['last_run']
    # to_string = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S.0Z")
    from_string = "2015-01-01T15:00:00.0Z"
    to_string = "2015-01-06T16:05:00.0Z"

    from_string = "2015-01-06T16:00:00.0Z"
    to_string = "2015-01-06T16:05:00.0Z"

    # from_string =   "2015-01-01T15:00:00.0Z"
    # to_string   =   "2015-01-06T16:05:00.0Z"

    from_string = "2015-03-15T23:21:15.567Z"
    to_string = "2015-05-30T23:21:15.567Z"

    # Load scimeta cache
    cache_dir = "/Users/mecum/src/d1dump/documents/"
    identifier_map = util.createIdentifierMap(
        "/Users/mecum/src/d1dump/idents.csv")
    print "Read in %d identifier mappings." % len(identifier_map)

    # Load formats map
    print "Loading formats map from GitHub..."
    formats_map = util.loadFormatsMap()
    print "Loaded %d format URIs from GitHub." % len(formats_map)

    # Load triple stores
    namespaces = {
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dcterms": "http://purl.org/dc/terms/",
        "datacite": "http://purl.org/spar/datacite/",
        "owl": "http://www.w3.org/2002/07/owl#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "glview": "http://schema.geolink.org/dev/view/",
        "d1people": "https://dataone.org/person/",
        "d1org": "https://dataone.org/organization/",
        "d1resolve": "https://cn.dataone.org/cn/v1/resolve/",
        "prov": "http://www.w3.org/ns/prov#",
        "d1node": "https://cn.dataone.org/cn/v1/node/",
        "d1landing": "https://search.dataone.org/#view/",
        "d1repo": "https://cn.dataone.org/cn/v1/node/"
    }

    graph_dict = {
        'people':
        graph.Graph("http://virtuoso/", "8890", 'geolink', namespaces),
        'organizations':
        graph.Graph("http://virtuoso/", "8890", 'geolink', namespaces),
        'datasets':
        graph.Graph("http://virtuoso/", "8890", 'geolink', namespaces)
    }

    graphs = multi_store.MultiStore(graph_dict, namespaces)

    # Create a record validator
    vld = validator.Validator()

    query_string = dataone.createSinceQuery(from_string, to_string, None, 0)
    num_results = dataone.getNumResults(query_string)

    # Calculate the number of pages we need to get to get all results
    page_size = 1000
    num_pages = num_results / page_size
    if num_results % page_size > 0:
        num_pages += 1

    # Establish which fields we want to get from the Solr index
    fields = [
        "identifier", "title", "abstract", "author", "authorLastName",
        "origin", "submitter", "rightsHolder", "documents", "resourceMap",
        "authoritativeMN", "obsoletes", "northBoundCoord", "eastBoundCoord",
        "southBoundCoord", "westBoundCoord", "startDate", "endDate",
        "datasource", "replicaMN", "resourceMap"
    ]

    print "Found %d documents over %d page(s)." % (num_results, num_pages)

    sys.exit()

    # Process each page
    for page in range(1, num_pages + 1):
        print "Processing page %d." % page

        page_xml = dataone.getSincePage(from_string, to_string, fields, page,
                                        page_size)
        docs = page_xml.findall(".//doc")

        for doc in docs:
            identifier = doc.find("./str[@name='identifier']").text
            print "Adding dataset for %s. " % identifier

            # Skip if it's already in the datasets graph
            if graphs.datasetExists(identifier):
                print "Dataset %s already in graph. Continuing." % identifier
                # continue

            # continue
            scimeta = dataone.getScientificMetadata(identifier,
                                                    identifier_map,
                                                    cache_dir,
                                                    cache=True)

            if scimeta is None:
                print "Unable to get scimeta for %s. Skipping." % identifier
                continue

            records = processing.extractCreators(identifier, scimeta)

            # Add records and organizations
            people = [
                p for p in records if 'type' in p and p['type'] == 'person'
            ]
            organizations = [
                o for o in records
                if 'type' in o and o['type'] == 'organization'
            ]

            # Always do organizations first, so peoples' organization URIs exist
            for organization in organizations:
                organization = vld.validate(organization)
                graphs.addOrganization(organization)

            for person in people:
                person = vld.validate(person)
                graphs.addPerson(person)

            graphs.addDataset(doc, scimeta, formats_map)

    graphs.save()

    # Save settings
    # config['last_run'] = to_string
    # util.saveJSONFile(config, 'settings.json')

    return
Esempio n. 4
0
def main():
    # Settings
    config = util.loadJSONFile('settings.json')

    if 'last_run' not in config:
        print "Last run datetime not found in settings.json. Exiting."
        sys.exit()

    # Create from and to strings
    # from_string = config['last_run']
    # to_string = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S.0Z")
    from_string =   "2015-01-01T15:00:00.0Z"
    to_string   =   "2015-01-06T16:05:00.0Z"

    from_string = "2015-01-06T16:00:00.0Z"
    to_string = "2015-01-06T16:05:00.0Z"

    # from_string =   "2015-01-01T15:00:00.0Z"
    # to_string   =   "2015-01-06T16:05:00.0Z"

    from_string =   "2015-03-15T23:21:15.567Z"
    to_string   =   "2015-05-30T23:21:15.567Z"

    # Load scimeta cache
    cache_dir = "/Users/mecum/src/d1dump/documents/"
    identifier_map = util.createIdentifierMap("/Users/mecum/src/d1dump/idents.csv")
    print "Read in %d identifier mappings." % len(identifier_map)

    # Load formats map
    print "Loading formats map from GitHub..."
    formats_map = util.loadFormatsMap()
    print "Loaded %d format URIs from GitHub." % len(formats_map)

    # Load triple stores
    namespaces = {
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dcterms": "http://purl.org/dc/terms/",
        "datacite": "http://purl.org/spar/datacite/",
        "owl": "http://www.w3.org/2002/07/owl#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "glview": "http://schema.geolink.org/dev/view/",
        "d1people": "https://dataone.org/person/",
        "d1org": "https://dataone.org/organization/",
        "d1resolve": "https://cn.dataone.org/cn/v1/resolve/",
        "prov": "http://www.w3.org/ns/prov#",
        "d1node": "https://cn.dataone.org/cn/v1/node/",
        "d1landing": "https://search.dataone.org/#view/",
        "d1repo": "https://cn.dataone.org/cn/v1/node/"
    }

    store_dict = {
        'people': store.Store("http://*****:*****@name='identifier']").text
            print "Adding dataset for %s. " % identifier

            # Skip if it's already in the datasets graph
            if stores.datasetExists(identifier):
                print "Dataset %s already in graph. Continuing." % identifier
                # continue

            # continue
            scimeta = dataone.getScientificMetadata(identifier, identifier_map, cache_dir, cache=True)

            if scimeta is None:
                print "Unable to get scimeta for %s. Skipping." % identifier
                continue

            records = processing.extractCreators(identifier, scimeta)

            # Add records and organizations
            people = [p for p in records if 'type' in p and p['type'] == 'person']
            organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

            # Always do organizations first, so peoples' organization URIs exist
            for organization in organizations:
                organization = vld.validate(organization)
                stores.addOrganization(organization)

            for person in people:
                person = vld.validate(person)
                stores.addPerson(person)

            stores.addDataset(doc, scimeta, formats_map)


    stores.save()

    # Save settings
    # config['last_run'] = to_string
    # util.saveJSONFile(config, 'settings.json')

    return
Esempio n. 5
0
    }

    stores = multi_store.MultiStore(stores, namespaces)
    stores.clear()


    # Establish which fields we want to get from the Solr index
    fields = ["identifier","title","abstract","author",
    "authorLastName", "origin","submitter","rightsHolder","documents",
    "resourceMap","authoritativeMN","obsoletes","northBoundCoord",
    "eastBoundCoord","southBoundCoord","westBoundCoord","startDate","endDate",
    "datasource","replicaMN","resourceMap"]

    vld = validator.Validator()

    scimeta = dataone.getScientificMetadata(identifier, cache=True)
    doc = dataone.getSolrIndex(identifier, fields)
    records = processing.extractCreators(identifier, scimeta)

    print records

    # Add records and organizations
    people = [p for p in records if 'type' in p and p['type'] == 'person']
    organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

    # Always do organizations first, so peoples' organization URIs exist
    for organization in organizations:
        organization = vld.validate(organization)
        stores.addOrganization(organization)

    for person in people: