vld = validator.Validator()

page_xml = util.getXML(query)
documents = page_xml.findall(".//doc")

for doc in documents:
    identifier = doc.find(".//str[@name='identifier']").text
    print identifier

    scimeta = dataone.getScientificMetadata(identifier, cache=True)

    if scimeta is None:
        continue

    records = processing.extractCreators(identifier, scimeta)

    # Add records and organizations
    people = [p for p in records if 'type' in p and p['type'] == 'person']
    organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

    # Always do organizations first, so peoples' organization URIs exist
    for organization in organizations:
        organization = vld.validate(organization)
        stores.addOrganization(organization)

    for person in people:
        person = vld.validate(person)
        stores.addPerson(person)

    stores.addDataset(doc, scimeta, formats_map)
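# --- Illustration (not part of the original sources) ------------------------
# The identifier lookup above uses ElementTree XPath over a Solr XML response.
# A tiny stand-in document (the PID is made up) showing the same two queries:
import xml.etree.ElementTree as ET

page_xml = ET.fromstring(
    "<response><result>"
    "<doc><str name='identifier'>doi:10.XXXX/example</str></doc>"
    "</result></response>")

for doc in page_xml.findall(".//doc"):
    print doc.find(".//str[@name='identifier']").text   # prints: doi:10.XXXX/example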
def addDataset(self, identifier, doc=None): """Adds a dataset to the graph. Parameters: ----------- identifier : str Non-urlencoded DataOne identifier doc : XML Element An XML element containing a result from the Solr index which contains a number of fields relating to a dataset. Returns: None """ if self.model is not None: raise Exception( "Model existed when addDataset was called. This means the last Model wasn't cleaned up after finishing." ) self.createModel() # Get Solr fields if they weren't passed in if doc is None: doc = dataone.getSolrIndexFields(identifier) identifier = dataone.extractDocumentIdentifier(doc) identifier_esc = urllib.unquote(identifier).decode('utf8') dataset_node = RDF.Uri(self.graph.ns['d1dataset'] + identifier_esc) self.add(dataset_node, 'rdf:type', 'geolink:Dataset') # Delete if dataset is already in graph if self.datasetExists(identifier): logging.info( "Dataset with identifier %s already exists. Deleting then re-adding.", identifier) self.deleteDataset(identifier) scimeta = dataone.getScientificMetadata(identifier) records = processing.extractCreators(identifier, scimeta) vld = validator.Validator() # Add Dataset triples first, we'll use them when we add people # to match to existing people by the current dataset's 'obsoletes' field self.addDatasetTriples(dataset_node, doc) # Add people and organizations people = [p for p in records if 'type' in p and p['type'] == 'person'] organizations = [ o for o in records if 'type' in o and o['type'] == 'organization' ] # Always do organizations first, so peoples' organization URIs exist for organization in organizations: organization = vld.validate(organization) self.addOrganization(organization) for person in people: person = vld.validate(person) self.addPerson(person) # Commit or reject the model here if self.model is None: raise Exception( "Model was None. It should have been an RDF.Model.") self.insertModel() self.model = None # Remove the model since we're done return
def addDataset(self, identifier, doc=None): """Adds a dataset to the repository. Parameters: ----------- identifier : str Non-urlencoded DataOne identifier doc : XML Element An XML element containing a result from the Solr index which contains a number of fields relating to a dataset. """ if self.model is not None: raise Exception("Model existed when addDataset was called. This means the last Model wasn't cleaned up after finishing.") self.createModel() # Get Solr fields if they weren't passed in if doc is None: doc = dataone.getSolrIndexFields(identifier) identifier = dataone.extractDocumentIdentifier(doc) identifier_esc = urllib.quote_plus(identifier) dataset_node = RDF.Uri(self.repository.ns['d1dataset'] + identifier_esc) self.add(dataset_node, 'rdf:type', 'geolink:Dataset') # Delete if dataset is already in graph if self.datasetExists(identifier): logging.info("Dataset with identifier %s already exists. Deleting then re-adding.", identifier) self.deleteDataset(identifier) scimeta = dataone.getScientificMetadata(identifier) records = processing.extractCreators(identifier, scimeta) vld = validator.Validator() # Add Dataset triples first, we'll use them when we add people # to match to existing people by the current dataset's 'obsoletes' field self.addDatasetTriples(dataset_node, doc) # Add people and organizations people = [p for p in records if 'type' in p and p['type'] == 'person'] organizations = [o for o in records if 'type' in o and o['type'] == 'organization'] # Always do organizations first, so peoples' organization URIs exist for organization in organizations: organization = vld.validate(organization) self.addOrganization(organization) for person in people: person = vld.validate(person) self.addPerson(person) # Commit or reject the model here if self.model is None: raise Exception("Model was None. It should have been an RDF.Model.") self.insertModel() self.model = None # Remove the model since we're done
def main():
    # Settings
    config = util.loadJSONFile('settings.json')

    if 'last_run' not in config:
        print "Last run datetime not found in settings.json. Exiting."
        sys.exit()

    # Create from and to strings
    # from_string = config['last_run']
    # to_string = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S.0Z")
    from_string = "2015-01-01T15:00:00.0Z"
    to_string = "2015-01-06T16:05:00.0Z"
    from_string = "2015-01-06T16:00:00.0Z"
    to_string = "2015-01-06T16:05:00.0Z"
    # from_string = "2015-01-01T15:00:00.0Z"
    # to_string = "2015-01-06T16:05:00.0Z"
    from_string = "2015-03-15T23:21:15.567Z"
    to_string = "2015-05-30T23:21:15.567Z"

    # Load scimeta cache
    cache_dir = "/Users/mecum/src/d1dump/documents/"
    identifier_map = util.createIdentifierMap("/Users/mecum/src/d1dump/idents.csv")
    print "Read in %d identifier mappings." % len(identifier_map)

    # Load formats map
    print "Loading formats map from GitHub..."
    formats_map = util.loadFormatsMap()
    print "Loaded %d format URIs from GitHub." % len(formats_map)

    # Load triple stores
    namespaces = {
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dcterms": "http://purl.org/dc/terms/",
        "datacite": "http://purl.org/spar/datacite/",
        "owl": "http://www.w3.org/2002/07/owl#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "glview": "http://schema.geolink.org/dev/view/",
        "d1people": "https://dataone.org/person/",
        "d1org": "https://dataone.org/organization/",
        "d1resolve": "https://cn.dataone.org/cn/v1/resolve/",
        "prov": "http://www.w3.org/ns/prov#",
        "d1node": "https://cn.dataone.org/cn/v1/node/",
        "d1landing": "https://search.dataone.org/#view/",
        "d1repo": "https://cn.dataone.org/cn/v1/node/"
    }

    store_dict = {
        'people': store.Store("http://*****:*****

            identifier = doc.find(".//str[@name='identifier']").text
            print "Adding dataset for %s. " % identifier

            # Skip if it's already in the datasets graph
            if stores.datasetExists(identifier):
                print "Dataset %s already in graph. Continuing." % identifier
                # continue
                # continue

            scimeta = dataone.getScientificMetadata(identifier, identifier_map, cache_dir, cache=True)

            if scimeta is None:
                print "Unable to get scimeta for %s. Skipping." % identifier
                continue

            records = processing.extractCreators(identifier, scimeta)

            # Add records and organizations
            people = [p for p in records if 'type' in p and p['type'] == 'person']
            organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

            # Always do organizations first, so peoples' organization URIs exist
            for organization in organizations:
                organization = vld.validate(organization)
                stores.addOrganization(organization)

            for person in people:
                person = vld.validate(person)
                stores.addPerson(person)

            stores.addDataset(doc, scimeta, formats_map)

    stores.save()

    # Save settings
    # config['last_run'] = to_string
    # util.saveJSONFile(config, 'settings.json')

    return
def main():
    # Settings
    config = util.loadJSONFile('settings.json')

    if 'last_run' not in config:
        print "Last run datetime not found in settings.json. Exiting."
        sys.exit()

    # Create from and to strings
    # from_string = config['last_run']
    # to_string = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S.0Z")
    from_string = "2015-01-01T15:00:00.0Z"
    to_string = "2015-01-06T16:05:00.0Z"
    from_string = "2015-01-06T16:00:00.0Z"
    to_string = "2015-01-06T16:05:00.0Z"
    # from_string = "2015-01-01T15:00:00.0Z"
    # to_string = "2015-01-06T16:05:00.0Z"
    from_string = "2015-03-15T23:21:15.567Z"
    to_string = "2015-05-30T23:21:15.567Z"

    # Load scimeta cache
    cache_dir = "/Users/mecum/src/d1dump/documents/"
    identifier_map = util.createIdentifierMap("/Users/mecum/src/d1dump/idents.csv")
    print "Read in %d identifier mappings." % len(identifier_map)

    # Load formats map
    print "Loading formats map from GitHub..."
    formats_map = util.loadFormatsMap()
    print "Loaded %d format URIs from GitHub." % len(formats_map)

    # Load triple stores
    namespaces = {
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dcterms": "http://purl.org/dc/terms/",
        "datacite": "http://purl.org/spar/datacite/",
        "owl": "http://www.w3.org/2002/07/owl#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "glview": "http://schema.geolink.org/dev/view/",
        "d1people": "https://dataone.org/person/",
        "d1org": "https://dataone.org/organization/",
        "d1resolve": "https://cn.dataone.org/cn/v1/resolve/",
        "prov": "http://www.w3.org/ns/prov#",
        "d1node": "https://cn.dataone.org/cn/v1/node/",
        "d1landing": "https://search.dataone.org/#view/",
        "d1repo": "https://cn.dataone.org/cn/v1/node/"
    }

    graph_dict = {
        'people': graph.Graph("http://virtuoso/", "8890", 'geolink', namespaces),
        'organizations': graph.Graph("http://virtuoso/", "8890", 'geolink', namespaces),
        'datasets': graph.Graph("http://virtuoso/", "8890", 'geolink', namespaces)
    }

    graphs = multi_store.MultiStore(graph_dict, namespaces)

    # Create a record validator
    vld = validator.Validator()

    query_string = dataone.createSinceQuery(from_string, to_string, None, 0)
    num_results = dataone.getNumResults(query_string)

    # Calculate the number of pages we need to get to get all results
    page_size = 1000
    num_pages = num_results / page_size
    if num_results % page_size > 0:
        num_pages += 1

    # Establish which fields we want to get from the Solr index
    fields = [
        "identifier", "title", "abstract", "author", "authorLastName",
        "origin", "submitter", "rightsHolder", "documents", "resourceMap",
        "authoritativeMN", "obsoletes", "northBoundCoord", "eastBoundCoord",
        "southBoundCoord", "westBoundCoord", "startDate", "endDate",
        "datasource", "replicaMN", "resourceMap"
    ]

    print "Found %d documents over %d page(s)." % (num_results, num_pages)

    sys.exit()

    # Process each page
    for page in range(1, num_pages + 1):
        print "Processing page %d." % page

        page_xml = dataone.getSincePage(from_string, to_string, fields, page, page_size)
        docs = page_xml.findall(".//doc")

        for doc in docs:
            identifier = doc.find("./str[@name='identifier']").text
            print "Adding dataset for %s. " % identifier

            # Skip if it's already in the datasets graph
            if graphs.datasetExists(identifier):
                print "Dataset %s already in graph. Continuing." % identifier
                # continue
                # continue

            scimeta = dataone.getScientificMetadata(identifier, identifier_map, cache_dir, cache=True)

            if scimeta is None:
                print "Unable to get scimeta for %s. Skipping." % identifier
                continue

            records = processing.extractCreators(identifier, scimeta)

            # Add records and organizations
            people = [p for p in records if 'type' in p and p['type'] == 'person']
            organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

            # Always do organizations first, so peoples' organization URIs exist
            for organization in organizations:
                organization = vld.validate(organization)
                graphs.addOrganization(organization)

            for person in people:
                person = vld.validate(person)
                graphs.addPerson(person)

            graphs.addDataset(doc, scimeta, formats_map)

    graphs.save()

    # Save settings
    # config['last_run'] = to_string
    # util.saveJSONFile(config, 'settings.json')

    return
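# --- Illustration (not part of the original sources) ------------------------
# The page-count calculation in main() above rounds num_results / page_size up
# using Python 2 integer division plus a remainder check. With made-up numbers:
num_results = 2500   # illustrative; the real value comes from dataone.getNumResults()
page_size = 1000

num_pages = num_results / page_size   # integer division -> 2
if num_results % page_size > 0:
    num_pages += 1                    # remainder of 500 -> 3 pages in total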
stores = multi_store.MultiStore(stores, namespaces)
stores.clear()

# Establish which fields we want to get from the Solr index
fields = ["identifier", "title", "abstract", "author", "authorLastName",
          "origin", "submitter", "rightsHolder", "documents",
          "resourceMap", "authoritativeMN", "obsoletes", "northBoundCoord",
          "eastBoundCoord", "southBoundCoord", "westBoundCoord", "startDate", "endDate",
          "datasource", "replicaMN", "resourceMap"]

vld = validator.Validator()

scimeta = dataone.getScientificMetadata(identifier, cache=True)
doc = dataone.getSolrIndex(identifier, fields)

records = processing.extractCreators(identifier, scimeta)
print records

# Add records and organizations
people = [p for p in records if 'type' in p and p['type'] == 'person']
organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

# Always do organizations first, so peoples' organization URIs exist
for organization in organizations:
    organization = vld.validate(organization)
    stores.addOrganization(organization)

for person in people:
    person = vld.validate(person)
    stores.addPerson(person)
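# --- Illustration (not part of the original sources) ------------------------
# The person/organization split used throughout these snippets keys off a
# 'type' field on each creator record returned by processing.extractCreators().
# The records below are stand-ins: only the 'type' key is established by the
# surrounding code; the 'name' key is hypothetical.
records = [
    {'type': 'person', 'name': 'Jane Doe'},
    {'type': 'organization', 'name': 'Example Data Center'},
    {'name': 'untyped record'},   # matches neither filter, so it is skipped
]

people = [p for p in records if 'type' in p and p['type'] == 'person']
organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

assert len(people) == 1 and len(organizations) == 1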