def getIdentifiersSince(from_string, to_string, fields=None, page=1, page_size=1000):
    """Query page `page` of the Solr index and retrieve the PIDs of the
    documents in the response.
    """

    start = (page - 1) * page_size
    query_string = createSinceQueryURL(from_string, to_string,
                                       fields=fields, start=start)
    query_xml = util.getXML(query_string)
    identifiers = query_xml.findall(".//str[@name='identifier']")

    if identifiers is None:
        return []

    identifier_strings = []

    for identifier in identifiers:
        if identifier.text is not None:
            identifier_strings.append(identifier.text)

    return identifier_strings
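# The sketch below is not part of the original module: it shows one way the
# paging parameters of getIdentifiersSince() could be used to walk an entire
# result set, requesting pages until an empty page comes back.
def exampleHarvestIdentifiers(from_string, to_string):
    identifiers = []
    page = 1

    while True:
        page_identifiers = getIdentifiersSince(from_string, to_string, page=page)

        # An empty page means we've run past the last result
        if len(page_identifiers) == 0:
            break

        identifiers.extend(page_identifiers)
        page += 1

    return identifiers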
def getScientificMetadata(identifier, cache=False):
    """Gets the scientific metadata for an identifier.

    In development, I'm keeping a cache of documents in the root of the d1lod
    folder at ./cache. This will need to be removed in production. This is
    toggled with the argument `cache`.

    Arguments:
        identifier: str
            PID of the document

        cache: bool
            Whether to cache files in the current working directory

    Returns:
        An XML document
    """

    scimeta = None
    cache_filepath = None

    # Try from cache first
    if cache is True:
        if not os.path.exists("./cache"):
            os.mkdir("./cache")

        if not os.path.exists("./cache/object/"):
            os.mkdir("./cache/object")

        cache_filename = base64.urlsafe_b64encode(identifier)
        cache_filepath = './cache/object/' + cache_filename

        if os.path.isfile(cache_filepath):
            print "Loading scientific metadata for %s from cache." % identifier
            scimeta = ET.parse(cache_filepath)

            if scimeta is not None:
                scimeta = scimeta.getroot()

    # Return cached copy if we successfully got it
    if scimeta is not None:
        return scimeta

    query_string = "https://cn.dataone.org/cn/v1/object/%s" % urllib.quote_plus(identifier)
    scimeta = util.getXML(query_string)

    # Cache what we found for next time, writing to the same ./cache/object/
    # path we checked above
    if scimeta is not None and cache is True:
        with open(cache_filepath, "wb") as f:
            f.write(ET.tostring(scimeta))

    return scimeta
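# A minimal usage sketch, not part of the original module: fetch science
# metadata for a PID and report what came back. The identifier below is a
# made-up example.
def exampleGetScimeta():
    identifier = "doi:10.XXXX/EXAMPLE"  # hypothetical PID, for illustration only
    scimeta = getScientificMetadata(identifier, cache=False)

    if scimeta is None:
        print "No scientific metadata found for %s." % identifier
    else:
        print "Retrieved a <%s> document for %s." % (scimeta.tag, identifier)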
def getSincePage(from_string, to_string, page=1, page_size=1000, fields=None):
    """Get a page off the Solr index for a query between two time periods."""

    start = (page - 1) * page_size
    query_string = createSinceQueryURL(from_string, to_string, fields=fields,
                                       start=start, page_size=page_size)
    query_xml = util.getXML(query_string)

    return query_xml
def getNumResults(query):
    """Performs a query and extracts just the number of results in the query."""

    num_results = -1

    xmldoc = util.getXML(query)
    result_node = xmldoc.findall(".//result")

    # findall() returns a (possibly empty) list, so guard on its length rather
    # than comparing against None
    if len(result_node) > 0:
        num_results = result_node[0].get('numFound')

    return int(num_results)
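# A minimal sketch, not part of the original module, showing how the count
# from getNumResults() can be turned into the number of pages to request with
# getSincePage()/getIdentifiersSince(). `query` is assumed to be a full Solr
# query URL, such as one produced by createSinceQueryURL().
import math

def exampleCountPages(query, page_size=1000):
    num_results = getNumResults(query)

    # Round up so a partial final page is still requested
    return int(math.ceil(num_results / float(page_size)))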
def getSolrIndexFields(identifier, fields=None):
    """Gets a single document off the Solr index by searching for its identifier."""

    # Escape colons first, then urlencode
    identifier_esc = identifier.replace(':', '\\:')
    identifier_esc = urllib.quote_plus(identifier_esc)

    if fields is None:
        fields = getDefaultSolrIndexFields()

    query_string = "http://cn.dataone.org/cn/v1/query/solr/?fl=" + ",".join(fields) + \
                   "&q=id:" + identifier_esc + "&rows=1&start=0"
    query_xml = util.getXML(query_string)

    return query_xml.find(".//doc")
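# A minimal usage sketch, not part of the original module: look up the Solr
# document for a PID and print the name and value of each of its string
# fields. The identifier is a made-up example.
def examplePrintSolrFields():
    doc = getSolrIndexFields("doi:10.XXXX/EXAMPLE")  # hypothetical PID

    if doc is None:
        print "Identifier was not found in the Solr index."
        return

    for field in doc.findall("./str"):
        print "%s: %s" % (field.get('name'), field.text)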
}

# Load triple stores
stores = {
    'people': store.Store("http://virtuoso/", "8890", 'geolink', namespaces),
    'organizations': store.Store("http://virtuoso/", "8890", 'geolink', namespaces),
    'datasets': store.Store("http://virtuoso/", "8890", 'geolink', namespaces)
}

for store_name in stores:
    stores[store_name].delete_all()

stores = multi_store.MultiStore(stores, namespaces)
vld = validator.Validator()

page_xml = util.getXML(query)
documents = page_xml.findall(".//doc")

for doc in documents:
    identifier = doc.find(".//str[@name='identifier']").text
    print identifier

    scimeta = dataone.getScientificMetadata(identifier, cache=True)

    if scimeta is None:
        continue

    records = processing.extractCreators(identifier, scimeta)

    # Add records and organizations