Example #1
def getIdentifiersSince(from_string,
                        to_string,
                        fields=None,
                        page=1,
                        page_size=1000):
    """Query page `page` of the Solr index using and retrieve the PIDs
    of the documents in the response.
    """
    start = (page - 1) * page_size
    # Pass page_size through so the computed `start` matches the rows requested
    query_string = createSinceQueryURL(from_string,
                                       to_string,
                                       fields=fields,
                                       start=start,
                                       page_size=page_size)

    query_xml = util.getXML(query_string)
    identifiers = query_xml.findall(".//str[@name='identifier']")

    # findall() returns a list (possibly empty), never None
    if not identifiers:
        return []

    identifier_strings = []

    for identifier in identifiers:
        if identifier.text is not None:
            identifier_strings.append(identifier.text)

    return identifier_strings
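A minimal paging sketch built on this function; the date strings and page size are illustrative assumptions, and the loop stops when a short page signals the last batch:

# Hypothetical driver: harvest every PID modified in a window by walking pages
from_string = "2015-01-01T00:00:00Z"   # assumed Solr date format
to_string = "2015-02-01T00:00:00Z"
page = 1
page_size = 1000
all_identifiers = []

while True:
    identifiers = getIdentifiersSince(from_string, to_string,
                                      page=page, page_size=page_size)
    all_identifiers.extend(identifiers)

    # A page shorter than page_size means we've reached the last page
    if len(identifiers) < page_size:
        break

    page += 1

print "Harvested %d identifiers." % len(all_identifiers)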
Example #2
def getScientificMetadata(identifier, cache=False):
    """Gets the scientific metadata for an identifier.

    In development, I'm keeping a cache of documents in the root of the
    d1lod folder at ./cache. This will need to be removed in
    production. This is toggled with the argument `cache`.

    Arguments:
        identifier: str
            PID of the document

        cache: bool
            Whether to cache files in the current working directory

    Returns:
        An XML document
    """

    scimeta = None
    cache_filename = None
    cache_filepath = None

    # Try from cache first
    if cache is True:
        if not os.path.exists("./cache"):
            os.mkdir("./cache")

        if not os.path.exists("./cache/object/"):
            os.mkdir("./cache/object")

        cache_filename = base64.urlsafe_b64encode(identifier)
        cache_filepath = './cache/object/' + cache_filename

        if os.path.isfile(cache_filepath):
            print "Loading scientific metadata for %s from cache." % identifier

            scimeta = ET.parse(cache_filepath)

            if scimeta is not None:
                scimeta = scimeta.getroot()

    # Return cached copy if we successfully got it
    if scimeta is not None:
        return scimeta

    query_string = "https://cn.dataone.org/cn/v1/object/%s" % urllib.quote_plus(
        identifier)
    scimeta = util.getXML(query_string)

    # Cache what we found for next time. The write path must match the
    # ./cache/object/ directory the cache is read from above.
    if scimeta is not None and cache is True:
        if cache_filepath is None:
            if cache_filename is None:
                cache_filename = base64.urlsafe_b64encode(identifier)
            cache_filepath = './cache/object/' + cache_filename
        with open(cache_filepath, "wb") as f:
            f.write(ET.tostring(scimeta))

    return scimeta
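One detail worth calling out: PIDs such as DOIs contain characters like ':' and '/' that are unsafe in filenames, which is why the cache key is the URL-safe base64 encoding of the identifier. A small sketch of the round trip, using a hypothetical PID:

import base64

identifier = "doi:10.5063/F1234"              # hypothetical PID
cache_filename = base64.urlsafe_b64encode(identifier)
print cache_filename                          # flat, filesystem-safe key

# The encoding is reversible, so a cache entry maps back to its PID
print base64.urlsafe_b64decode(cache_filename)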
Example #3
def getSincePage(from_string, to_string, page=1, page_size=1000, fields=None):
    """Get a page off the Solr index for a query between two time periods."""

    start = (page-1) * page_size
    query_string = createSinceQueryURL(from_string, to_string, start=start, page_size=page_size)
    query_xml = util.getXML(query_string)

    return query_xml
Example #4
def getSincePage(from_string, to_string, page=1, page_size=1000, fields=None):
    """Get a page off the Solr index for a query between two time periods."""

    start = (page - 1) * page_size
    # Pass `fields` through so the caller's field list is honored
    query_string = createSinceQueryURL(from_string,
                                       to_string,
                                       fields=fields,
                                       start=start,
                                       page_size=page_size)
    query_xml = util.getXML(query_string)

    return query_xml
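A short sketch of consuming a page returned by this function, assuming util.getXML yields an ElementTree element as the other examples here do; the date strings are illustrative:

page_xml = getSincePage("2015-01-01T00:00:00Z",
                        "2015-02-01T00:00:00Z",
                        page=1,
                        page_size=100)

# Pull individual documents out of the page with the same XPath-style
# queries used elsewhere in this module
for doc in page_xml.findall(".//doc"):
    identifier = doc.find(".//str[@name='identifier']")

    if identifier is not None:
        print identifier.text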
Example #5
def getNumResults(query):
    """Performs a query and extracts just the number of results in the query."""

    num_results = -1

    xmldoc = util.getXML(query)
    result_node = xmldoc.findall(".//result")

    # findall() returns a list (possibly empty), never None
    if len(result_node) > 0:
        num_results = result_node[0].get('numFound')

    return int(num_results)
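Combined with a page size, the count this function returns gives the number of pages to request. A minimal sketch; the count-only query URL (rows=0) is an illustrative assumption:

import math

query = "https://cn.dataone.org/cn/v1/query/solr/?q=*:*&rows=0"  # hypothetical query
page_size = 1000

num_results = getNumResults(query)
num_pages = int(math.ceil(num_results / float(page_size)))

print "%d results across %d pages." % (num_results, num_pages)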
Example #6
def getScientificMetadata(identifier, cache=True):
    """Gets the scientific metadata for an identifier.

    In development, I'm keeping a cache of documents in the root of the
    d1lod folder at ./cache. This will need to be removed in
    production. This is toggled with the argument `cache`.

    Arguments:
        identifier: str
            PID of the document

        cache: bool
            Whether to cache files in the current working directory

    Returns:
        An XML document
    """

    scimeta = None

    # Try from cache first
    if cache is True:
        if not os.path.exists("./cache"):
            os.mkdir("./cache")

        if not os.path.exists("./cache/object/"):
            os.mkdir("./cache/object")

        cache_filename = base64.urlsafe_b64encode(identifier)
        cache_filepath = './cache/object/' + cache_filename

        if os.path.isfile(cache_filepath):
            print "Loading scientific metadata for %s from cache." % identifier

            scimeta = ET.parse(cache_filepath)

            if scimeta is not None:
                scimeta = scimeta.getroot()

    # Return cached copy if we successfully got it
    if scimeta is not None:
        return scimeta

    query_string = "https://cn.dataone.org/cn/v1/object/%s" % urllib.quote_plus(identifier)
    scimeta = util.getXML(query_string)

    # Cache what we found for next time
    if scimeta is not None and cache is True:
        with open(cache_filepath, "wb") as f:
            f.write(ET.tostring(scimeta))

    return scimeta
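A minimal call sketch; the PID is a hypothetical example, and ET is assumed to be xml.etree.ElementTree as in the function body:

import xml.etree.ElementTree as ET

scimeta = getScientificMetadata("doi:10.5063/AA/knb.92.1")  # hypothetical PID

if scimeta is None:
    print "No scientific metadata found."
else:
    # The root tag depends on the metadata standard (EML, FGDC, etc.)
    print scimeta.tag
    print ET.tostring(scimeta)[:200]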
Example #7
def getSolrIndexFields(identifier, fields=None):
    """Gets a single document off the Solr index by searching for its identifier."""

    # Escape colons first, then urlencode
    identifier_esc = identifier.replace(':', '\\:')
    identifier_esc = urllib.quote_plus(identifier_esc)

    if fields is None:
        fields = getDefaultSolrIndexFields()

    query_string = "http://cn.dataone.org/cn/v1/query/solr/?fl=" + ",".join(fields) + "&q=id:" + identifier_esc + "&rows=1&start=0"
    query_xml = util.getXML(query_string)

    return query_xml.find(".//doc")
Example #8
def getSolrIndexFields(identifier, fields=None):
    """Gets a single document off the Solr index by searching for its identifier."""

    # Escape colons first, then urlencode
    identifier_esc = identifier.replace(':', '\\:')
    identifier_esc = urllib.quote_plus(identifier_esc)

    if fields is None:
        fields = getDefaultSolrIndexFields()

    query_string = "http://cn.dataone.org/cn/v1/query/solr/?fl=" + ",".join(
        fields) + "&q=id:" + identifier_esc + "&rows=1&start=0"
    query_xml = util.getXML(query_string)

    return query_xml.find(".//doc")
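The two-step escaping matters because a bare colon delimits field:value pairs in Solr; an unescaped PID like doi:10.1000/182 would be parsed as a query against a field named doi. A small sketch of the transformation, using a hypothetical PID:

import urllib

identifier = "doi:10.1000/182"                # hypothetical PID

# Step 1: backslash-escape colons so Solr reads them as literal characters
identifier_esc = identifier.replace(':', '\\:')

# Step 2: percent-encode so the value is safe inside the query URL
identifier_esc = urllib.quote_plus(identifier_esc)

print identifier_esc                          # doi%5C%3A10.1000%2F182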
Example #9
def getIdentifiersSince(from_string, to_string, fields=None, page=1, page_size=1000):
    """Query page `page` of the Solr index using and retrieve the PIDs
    of the documents in the response.
    """
    start = (page - 1) * page_size
    # Pass page_size through so the computed `start` matches the rows requested
    query_string = createSinceQueryURL(from_string, to_string, fields=fields, start=start, page_size=page_size)

    query_xml = util.getXML(query_string)
    identifiers = query_xml.findall(".//str[@name='identifier']")

    # findall() returns a list (possibly empty), never None
    if not identifiers:
        return []

    identifier_strings = []

    for identifier in identifiers:
        if identifier.text is not None:
            identifier_strings.append(identifier.text)

    return identifier_strings
Example #10
    }

    # Load triple stores
    stores = {
        'people': store.Store("http://virtuoso/", "8890", 'geolink', namespaces),
        'organizations': store.Store("http://virtuoso/", "8890", 'geolink', namespaces),
        'datasets': store.Store("http://virtuoso/", "8890", 'geolink', namespaces)
    }

    for store_name in stores:
        stores[store_name].delete_all()

    stores = multi_store.MultiStore(stores, namespaces)
    vld = validator.Validator()

    page_xml = util.getXML(query)
    documents = page_xml.findall(".//doc")

    for doc in documents:
        identifier = doc.find(".//str[@name='identifier']").text

        print identifier

        scimeta = dataone.getScientificMetadata(identifier, cache=True)

        if scimeta is None:
            continue

        records = processing.extractCreators(identifier, scimeta)

        # Add records and organizations