Example #1
def patch_well_known_namespaces(etree_module):
    """Monkey patches the etree module to add some well-known namespaces."""
    from owslib.namespaces import Namespaces

    ns = Namespaces()
    etree_module._namespace_map.update(ns.get_namespaces())
Example #2
def patch_well_known_namespaces(etree_module):
    """Monkey patches the etree module to add some well-known namespaces."""
    import warnings

    import six
    from owslib.namespaces import Namespaces

    ns = Namespaces()
    try:
        register_namespace = etree_module.register_namespace
    except AttributeError:
        try:
            # probe for the legacy fallback attribute; raises AttributeError if missing
            etree_module._namespace_map

            def register_namespace(prefix, uri):
                etree_module._namespace_map[uri] = prefix
        except AttributeError:

            def register_namespace(prefix, uri):
                pass

            warnings.warn(
                "Only 'lxml.etree' >= 2.3 and 'xml.etree.ElementTree' >= 1.3 are fully supported!"
            )

    for k, v in six.iteritems(ns.get_namespaces()):
        register_namespace(k, v)
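
A minimal usage sketch for the two variants above (not part of the examples; it assumes xml.etree.ElementTree >= 1.3, so register_namespace exists, and that the six package is installed):

import xml.etree.ElementTree as ET

patch_well_known_namespaces(ET)    # registers prefixes such as 'gml', 'ows', 'xlink'

elem = ET.Element('{http://www.opengis.net/gml}Point')
print(ET.tostring(elem).decode())  # serialized with the 'gml' prefix instead of 'ns0'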
Example #3
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(
        ["fes", "ogc", "om", "gml32", "sa", "sml", "swe20", "swes", "xlink"])
    ns["ows"] = n.get_namespace("ows110")
    ns["sos"] = n.get_namespace("sos20")
    return ns
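
For reference, a quick sanity check of what a helper like this returns (the values are the standard OGC namespace URIs that owslib.namespaces.Namespaces registers under these keys):

ns = get_namespaces()
assert ns['sos'] == 'http://www.opengis.net/sos/2.0'     # from the 'sos20' key
assert ns['ows'] == 'http://www.opengis.net/ows/1.1'     # from the 'ows110' key
assert ns['gml32'] == 'http://www.opengis.net/gml/3.2'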
Example #4
def get_namespaces(self):
    """
    Return the sml/gml/xlink/swe namespaces plus the 'ism' namespace.
    """
    n = Namespaces()
    namespaces = n.get_namespaces(["sml", "gml", "xlink", "swe"])
    namespaces["ism"] = "urn:us:gov:ic:ism:v2"
    return namespaces
Example #5
def patch_well_known_namespaces(etree_module):
    """Monkey patches the etree module to add some well-known namespaces."""
    from owslib.namespaces import Namespaces

    ns = Namespaces()
    for k, v in ns.get_namespaces().items():
        etree_module.register_namespace(k, v)
Example #6
def get_namespaces():
    """
    Returns specified namespaces using owslib Namespaces function.
    """
    n = Namespaces()
    ns = n.get_namespaces(
        ["gco", "gmd", "gml", "gml32", "gmx", "gts", "srv", "xlink", "dc"])
    return ns
Example #7
def get_namespaces():
    """
    Returns specified namespaces using owslib Namespaces function.
    """
    n = Namespaces()
    ns = n.get_namespaces(
        ["gco", "gmd", "gml", "gml32", "gmx", "gts", "srv", "xlink", "dc"])
    return ns
Example #8
def __get_namespaces():
    """Get default namespaces from OWSLib, extended with the 'gfc' namespace
    to be able to parse feature catalogues."""
    n = Namespaces()
    ns = n.get_namespaces()
    ns[None] = n.get_namespace("gmd")
    ns['gfc'] = 'http://www.isotc211.org/2005/gfc'
    return ns
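
A brief, hypothetical usage note ('catalogue.xml' is a placeholder file name): dictionaries like the one returned above are normally passed as the namespaces argument of ElementTree/lxml search calls so that prefixed paths resolve, e.g.:

from owslib.etree import etree

ns = __get_namespaces()
tree = etree.parse('catalogue.xml')                        # placeholder ISO/GFC document
feature_types = tree.findall('.//gfc:FC_FeatureType', ns)  # 'gfc' prefix resolved via ns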
Example #9
def patch_well_known_namespaces(etree_module):
    """Monkey patches the etree module to add some well-known namespaces."""
    from owslib.namespaces import Namespaces

    ns = Namespaces()
    for k, v in ns.get_namespaces().items():
        etree_module.register_namespace(k, v)
Example #10
def get_namespaces_io100():
    n = Namespaces()
    ns = n.get_namespaces(["ogc","swes","sml","xlink","xsi"])
    ns["ows"] = n.get_namespace("ows110")
    ns["sos"] = n.get_namespace("sos20")
    ns["gml"] = n.get_namespace("gml32")
    ns["om"] = n.get_namespace("om20")
    ns['swe'] = 'http://www.opengis.net/swe/2.0'
    ns["sams"] = "http://www.opengis.net/samplingSpatial/2.0"
    ns["sf"] = "http://www.opengis.net/sampling/2.0"
    return ns
Example #11
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces([
        "fes", "gml32", "ogc", "om20", "sa", "sml", "swe20", "swes", "wml2",
        "xlink", "xsi"
    ])
    ns["gda"] = 'http://www.opengis.net/sosgda/1.0'
    ns["ns"] = "http://www.opengis.net/gml/3.2"
    ns["ows"] = n.get_namespace("ows110")
    ns["sams"] = "http://www.opengis.net/samplingSpatial/2.0"
    ns["sf"] = "http://www.opengis.net/sampling/2.0"
    ns["sos"] = n.get_namespace("sos20")
    return ns
Example #12
def patch_well_known_namespaces(etree_module):
    """Monkey patches the etree module to add some well-known namespaces."""

    ns = Namespaces()

    try:
        register_namespace = etree_module.register_namespace
    except AttributeError:
        # probe for the legacy fallback attribute; raises AttributeError if missing too
        etree_module._namespace_map

        def register_namespace(prefix, uri):
            etree_module._namespace_map[uri] = prefix

    for k, v in list(ns.get_namespaces().items()):
        register_namespace(k, v)
Example #13
def patch_well_known_namespaces(etree_module):
    """Monkey patches the etree module to add some well-known namespaces."""
    import warnings
    from owslib.namespaces import Namespaces

    ns = Namespaces()
    try:
        register_namespace = etree_module.register_namespace
    except AttributeError:
        try:
            # probe for the legacy fallback attribute; raises AttributeError if missing
            etree_module._namespace_map

            def register_namespace(prefix, uri):
                etree_module._namespace_map[uri] = prefix
        except AttributeError:
            def register_namespace(prefix, uri):
                pass
            warnings.warn("Only 'lxml.etree' >= 2.3 and 'xml.etree.ElementTree' >= 1.3 are fully supported!")

    for k, v in ns.get_namespaces().items():
        register_namespace(k, v)
Example #14
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces(["sml101","gml","xlink","swe101"])
Example #15
def get_namespaces():
    n = Namespaces()
    namespaces = n.get_namespaces(["sml","gml","xlink"])
    namespaces["ism"] = "urn:us:gov:ic:ism:v2"
    return namespaces
Example #16
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["atom", "dc", "gml", "gml32", "xlink"])
    ns.update(add_namespaces)
    ns[None] = n.get_namespace("atom")
    return ns
Example #17
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces(["sml101", "gml", "xlink", "swe101"])
Example #18
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["ogc"])
    ns[None] = n.get_namespace("ogc")
    return ns
Example #19
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces(["sml","gml","xlink"])
Example #20
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["gm03"])
    ns[None] = n.get_namespace("gm03")
    return ns
Example #21
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces()
Example #22
def get_namespaces():
    ns = Namespaces()
    return ns.get_namespaces(["om10", "swe101", "swe20", "gml311", "xlink"])
Example #23
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces()
Example #24
def get_namespaces():
    ns = Namespaces()
    return ns.get_namespaces(["om10", "swe101", "swe20", "gml311", "xlink"])
Example #25
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(
        ["gco", "gmd", "gml", "gml32", "gmx", "gts", "srv", "xlink"])
    ns[None] = n.get_namespace("gmd")
    return ns
Example #26
def get_namespaces():
    ns = Namespaces()
    return ns.get_namespaces(["swe20", "xlink"])
Example #27
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces("dif")
    ns[None] = n.get_namespace("dif")
    return ns
Example #28
File: gm03.py Project: Gustry/QGIS
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["gm03"])
    ns[None] = n.get_namespace("gm03")
    return ns
Example #29
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["ogc","sa","sml","gml","sos","swe","xlink"])
    ns["ows"] = n.get_namespace("ows110")
    return ns
Example #30
def get_namespaces():
    ns = Namespaces()
    return ns.get_namespaces(["swe20"])
Example #31
    def harvest(self, request):
        ## harvest (Harvester object, request = [community, source, verb, mdprefix, mdsubset])
        # Harvest all files with <mdprefix> and <mdsubset> from <source> via sickle module and store those to hard drive.
        #
        # Parameters:
        # -----------
        # (list)  request - A list with following items:
        #                    1. community
        #                    2. source (OAI URL)
        #                    3. verb (ListIdentifiers, ListRecords or JSONAPI)
        #                    4. mdprefix (OAI md format as oai_dc, iso etc.)
        #                    5. mdsubset
        #
        # Return Values:
        # --------------
        # 1. (integer)  is -1 if something went wrong    
    
    
        # create a request dictionary:
        req = {
            "community" : request[0],
            "url"   : request[1],
            "lverb" : request[2],
            "mdprefix"  : request[3],
            "mdsubset"  : request[4]   if len(request)>4 else None
        }
   
        # create dictionary with stats:
        resKeys=['count','tcount','ecount','time']
        results = dict.fromkeys(resKeys,0)

        stats = {
            "tottcount" : 0,    # total number of provided datasets
            "totcount"  : 0,    # total number of successful harvested datasets
            "totecount" : 0,    # total number of failed datasets
            "totdcount" : 0,    # total number of all deleted datasets
            "tcount"    : 0,    # number of all provided datasets per subset
            "count"     : 0,    # number of all successful harvested datasets per subset
            "ecount"    : 0,    # number of all failed datasets per subset
            "dcount"    : 0,    # number of all deleted datasets per subset
            
            "timestart" : time.time(),  # start time per subset process
        }
        
        # the gbif api client
        class GBIF_CLIENT(object):
        
            # call action api:
            ## GBIF.action('package_list',{})
        
            def __init__ (self, api_url): ##, api_key):
                self.api_url = api_url
                self.logger = logging.getLogger('root')
     
            def JSONAPI(self, action, offset, chunklen, key):
                ## JSONAPI (action) - method
                return self.__action_api(action, offset, chunklen, key)

            def __action_api (self, action, offset, chunklen, key):
                # Make the HTTP request for get datasets from GBIF portal
                response=''
                rvalue = 0
                ## offset = 0
                limit=chunklen ## None for DataCite-JSON !!!
                api_url = self.api_url
                if key :
                    action_url = "{apiurl}/{action}/{key}".format(apiurl=api_url,action=action,key=str(key))
                elif offset == None :
                    action_url = "{apiurl}/{action}".format(apiurl=api_url,action=action)	
                else :
                    action_url = "{apiurl}/{action}?offset={offset}&limit={limit}".format(apiurl=api_url,action=action,offset=str(offset),limit=str(limit))	

                self.logger.debug('action_url: %s' % action_url)
                try:
                    request = Request(action_url)
                    response = urlopen(request)
                except HTTPError as e:
                   self.logger.error('%s : The server %s couldn\'t fulfill the action %s.' % (e.code,self.api_url,action))
                   if ( e.code == 403 ):
                       self.logger.critical('Access forbidden, maybe the API key is not valid?')
                       exit(e.code)
                   elif ( e.code == 409):
                       self.logger.critical('Maybe you have a parameter error?')
                       return {"success" : False}
                   elif ( e.code == 500):
                       self.logger.critical('Internal server error')
                       exit(e.code)
                except URLError as e:
                   exit('%s' % e.reason)
                else :
                   out = json.loads(response.read())
                   assert response.code >= 200
                   return out        

        requests_log = logging.getLogger("requests")
        requests_log.setLevel(logging.WARNING)
        
        # if the number of files in a subset dir is greater than <count_break>
        # then create a new one with the name <set> + '_' + <count_set>
        count_break = 5000
        count_set = 1
        start=time.time()

        # set subset:
        mdsubset=req["mdsubset"]
        if (not mdsubset):
            subset = 'SET'
        elif mdsubset.endswith('_'): # no OAI subsets, but different OAI-URLs for same community
            subset = mdsubset[:-1]
            mdsubset=None
        elif len(mdsubset) > 2 and mdsubset[-1].isdigit() and  mdsubset[-2] == '_' :
            subset = mdsubset[:-2]
        else:
            subset = mdsubset
            if req["community"] == "b2share" or re.match(r'http(.*?)b2share(.*?)api(.*?)',req["url"]) :
                setMapFile= '%s/mapfiles/b2share_mapset.json' % (os.getcwd())
            elif req["community"] == "dara" and req["url"] == "https://www.da-ra.de/oaip/oai" :
                setMapFile= '%s/mapfiles/dara_mapset.json' % (os.getcwd())
            else:
                setMapFile=None
            if setMapFile :
                with open(setMapFile) as sm :    
                    setMap = json.load(sm)
                    if mdsubset in setMap:
                        mdsubset = setMap[mdsubset]
            
        if (self.fromdate):
            subset = subset + '_f' + self.fromdate

        self.logger.debug(' |- Subset:    \t%s' % subset )

        # make subset dir:
        subsetdir = '/'.join([self.base_outdir,req['community']+'-'+req['mdprefix'],subset+'_'+str(count_set)])

        noffs=0 # set to number of record, where harvesting should start
        stats['tcount']=noffs
        fcount=0
        oldperc=0
        ntotrecs=0
        choffset=0
        chunklen=1000
        pageno=1
        records=list()

        ## JSON-API
        jsonapi_verbs=['dataset','works','records']
        if req["lverb"] in jsonapi_verbs :
            GBIF = GBIF_CLIENT(req['url'])   # create GBIF object   
            harvestreq=getattr(GBIF,'JSONAPI', None)
            outtypedir='hjson'
            outtypeext='json'
            if mdsubset and req["lverb"] == 'works' :
                haction='works?publisher-id='+mdsubset
                dresultkey='data'
            elif req["lverb"] == 'records' :
                haction=req["lverb"]
                if mdsubset :
                    haction+='?q=community:'+mdsubset+'&size='+str(chunklen)+'&page='+str(pageno)
                dresultkey='hits'
            else:
                haction=req["lverb"]
                dresultkey='results'
            try:
                chunk=harvestreq(**{'action':haction,'offset':None,'chunklen':chunklen,'key':None})
                self.logger.debug(" Got first %d records : chunk['data'] %s " % (chunklen,chunk[dresultkey]))
            except (HTTPError,ConnectionError,Exception) as e:
                self.logger.critical("%s :\n\thaction %s\n\tharvest request %s\n" % (e,haction,req))
                return -1

            if req["lverb"] == 'dataset':
                while('endOfRecords' in chunk and not chunk['endOfRecords']):
                    if 'results' in chunk :
                        records.extend(chunk['results'])
                    choffset+=chunklen
                    chunk = harvestreq(**{'action':haction,'offset':choffset,'chunklen':chunklen,'key':None})
                    self.logger.debug(" Got next records [%d,%d] from chunk %s " % (choffset,choffset+chunklen,chunk))
            elif req["lverb"] == 'records':
                records.extend(chunk['hits']['hits'])
                while('hits' in chunk and 'next' in chunk['links']):
                    if 'hits' in chunk :
                        records.extend(chunk['hits']['hits'])
                    pageno+=1
                    chunk =harvestreq(**{'action':haction,'page':pageno,'size':chunklen,'key':None})
                    self.logger.debug(" Got next records [%d,%d] from chunk %s " % (choffset,choffset+chunklen,chunk))
            else:
                if 'data' in chunk :
                    records.extend(chunk['data'])
                    
        # OAI-PMH (verb = ListRecords/Identifier )
        elif req["lverb"].startswith('List'):
            sickle = Sickle(req['url'], max_retries=3, timeout=300)
            outtypedir='xml'
            outtypeext='xml'
            harvestreq=getattr(sickle,req["lverb"], None)
            try:
                records,rc=tee(harvestreq(**{'metadataPrefix':req['mdprefix'],'set':mdsubset,'ignore_deleted':True,'from':self.fromdate}))
            except (HTTPError,ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" % (err,req['url']))
                return -1
            except (ImportError,etree.XMLSyntaxError,CannotDisseminateFormat,Exception) as err:
                self.logger.critical("%s during harvest request %s\n" % (err,req))
                return -1

        # CSW2.0
        elif req["lverb"].startswith('csw'):
            outtypedir='xml'
            outtypeext='xml'
            startposition=0
            maxrecords=20
            try:
                src = CatalogueServiceWeb(req['url'])
                NS = Namespaces()
                namespaces=NS.get_namespaces()
                if req['mdprefix'] == 'iso19139' or req['mdprefix'] == 'own' : 
                    nsp = namespaces['gmd']
                else :
                    nsp = namespaces['csw']

                harvestreq=getattr(src,'getrecords2')
                chunk = harvestreq(**{'esn':'full','startposition':choffset,'maxrecords':maxrecords,'outputschema':nsp})
                chunklist=list(src.records.items())
                while(len(chunklist) > 0) :
                    records.extend(chunklist)
                    choffset+=maxrecords
                    chunk = harvestreq(**{'esn':'full','startposition':choffset,'maxrecords':maxrecords,'outputschema':nsp})
                    chunklist=list(src.records.items())
                    self.logger.debug(" Got next %s records [%d,%d] from chunk " % (nsp,choffset,choffset+chunklen))
            except (HTTPError,ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" % (err,req['url']))
                return -1
            except (ImportError,CannotDisseminateFormat,Exception) as err:
                self.logger.error("%s : During harvest request %s\n" % (err,req))
                ##return -1

        # Restful API POST request
        elif req["lverb"].startswith('POST'):
            outtypedir='hjson'
            outtypeext='json'
            startposition=0
            maxrecords=1000
            try:
                url=req['url']
                data={ "text" : "mnhn", "searchTextInMetadata" : True, "searchTextInAdditionalData" : True, "page" : 1, "size" : 1000, "highlight" : { "preTag" : "<b>", "postTag" : "</b>", "fragmentSize" : 500, "fragmentsCount" : 1 } }
                headers = {'content-type': 'application/json'}
                response = requests.post(url, data=json.dumps(data), headers=headers, verify=False )##, stream=True ) ##HEW-D auth=('myusername', 'mybasicpass'))
                records=response.json()['result']
            except (HTTPError,ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" % (err,req['url']))
                return -1
            except (ImportError,CannotDisseminateFormat,Exception) as err:
                self.logger.critical("%s during harvest request \n" % err)
                return -1
            
        # CKAN-API request
        elif req["lverb"].startswith('ckan_api'):
            outtypedir='hjson'
            outtypeext='json'
            startposition=0
            maxrecords=1000
            try:
                url=req['url']
                action_url = '{url}/{action}'.format(url=url,action='package_list')
                self.logger.debug('action_url %s' % action_url)            
                data_string=json.dumps({}).encode('utf8')
                request = Request(action_url,data_string)
                self.logger.debug('request %s' % request)            
                response = urlopen(request)
                self.logger.debug('response %s' % response)            
                records= json.loads(response.read())['result']
                self.logger.debug('records %s' % records[:10])
                sys.exit(-1)
            except (HTTPError,ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" % (err,req['url']))
                return -1
            except (ImportError,CannotDisseminateFormat,Exception) as err:
                self.logger.critical("%s during harvest request \n" % err)
                return -1
            
        # SparQL
        elif req["lverb"].startswith('Sparql'):
            outtypedir='hjson'
            outtypeext='json'
            startposition=0
            maxrecords=1000
            try:
                src = SPARQLWrapper(req['url'])
                harvestreq=getattr(src,'query','format') ##
                statement='''prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
prefix prov: <http://www.w3.org/ns/prov#>
prefix dcterms: <http://purl.org/dc/terms/>
select
?url ?doi
(concat("11676/", substr(str(?url), strlen(str(?url)) - 23)) AS ?pid)
(if(bound(?theTitle), ?theTitle, ?fileName) as ?title)
(if(bound(?theDescription), ?theDescription, ?spec) as ?description)
?submissionTime ?tempCoverageFrom ?tempCoverageTo
?dataLevel ?format ?sha256sum ?latitude ?longitude ?spatialCoverage
where{
   ?url cpmeta:wasSubmittedBy [
     prov:endedAtTime ?submissionTime ;
     prov:wasAssociatedWith [a ?submitterClass]
    ] .
 ?url cpmeta:hasObjectSpec [rdfs:label ?spec ; cpmeta:hasDataLevel ?dataLevel; cpmeta:hasFormat/rdfs:label ?format ] .
  FILTER(?submitterClass = cpmeta:ThematicCenter || ?submitterClass = cpmeta:ES || ?dataLevel = "3"^^xsd:integer)
   ?url cpmeta:hasName ?fileName .
   ?url cpmeta:hasSha256sum ?sha256sum .
   OPTIONAL{?url dcterms:title ?theTitle ; dcterms:description ?theDescription}
   OPTIONAL{?coll dcterms:hasPart ?url . ?coll cpmeta:hasDoi ?doi }
   {
     {
         ?url cpmeta:wasAcquiredBy ?acq .
         ?acq prov:startedAtTime ?tempCoverageFrom; prov:endedAtTime ?tempCoverageTo; prov:wasAssociatedWith ?station .
         {
           {
                ?station cpmeta:hasLatitude ?latitude .
                ?station cpmeta:hasLongitude ?longitude .
           }UNION{
                ?url cpmeta:hasSpatialCoverage/cpmeta:asGeoJSON ?spatialCoverage .
           }
         }
     }UNION{
         ?url cpmeta:hasStartTime ?tempCoverageFrom .
         ?url cpmeta:hasEndTime ?tempCoverageTo .
         ?url cpmeta:hasSpatialCoverage/cpmeta:asGeoJSON ?spatialCoverage .
     }
   }
}
limit 10'''



                '''
prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
prefix prov: <http://www.w3.org/ns/prov#>
select (str(?submTime) as ?time) ?dobj ?spec ?dataLevel ?fileName ?submitterName where{
  ?dobj cpmeta:hasObjectSpec [rdfs:label ?spec ; cpmeta:hasDataLevel ?dataLevel].
  ?dobj cpmeta:hasName ?fileName .
  ?dobj cpmeta:wasSubmittedBy ?submission .
  ?submission prov:endedAtTime ?submTime .
  ?submission prov:wasAssociatedWith [cpmeta:hasName ?submitterName].
}
order by desc(?submTime)
limit 1000
'''                            
                src.setQuery(statement)
                src.setReturnFormat(JSON)
                records = harvestreq().convert()['results']['bindings']
            except (HTTPError,ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" % (err,req['url']))
                return -1
            except (ImportError,CannotDisseminateFormat,Exception) as err:
                self.logger.critical("%s during harvest request %s\n" % (err,req))
                return -1
            
        else:
            self.logger.critical(' Unsupported harvest type %s' % req["lverb"])
            sys.exit()

        self.logger.debug(" Harvest method used %s" % req["lverb"])
        try:
            if req["lverb"].startswith('List'):
                ntotrecs=len(list(rc))
            else:
                ntotrecs=len(records) 
        except Exception as err:
            self.logger.error('%s : Iteration over the records does not work?' % (err))
            
        print ("\t|- Retrieved %d records in %d sec - write %s files to disc" % (ntotrecs,time.time()-start,outtypeext.upper()) )
        if ntotrecs == 0 :
            self.logger.warning("\t|- Can not access any records to harvest")
            return -1

        self.logger.debug(' | %-4s | %-25s | %-25s |' % ('#','OAI Identifier','DS Identifier'))
        start2=time.time()

        if (not os.path.isdir(subsetdir+'/'+ outtypedir)):
            os.makedirs(subsetdir+'/' + outtypedir)
        
        delete_ids=list()
        # loop over records
        for record in records :
            ## counter and progress bar
            stats['tcount'] += 1
            fcount+=1
            if fcount <= noffs : continue
            if ntotrecs > 0 :
                perc=int(fcount*100/ntotrecs)
                bartags=int(perc/5)
                if perc%10 == 0 and perc != oldperc :
                    oldperc=perc
                    print ("\r\t[%-20s] %5d (%3d%%) in %d sec" % ('='*bartags, fcount, perc, time.time()-start2 ))
                    sys.stdout.flush()
                    
            # Set oai_id and generate a unique identifier for this dataset:
            delete_flag=False
            if req["lverb"] == 'dataset' or req["lverb"] == 'works' or req["lverb"] == 'records' : ## Harvest via JSON-API
                if 'key' in record :
                    oai_id = record['key']
                elif 'id' in record :
                    oai_id = record['id']
            
            elif req["lverb"] == 'csw': ## Harvest via CSW2.0
                if hasattr(record,'identifier') :
                    oai_id = record.identifier
                elif(record):
                    oai_id = record[0]
                else:
                    self.logger.critical('Record %s has no attribute identifier' % record)
            
            elif req["lverb"] == 'ListIdentifiers' : ## OAI-PMH harvesting of XML records
                if (record.deleted):
                    stats['totdcount'] += 1
                    delete_flag=True
                    ##HEW-D continue
                else:
                    oai_id = record.identifier
                    try:
                        record = sickle.GetRecord(**{'metadataPrefix':req['mdprefix'],'identifier':record.identifier})
                    except (CannotDisseminateFormat,Exception) as err:
                        self.logger.error('%s during GetRecord of %s' % (err,record.identifier))
                        stats['ecount'] += 1
                        continue
            elif req["lverb"] == 'ListRecords' :
                if (record.header.deleted):
                    stats['totdcount'] += 1
                    continue
                else:
                    oai_id = record.header.identifier
            elif req["lverb"].startswith('Sparql'):
                if 'fileName' in record:
                    oai_id=record['fileName']['value']
                elif 'title' in record:
                    oai_id=record['title']['value']

            elif req["lverb"].startswith('POST'):
                if 'depositIdentifier' in record:
                    oai_id=record['depositIdentifier']

            elif req["lverb"].startswith('ckan_api'):
                try:
                    oai_id=record
                    ##HEW-D action_url = '{url}/{action}?id={record}'.format(url=url,action='package_show',record=record)
                    action_url = '{url}/{action}'.format(url=url,action='package_show')
                    self.logger.debug('action_url %s' % action_url)            
                    data_string=json.dumps({"id": record }).encode('utf8')
                    self.logger.debug('data_string %s' % data_string)
                    request = Request(action_url,data_string)
                    self.logger.debug('request %s' % request)            
                    response = urlopen(request)
                    self.logger.debug('response %s' % response)            
                    record= json.loads(response.read())['result']
                    self.logger.debug('records %s' % records)
                except (HTTPError,ConnectionError) as err:
                    self.logger.critical("%s during connecting to %s\n" % (err,req['url']))
                    return -1
                except (ImportError,CannotDisseminateFormat,Exception) as err:
                    self.logger.critical("%s during harvest request \n" % err)
                    return -1

            # generate a unique identifier and a filename for this dataset:
            uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, oai_id))
            outfile = '%s/%s/%s.%s' % (subsetdir,outtypedir,os.path.basename(uid),outtypeext)

            if delete_flag : # record marked as deleted on provider site
                jsonfile = '%s/%s/%s.%s' % (subsetdir,'json',os.path.basename(uid),'json')
                # remove previously harvested xml and json files for this record:
                os.remove(outfile)
                os.remove(jsonfile)
                delete_ids.append(uid)

            # write record on disc
            try:
                self.logger.debug('    | h | %-4d | %-45s | %-45s |' % (stats['count']+1,oai_id,uid))
                self.logger.debug('Try to write the harvested JSON record to %s' % outfile)
     
                if outtypeext == 'xml':   # get and write the XML content:
                    if req["lverb"] == 'csw':
                        metadata = etree.fromstring(record[1].xml)
                    elif hasattr(record,'raw'):
                        metadata = etree.fromstring(record.raw)
                    elif hasattr(record,'xml'):
                        metadata = etree.fromstring(record.xml)

                    if (metadata is not None):
                        try:
                            metadata = etree.tostring(metadata, pretty_print = True).decode('utf-8')
                        except (Exception,UnicodeEncodeError) as e:
                            self.logger.critical('%s : Metadata: %s ...' % (e,metadata[:20]))
                        ##if PY2 :
                        ##    try:
                        ##        metadata = metadata.encode('utf-8')
                        ##    except (Exception,UnicodeEncodeError) as e :
                        ##        self.logger.debug('%s : Metadata : %s ...' % (e,metadata[20]))

                        try:
                            f = open(outfile, 'w')
                            f.write(metadata)
                            f.close()
                        except (Exception,IOError) as err :
                            self.logger.critical("%s : Cannot write metadata in xml file %s" % (err,outfile))
                            stats['ecount'] +=1
                            continue
                        else:
                            logging.debug('Harvested XML file written to %s' % outfile)
                            stats['count'] += 1
                    else:
                        stats['ecount'] += 1
                        self.logger.error('No metadata available for %s' % record)

                elif outtypeext == 'json':   # get the raw json content:
                     if (record is not None):
                         try:
                             with open(outfile, 'w') as f:
                                 json.dump(record,f, sort_keys = True, indent = 4)
                         except IOError as err:
                             logging.error("[ERROR] Cannot write metadata in out file '%s': %s\n" % (outfile, err))
                             stats['ecount'] +=1
                             continue
                         else :
                            stats['count'] += 1
                            logging.debug('Harvested JSON file written to %s' % outfile)
                     else:
                        stats['ecount'] += 1
                        logging.warning('    [WARNING] No metadata available for %s' % record['key']) ##HEW-???' % oai_id)


            except TypeError as e:
                    logging.error('    [ERROR] TypeError: %s' % e)
                    stats['ecount']+=1        
                    continue
            except Exception as e:
                    logging.error("    [ERROR] %s and %s" % (e,traceback.format_exc()))
                    ## logging.debug(metadata)
                    stats['ecount']+=1
                    continue

            # Next or last subset?
            if (stats['count'] == count_break) or (fcount == ntotrecs):
                    print('       | %d records written to subset directory %s ' % (stats['count'], subsetdir))

                    # clean up current subset and write ids to remove to delete file
                    for df in os.listdir(subsetdir+'/'+ outtypedir):
                        df=os.path.join(subsetdir+'/'+ outtypedir,df)
                        logging.debug('File to delete : %s' % df)
                        id=os.path.splitext(os.path.basename(df))[0]
                        jf=os.path.join(subsetdir+'/json/',id+'.json')
                        if os.stat(df).st_mtime < start - 1 * 86400:
                            os.remove(df)
                            logging.warning('File %s is deleted' % df)
                            if os.path.exists(jf) : 
                                os.remove(jf)
                                logging.warning('File %s is deleted' % jf)
                            delete_ids.append(id)
                            logging.warning('Append Id %s to list delete_ids' % id)
                            stats['dcount']+=1

                    print('       | %d records deleted from subset directory %s ' % (stats['dcount'], subsetdir))

                    if not fcount == ntotrecs : # next subset needed
                        subsetdir = self.save_subset(req, stats, subset, count_set)
                        if (not os.path.isdir(subsetdir+'/'+ outtypedir)):
                            os.makedirs(subsetdir+'/' + outtypedir)

                        count_set += 1
                                                        
                    # add all subset stats to total stats and reset the temporal subset stats:
                    for key in ['tcount', 'ecount', 'count', 'dcount']:
                        stats['tot'+key] += stats[key]
                        stats[key] = 0
                            
                        # start with a new time:
                        stats['timestart'] = time.time()
                
                    logging.debug('    | %d records written to subset directory %s (if not failed).'% (stats['count'], subsetdir))

        # path to the file with all ids to delete:
        delete_file = '/'.join([self.base_outdir,'delete',req['community']+'-'+req['mdprefix']+'.del'])
        if len(delete_ids) > 0 :
            with open(delete_file, 'a+') as file:
                for id in delete_ids :
                    file.write(id+'\n')

        # add all subset stats to total stats and reset the temporal subset stats:
        for key in ['tcount', 'ecount', 'count', 'dcount']:
                stats['tot'+key] += stats[key]
            
        print ('   \t|- %-10s |@ %-10s |\n\t| Provided | Harvested | Failed | Deleted |\n\t| %8d | %9d | %6d | %6d |' % ( 'Finished',time.strftime("%H:%M:%S"),
                    stats['tottcount'],
                    stats['totcount'],
                    stats['totecount'],
                    stats['totdcount']
                ))
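
A hypothetical driver for the method above (the Harvester class name, its constructor arguments and the OAI endpoint URL are illustrative, not taken from the example):

harvester = Harvester(base_outdir='oaidata', fromdate=None)  # assumed constructor
request = ['mycommunity',                   # community
           'https://example.org/oai/oai2',  # source (OAI URL)
           'ListRecords',                   # verb
           'oai_dc',                        # mdprefix
           None]                            # mdsubset
if harvester.harvest(request) == -1:
    print('harvest failed')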
Example #32
    def harvest(self, request):
        ## harvest (Harvester object, request = [community, source, verb, mdprefix, mdsubset])
        # Harvest all files with <mdprefix> and <mdsubset> from <source> via sickle module and store those to hard drive.
        #
        # Parameters:
        # -----------
        # (list)  request - A list with following items:
        #                    1. community
        #                    2. source (OAI URL)
        #                    3. verb (ListIdentifiers, ListRecords or JSONAPI)
        #                    4. mdprefix (OAI md format as oai_dc, iso etc.)
        #                    5. mdsubset
        #
        # Return Values:
        # --------------
        # 1. (integer)  is -1 if something went wrong

        # create a request dictionary:
        req = {
            "community": request[0],
            "url": request[1],
            "lverb": request[2],
            "mdprefix": request[3],
            "mdsubset": request[4] if len(request) > 4 else None
        }

        # create dictionary with stats:
        resKeys = ['count', 'tcount', 'ecount', 'time']
        results = dict.fromkeys(resKeys, 0)

        stats = {
            "tottcount": 0,  # total number of provided datasets
            "totcount": 0,  # total number of successful harvested datasets
            "totecount": 0,  # total number of failed datasets
            "totdcount": 0,  # total number of all deleted datasets
            "tcount": 0,  # number of all provided datasets per subset
            "count":
            0,  # number of all successful harvested datasets per subset
            "ecount": 0,  # number of all failed datasets per subset
            "dcount": 0,  # number of all deleted datasets per subset
            "timestart": time.time(),  # start time per subset process
        }

        # the gbif api client
        class GBIF_CLIENT(object):

            # call action api:
            ## GBIF.action('package_list',{})

            def __init__(self, api_url):  ##, api_key):
                self.api_url = api_url
                self.logger = logging.getLogger('root')

            def JSONAPI(self, action, offset, chunklen, key):
                ## JSONAPI (action) - method
                return self.__action_api(action, offset, chunklen, key)

            def __action_api(self, action, offset, chunklen, key):
                # Make the HTTP request for get datasets from GBIF portal
                response = ''
                rvalue = 0
                ## offset = 0
                limit = chunklen  ## None for DataCite-JSON !!!
                api_url = self.api_url
                if key:
                    action_url = "{apiurl}/{action}/{key}".format(
                        apiurl=api_url, action=action, key=str(key))
                elif offset == None:
                    action_url = "{apiurl}/{action}".format(apiurl=api_url,
                                                            action=action)
                else:
                    action_url = "{apiurl}/{action}?offset={offset}&limit={limit}".format(
                        apiurl=api_url,
                        action=action,
                        offset=str(offset),
                        limit=str(limit))

                self.logger.debug('action_url: %s' % action_url)
                try:
                    request = Request(action_url)
                    response = urlopen(request)
                except HTTPError as e:
                    self.logger.error(
                        '%s : The server %s couldn\'t fulfill the action %s.' %
                        (e.code, self.api_url, action))
                    if (e.code == 403):
                        self.logger.critical(
                            'Access forbidden, maybe the API key is not valid?'
                        )
                        exit(e.code)
                    elif (e.code == 409):
                        self.logger.critical(
                            'Maybe you have a parameter error?')
                        return {"success": False}
                    elif (e.code == 500):
                        self.logger.critical('Internal server error')
                        exit(e.code)
                except URLError as e:
                    exit('%s' % e.reason)
                else:
                    out = json.loads(response.read())
                    assert response.code >= 200
                    return out

        requests_log = logging.getLogger("requests")
        requests_log.setLevel(logging.WARNING)

        # if the number of files in a subset dir is greater than <count_break>
        # then create a new one with the name <set> + '_' + <count_set>
        count_break = 5000
        count_set = 1
        start = time.time()

        # set subset:
        mdsubset = req["mdsubset"]
        if (not mdsubset):
            subset = 'SET'
        elif mdsubset.endswith(
                '_'
        ):  # no OAI subsets, but different OAI-URLs for same community
            subset = mdsubset[:-1]
            mdsubset = None
        elif len(mdsubset) > 2 and mdsubset[-1].isdigit(
        ) and mdsubset[-2] == '_':
            subset = mdsubset[:-2]
        else:
            subset = mdsubset
            if req["community"] == "b2share" or re.match(
                    r'http(.*?)b2share(.*?)api(.*?)', req["url"]):
                setMapFile = '%s/mapfiles/b2share_mapset.json' % (os.getcwd())
            elif req["community"] == "dara" and req[
                    "url"] == "https://www.da-ra.de/oaip/oai":
                setMapFile = '%s/mapfiles/dara_mapset.json' % (os.getcwd())
            else:
                setMapFile = None
            if setMapFile:
                with open(setMapFile) as sm:
                    setMap = json.load(sm)
                    if mdsubset in setMap:
                        mdsubset = setMap[mdsubset]

        if (self.fromdate):
            subset = subset + '_f' + self.fromdate

        self.logger.debug(' |- Subset:    \t%s' % subset)

        # make subset dir:
        subsetdir = '/'.join([
            self.base_outdir, req['community'] + '-' + req['mdprefix'],
            subset + '_' + str(count_set)
        ])

        noffs = 0  # set to number of record, where harvesting should start
        stats['tcount'] = noffs
        fcount = 0
        oldperc = 0
        ntotrecs = 0
        choffset = 0
        chunklen = 1000
        pageno = 1
        records = list()

        ## JSON-API
        jsonapi_verbs = ['dataset', 'works', 'records']
        if req["lverb"] in jsonapi_verbs:
            GBIF = GBIF_CLIENT(req['url'])  # create GBIF object
            harvestreq = getattr(GBIF, 'JSONAPI', None)
            outtypedir = 'hjson'
            outtypeext = 'json'
            if mdsubset and req["lverb"] == 'works':
                haction = 'works?publisher-id=' + mdsubset
                dresultkey = 'data'
            elif req["lverb"] == 'records':
                haction = req["lverb"]
                if mdsubset:
                    haction += '?q=community:' + mdsubset + '&size=' + str(
                        chunklen) + '&page=' + str(pageno)
                dresultkey = 'hits'
            else:
                haction = req["lverb"]
                dresultkey = 'results'
            try:
                chunk = harvestreq(
                    **{
                        'action': haction,
                        'offset': None,
                        'chunklen': chunklen,
                        'key': None
                    })
                self.logger.debug(" Got first %d records : chunk['data'] %s " %
                                  (chunklen, chunk[dresultkey]))
            except (HTTPError, ConnectionError, Exception) as e:
                self.logger.critical(
                    "%s :\n\thaction %s\n\tharvest request %s\n" %
                    (e, haction, req))
                return -1

            if req["lverb"] == 'dataset':
                while ('endOfRecords' in chunk and not chunk['endOfRecords']):
                    if 'results' in chunk:
                        records.extend(chunk['results'])
                    choffset += chunklen
                    chunk = harvestreq(
                        **{
                            'action': haction,
                            'offset': choffset,
                            'chunklen': chunklen,
                            'key': None
                        })
                    self.logger.debug(
                        " Got next records [%d,%d] from chunk %s " %
                        (choffset, choffset + chunklen, chunk))
            elif req["lverb"] == 'records':
                records.extend(chunk['hits']['hits'])
                while ('hits' in chunk and 'next' in chunk['links']):
                    if 'hits' in chunk:
                        records.extend(chunk['hits']['hits'])
                    pageno += 1
                    chunk = harvestreq(
                        **{
                            'action': haction,
                            'page': pageno,
                            'size': chunklen,
                            'key': None
                        })
                    self.logger.debug(
                        " Got next records [%d,%d] from chunk %s " %
                        (choffset, choffset + chunklen, chunk))
            else:
                if 'data' in chunk:
                    records.extend(chunk['data'])

        # OAI-PMH (verb = ListRecords/Identifier )
        elif req["lverb"].startswith('List'):
            sickle = Sickle(req['url'], max_retries=3, timeout=300)
            outtypedir = 'xml'
            outtypeext = 'xml'
            harvestreq = getattr(sickle, req["lverb"], None)
            try:
                records, rc = tee(
                    harvestreq(
                        **{
                            'metadataPrefix': req['mdprefix'],
                            'set': mdsubset,
                            'ignore_deleted': True,
                            'from': self.fromdate
                        }))
            except (HTTPError, ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" %
                                     (err, req['url']))
                return -1
            except (ImportError, etree.XMLSyntaxError, CannotDisseminateFormat,
                    Exception) as err:
                self.logger.critical("%s during harvest request %s\n" %
                                     (err, req))
                return -1

        # CSW2.0
        elif req["lverb"].startswith('csw'):
            outtypedir = 'xml'
            outtypeext = 'xml'
            startposition = 0
            maxrecords = 20
            try:
                src = CatalogueServiceWeb(req['url'])
                NS = Namespaces()
                namespaces = NS.get_namespaces()
                if req['mdprefix'] == 'iso19139' or req['mdprefix'] == 'own':
                    nsp = namespaces['gmd']
                else:
                    nsp = namespaces['csw']

                harvestreq = getattr(src, 'getrecords2')
                chunk = harvestreq(
                    **{
                        'esn': 'full',
                        'startposition': choffset,
                        'maxrecords': maxrecords,
                        'outputschema': nsp
                    })
                chunklist = list(src.records.items())
                while (len(chunklist) > 0):
                    records.extend(chunklist)
                    choffset += maxrecords
                    chunk = harvestreq(
                        **{
                            'esn': 'full',
                            'startposition': choffset,
                            'maxrecords': maxrecords,
                            'outputschema': nsp
                        })
                    chunklist = list(src.records.items())
                    self.logger.debug(
                        " Got next %s records [%d,%d] from chunk " %
                        (nsp, choffset, choffset + chunklen))
            except (HTTPError, ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" %
                                     (err, req['url']))
                return -1
            except (ImportError, CannotDisseminateFormat, Exception) as err:
                self.logger.error("%s : During harvest request %s\n" %
                                  (err, req))
                ##return -1

        # SparQL
        elif req["lverb"].startswith('Sparql'):
            outtypedir = 'hjson'
            outtypeext = 'json'
            startposition = 0
            maxrecords = 1000
            try:
                src = SPARQLWrapper(req['url'])
                harvestreq = getattr(src, 'query', 'format')  ##
                statement = '''
prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
prefix prov: <http://www.w3.org/ns/prov#>
select (str(?submTime) as ?time) ?dobj ?spec ?dataLevel ?fileName ?submitterName where{
  ?dobj cpmeta:hasObjectSpec [rdfs:label ?spec ; cpmeta:hasDataLevel ?dataLevel].
  ?dobj cpmeta:hasName ?fileName .
  ?dobj cpmeta:wasSubmittedBy ?submission .
  ?submission prov:endedAtTime ?submTime .
  ?submission prov:wasAssociatedWith [cpmeta:hasName ?submitterName].
}
order by desc(?submTime)
limit 1000
'''
                src.setQuery(statement)
                src.setReturnFormat(JSON)
                records = harvestreq().convert()['results']['bindings']
            except (HTTPError, ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" %
                                     (err, req['url']))
                return -1
            except (ImportError, CannotDisseminateFormat, Exception) as err:
                self.logger.critical("%s during harvest request %s\n" %
                                     (err, req))
                return -1

        else:
            self.logger.critical(' Unsupported harvest type %s' %
                                 req["lverb"])
            sys.exit()

        self.logger.debug(" Harvest method used %s" % req["lverb"])
        try:
            if req["lverb"].startswith('List'):
                ntotrecs = len(list(rc))
            else:
                ntotrecs = len(records)
        except Exception as err:
            self.logger.error('%s : Iteration over the records does not work?' % (err))

        print("\t|- Retrieved %d records in %d sec - write %s files to disc" %
              (ntotrecs, time.time() - start, outtypeext.upper()))
        if ntotrecs == 0:
            self.logger.warning("\t|- Can not access any records to harvest")
            return -1

        self.logger.debug(' | %-4s | %-25s | %-25s |' %
                          ('#', 'OAI Identifier', 'DS Identifier'))
        start2 = time.time()

        if (not os.path.isdir(subsetdir + '/' + outtypedir)):
            os.makedirs(subsetdir + '/' + outtypedir)

        delete_ids = list()
        # loop over records
        for record in records:
            ## counter and progress bar
            stats['tcount'] += 1
            fcount += 1
            if fcount <= noffs: continue
            if ntotrecs > 0:
                perc = int(fcount * 100 / ntotrecs)
                bartags = int(perc / 5)
                if perc % 10 == 0 and perc != oldperc:
                    oldperc = perc
                    print("\r\t[%-20s] %5d (%3d%%) in %d sec" %
                          ('=' * bartags, fcount, perc, time.time() - start2))
                    sys.stdout.flush()

            # Set oai_id and generate a unique identifier for this dataset:
            delete_flag = False
            if req["lverb"] == 'dataset' or req["lverb"] == 'works' or req[
                    "lverb"] == 'records':  ## Harvest via JSON-API
                if 'key' in record:
                    oai_id = record['key']
                elif 'id' in record:
                    oai_id = record['id']

            elif req["lverb"] == 'csw':  ## Harvest via CSW2.0
                if hasattr(record, 'identifier'):
                    oai_id = record.identifier
                elif (record):
                    oai_id = record[0]
                else:
                    self.logger.critical(
                        'Record %s has no attribute identifier' % record)

            elif req[
                    "lverb"] == 'ListIdentifiers':  ## OAI-PMH harvesting of XML records
                if (record.deleted):
                    stats['totdcount'] += 1
                    delete_flag = True
                    ##HEW-D continue
                else:
                    oai_id = record.identifier
                    record = sickle.GetRecord(
                        **{
                            'metadataPrefix': req['mdprefix'],
                            'identifier': record.identifier
                        })
            elif req["lverb"] == 'ListRecords':
                if (record.header.deleted):
                    stats['totdcount'] += 1
                    continue
                else:
                    oai_id = record.header.identifier
            elif req["lverb"].startswith('Sparql'):
                oai_id = record['fileName']['value']

            # generate a unique identifier and a filename for this dataset:
            uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, oai_id))
            outfile = '%s/%s/%s.%s' % (subsetdir, outtypedir,
                                       os.path.basename(uid), outtypeext)

            if delete_flag:  # record marked as deleted on provider site
                jsonfile = '%s/%s/%s.%s' % (subsetdir, 'json',
                                            os.path.basename(uid), 'json')
                # remove previously harvested xml and json files for this record:
                os.remove(outfile)
                os.remove(jsonfile)
                delete_ids.append(uid)

            # write record on disc
            try:
                self.logger.debug('    | h | %-4d | %-45s | %-45s |' %
                                  (stats['count'] + 1, oai_id, uid))
                self.logger.debug(
                    'Try to write the harvested JSON record to %s' % outfile)

                if outtypeext == 'xml':  # get and write the XML content:
                    if req["lverb"] == 'csw':
                        metadata = etree.fromstring(record[1].xml)
                    elif hasattr(record, 'raw'):
                        metadata = etree.fromstring(record.raw)
                    elif hasattr(record, 'xml'):
                        metadata = etree.fromstring(record.xml)

                    if (metadata is not None):
                        try:
                            metadata = etree.tostring(
                                metadata, pretty_print=True).decode('utf-8')
                        except (Exception, UnicodeEncodeError) as e:
                            self.logger.critical('%s : Metadata: %s ...' %
                                                 (e, metadata[:20]))
                        ##if PY2 :
                        ##    try:
                        ##        metadata = metadata.encode('utf-8')
                        ##    except (Exception,UnicodeEncodeError) as e :
                        ##        self.logger.debug('%s : Metadata : %s ...' % (e,metadata[20]))

                        try:
                            f = open(outfile, 'w')
                            f.write(metadata)
                            f.close()
                        except (Exception, IOError) as err:
                            self.logger.critical(
                                "%s : Cannot write metadata in xml file %s" %
                                (err, outfile))
                            stats['ecount'] += 1
                            continue
                        else:
                            logging.debug('Harvested XML file written to %s' %
                                          outfile)
                            stats['count'] += 1
                    else:
                        stats['ecount'] += 1
                        self.logger.error('No metadata available for %s' %
                                          record)

                elif outtypeext == 'json':  # get the raw json content:
                    if (record is not None):
                        try:
                            with open(outfile, 'w') as f:
                                json.dump(record, f, sort_keys=True, indent=4)
                        except IOError as err:
                            logging.error(
                                "[ERROR] Cannot write metadata in out file '%s': %s\n"
                                % (outfile, err))
                            stats['ecount'] += 1
                            continue
                        else:
                            stats['count'] += 1
                            logging.debug('Harvested JSON file written to %s' %
                                          outfile)
                    else:
                        stats['ecount'] += 1
                        logging.warning(
                            '    [WARNING] No metadata available for %s' %
                            record['key'])  ##HEW-???' % oai_id)

            except TypeError as e:
                logging.error('    [ERROR] TypeError: %s' % e)
                stats['ecount'] += 1
                continue
            except Exception as e:
                logging.error("    [ERROR] %s and %s" %
                              (e, traceback.format_exc()))
                ## logging.debug(metadata)
                stats['ecount'] += 1
                continue

            # Next or last subset?
            if (stats['count'] == count_break) or (fcount == ntotrecs):
                print('       | %d records written to subset directory %s ' %
                      (stats['count'], subsetdir))

                # clean up current subset and write ids to remove to delete file
                for df in os.listdir(subsetdir + '/' + outtypedir):
                    df = os.path.join(subsetdir + '/' + outtypedir, df)
                    logging.debug('File to delete : %s' % df)
                    id = os.path.splitext(os.path.basename(df))[0]
                    jf = os.path.join(subsetdir + '/json/', id + '.json')
                    if os.stat(df).st_mtime < start - 1 * 86400:
                        os.remove(df)
                        logging.warning('File %s is deleted' % df)
                        if os.path.exists(jf):
                            os.remove(jf)
                            logging.warning('File %s is deleted' % jf)
                        delete_ids.append(id)
                        logging.warning('Append Id %s to list delete_ids' % id)
                        stats['dcount'] += 1

                print('       | %d records deleted from subset directory %s ' %
                      (stats['dcount'], subsetdir))

                if not fcount == ntotrecs:  # next subset needed
                    subsetdir = self.save_subset(req, stats, subset, count_set)
                    if (not os.path.isdir(subsetdir + '/' + outtypedir)):
                        os.makedirs(subsetdir + '/' + outtypedir)

                    count_set += 1

                # add all subset stats to total stats and reset the temporal subset stats:
                for key in ['tcount', 'ecount', 'count', 'dcount']:
                    stats['tot' + key] += stats[key]
                    stats[key] = 0

                    # start with a new time:
                    stats['timestart'] = time.time()

                logging.debug(
                    '    | %d records written to subset directory %s (if not failed).'
                    % (stats['count'], subsetdir))

        # path to the file with all ids to delete:
        delete_file = '/'.join([
            self.base_outdir, 'delete',
            req['community'] + '-' + req['mdprefix'] + '.del'
        ])
        if len(delete_ids) > 0:
            with open(delete_file, 'a') as file:
                for id in delete_ids:
                    file.write(id + '\n')

        # add all subset stats to total stats and reset the temporal subset stats:
        for key in ['tcount', 'ecount', 'count', 'dcount']:
            stats['tot' + key] += stats[key]

        print(
            '   \t|- %-10s |@ %-10s |\n\t| Provided | Harvested | Failed | Deleted |\n\t| %8d | %9d | %6d | %6d |'
            % ('Finished', time.strftime("%H:%M:%S"), stats['tottcount'],
               stats['totcount'], stats['totecount'], stats['totdcount']))
Example #33
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces(["sml", "gml", "xlink"])
Example #34
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces("dif")
    ns[None] = n.get_namespace("dif")
    return ns
Example #35
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["gml32", "ogc", "xsd"])
    ns[None] = n.get_namespace("ogc")
    return ns
Example #36
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["dif", "fes", "gml", "ogc", "xs", "xsi"])
    ns[None] = n.get_namespace("ogc")
    return ns
Example #37
def get_namespaces():
    ns = Namespaces()
    return ns.get_namespaces(["swe20", "xlink", "sos20", "om20", "gml32",
                              "xsi", "wml2"])
Example #38
File: iso.py Project: wsidl/OWSLib
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["gco","gmd","gml","gml32","gmx","gts","srv","xlink"])
    ns[None] = n.get_namespace("gmd")
    return ns
Example #39
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["gco","gmd","gml","gml32","gmx","gts","srv","xlink"])
    ns["che"] = 'http://www.geocat.ch/2008/che'
    ns[None] = n.get_namespace("gmd")
    return ns
Example #40
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["ogc", "sml", "gml", "sos", "swe", "xlink"])
    ns["ows"] = n.get_namespace("ows110")
    return ns
Example #41
def get_namespaces():
    n = Namespaces()
    namespaces = n.get_namespaces(["sml","gml","xlink"])
    namespaces["ism"] = "urn:us:gov:ic:ism:v2"
    return namespaces
Example #42
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces(["gml","ogc","ows","wfs"])
Example #43
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["atom", "dc", "gml", "gml32", "xlink"])
    ns.update(add_namespaces)
    ns[None] = n.get_namespace("atom")
    return ns
Example #44
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["dif","fes","gml","ogc","xs","xsi"])
    ns[None] = n.get_namespace("ogc")
    return ns
Example #45
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["ogc"])
    ns[None] = n.get_namespace("ogc")
    return ns
Example #46
def get_namespaces():
    n = Namespaces()
    return n.get_namespaces(["gmd", "gml", "gmi", "ogc","ows","wfs"])
Example #47
def get_namespaces():
    n = Namespaces()
    ns = n.get_namespaces(["fes","ogc","om","gml32","sml","swe20","swes","xlink"])
    ns["ows"] = n.get_namespace("ows110")
    ns["sos"] = n.get_namespace("sos20")
    return ns