コード例 #1
0
ファイル: RESTLoader.py プロジェクト: dhenry314/gleanomatic
 def run(self):
     """Page through the REST source and submit each page as a batch.

     The first URL comes from ``self.getNextURL()``; every following one
     from ``self.getNextURL(offset, count)``, with ``offset`` advanced by
     ``self.defaultCount`` after each page. The loop stops when
     ``getNextURL`` returns a falsy value, when a page yields an empty
     payload, or when the same page fails ``maxAttempts`` times in a row.
     """
     # Upper bound on consecutive failures for one page. Without a bound a
     # page that keeps failing would be requested forever, because the URL
     # is not advanced on failure.
     maxAttempts = 3
     attempts = 0
     offset = self.defaultOffset
     count = self.defaultCount
     url = self.getNextURL()
     while url:
         self.logger.info("Pulling REST records from " + str(url))
         try:
             response = Utils.getResponse(url)
             data = Utils.getJSONFromResponse(response)
         except Exception as e:
             self.logger.warning("Could not get content from " + str(url) +
                                 " ERROR: " + str(e))
             attempts += 1
             if attempts >= maxAttempts:
                 self.logger.warning("Giving up on " + str(url) + " after " +
                                     str(attempts) + " attempts.")
                 break
             continue
         attempts = 0
         if not data:
             # An empty page marks the end of the source. Stop here rather
             # than letting the getNextURL call below overwrite the
             # decision to stop.
             break
         if self.recordType == 'list':
             records = list(data)
         else:
             records = self.getRecords(url)
         self.addBatch(records)
         offset = offset + count
         url = self.getNextURL(offset, count)
コード例 #2
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def addResource(self, uri, sourceNamespace, setNamespace, batchTag=None):
     """Post a resource URI to ``self.resourceURI``.

     Returns a ``(record, message)`` tuple, where ``record`` is whatever
     ``Utils.getJSONFromResponse`` produces for the response and
     ``message`` is the result of ``self.getMessage(record)`` (logged as a
     warning when truthy).

     Raises ``Exception`` when ``Utils.checkURI`` raises ``URIException``
     and ``BadResourceURL`` when the post itself raises.
     """
     self.logger.info("Adding resource with uri: " + str(uri))
     try:
         Utils.checkURI(uri)
     except URIException:
         raise Exception("Resource uri did not validate. uri: " + str(uri))
     payload = {
         'sourceNamespace': sourceNamespace,
         'setNamespace': setNamespace,
         'uri': uri
     }
     # batchTag is optional and only sent when the caller supplied one.
     if batchTag:
         payload['batchTag'] = batchTag
     try:
         response = Utils.postRSData(self.resourceURI, payload)
     except Exception as err:
         failure = "Could not add resource. resourceURI: " + str(
             self.resourceURI)
         raise BadResourceURL(failure, err)
     record = Utils.getJSONFromResponse(response)
     message = self.getMessage(record)
     if message:
         self.logger.warning(message)
     return record, message
コード例 #3
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def addCapability(self, capURL, sourceNamespace, setNamespace, capType):
     """Post a capability URL of type ``capType`` to ``self.capabilityURI``.

     Returns a ``(record, message)`` tuple built the same way as the
     response handling below shows: the decoded response and the result of
     ``self.getMessage`` on it (logged as a warning when truthy).

     Raises ``Exception`` when ``Utils.checkURI`` raises and
     ``BadResourceURL`` when the post raises; both cases are logged first.
     """
     self.logger.info("Adding capability with url:" + str(capURL))
     try:
         Utils.checkURI(capURL)
     except Exception as err:
         # The same text is logged and carried by the raised exception.
         problem = ("Capability URL did not validate. url: " + str(capURL) +
                    " ERROR: " + str(err))
         self.logger.warning(problem)
         raise Exception(problem)
     payload = {
         'sourceNamespace': sourceNamespace,
         'setNamespace': setNamespace,
         'uri': capURL,
         'capabilityType': capType
     }
     try:
         response = Utils.postRSData(self.capabilityURI, payload)
     except Exception as err:
         self.logger.critical("Could not add capability. capabiltyURI: " +
                              str(self.capabilityURI) + " ERROR: " +
                              str(err))
         raise BadResourceURL(str(err))
     record = Utils.getJSONFromResponse(response)
     message = self.getMessage(record)
     if message:
         self.logger.warning(message)
     return record, message
コード例 #4
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def getMessage(self, record):
     """Return the record's ``message`` attribute, else ``msg``, else None.

     Both attributes are read through ``Utils.getRecordAttr``; the first
     truthy one wins, with ``message`` taking precedence.
     """
     longForm = Utils.getRecordAttr(record, 'message')
     shortForm = Utils.getRecordAttr(record, 'msg')
     return longForm or shortForm or None
コード例 #5
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def getManifest(self, batchTag, sourceNamespace, setNamespace):
     """Fetch the manifest for one batch of a source/set namespace.

     Returns ``False`` when ``Utils.checkURI`` gives a falsy result for the
     manifest URL, otherwise whatever ``Utils.getContent`` returns for it.
     """
     pathParts = [
         str(sourceNamespace),
         str(setNamespace),
         str(batchTag),
         "manifest",
     ]
     url = self.endpointURI + "/static/" + "/".join(pathParts)
     if not Utils.checkURI(url):
         return False
     return Utils.getContent(url)
コード例 #6
0
 def getResourceContent(self, resID):
     """Fetch and decode the content stored for resource ``resID``.

     The URL is ``self.targetURI + '/content/' + resID``. Any exception
     raised while fetching or decoding is re-raised as ``BadResourceURL``.
     """
     contentURL = self.targetURI + '/content/' + resID
     try:
         record = Utils.getJSONFromResponse(Utils.getResponse(contentURL))
     except Exception as err:
         raise BadResourceURL("Could not get data from url", err,
                              self.logger)
     return record
コード例 #7
0
 def postLog(self, level, msg):
     """Send one log entry for this instance's namespace and batch.

     The entry carries the message, the level, the combined
     ``sourceNamespace/setNamespace`` string and ``self.batchTag``, and is
     handed to ``Utils.postToLog``.
     """
     entry = {
         "MSG": msg,
         "LEVEL": level,
         "NAMESPACE": self.sourceNamespace + "/" + self.setNamespace,
         "BATCHTAG": self.batchTag
     }
     Utils.postToLog(entry)
コード例 #8
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def getResources(self, offset=0, count=20):
     """Fetch one page of the endpoint's resource listing.

     ``offset`` and ``count`` are passed through as query parameters.
     Returns ``False`` when ``Utils.checkURI`` gives a falsy result for the
     URL, otherwise whatever ``Utils.getContent`` returns for it.
     """
     url = str(self.endpointURI + "resource")
     url = url + "?offset=" + str(offset) + "&count=" + str(count)
     if not Utils.checkURI(url):
         return False
     # The content is fetched once through Utils.getContent. An extra
     # urlopen() call used to open the same URL here without reading or
     # closing the response, which leaked the handle and doubled the
     # request.
     return Utils.getContent(url)
コード例 #9
0
 def run(self):
     """Fetch ``self.ListSource`` and submit its decoded content as a batch.

     A failure to fetch is re-raised as ``RSLoaderError``; decoding and
     ``addBatch`` errors propagate unchanged.
     """
     try:
         listing = Utils.getResponse(self.ListSource)
     except Exception as err:
         reason = "Could not load URLs from listSource. Error: " + str(err)
         raise RSLoaderError(reason, None, self.logger)
     self.addBatch(Utils.getJSONFromResponse(listing))
コード例 #10
0
 def __init__(self, sourceNamespace, setNamespace, opts, mode='latest'):
     """Set up the publisher: options, batch tag, logger and reader.

     Every key in ``opts`` is copied onto the instance as an attribute
     before ``opts`` is checked for the required ``targetURL`` entry.
     """
     for optName, optValue in opts.items():
         setattr(self, optName, optValue)
     Utils.validateRequired(opts, ['targetURL'])
     self.batchTag = Utils.getCurrentBatchTimestamp()
     self.logger = gl.gleanomaticLogger(sourceNamespace, setNamespace,
                                        self.batchTag)
     self.logger.info("Initializing RESTPublisher")
     readerOpts = {"batchTag": self.batchTag}
     self.reader = RSReader(sourceNamespace, setNamespace, readerOpts, mode)
コード例 #11
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def loadCapabilityList(self, sourceNamespace, setNamespace):
     """Return the ``urlset.url`` entry of the namespace's capability list.

     Returns ``False`` when ``Utils.checkURI`` gives a falsy result for the
     capability-list URL, and an empty list when the decoded document has
     no ``urlset`` / ``url`` entry.
     """
     url = (self.endpointURI + "/RS/" + str(sourceNamespace) + "/" +
            str(setNamespace) + "/capabilitylist.json")
     if not Utils.checkURI(url):
         return False
     data = Utils.getJSONFromResponse(Utils.getResponse(url))
     if 'urlset' in data and 'url' in data['urlset']:
         return data['urlset']['url']
     return []
コード例 #12
0
ファイル: OAILoader.py プロジェクト: dhenry314/gleanomatic
 def __init__(self, sourceNamespace, setNamespace, opts):
     """Initialise the base loader, then validate the OAI options.

     Raises ``RSLoaderError`` when ``OAISource`` or ``OAIMetaDataPrefix``
     is missing from ``opts`` (signalled by ``ValueError``) or when
     ``Utils.checkURI`` raises for the source's ``Identify`` URL.
     """
     super().__init__(sourceNamespace, setNamespace, opts)
     self.logger.info("initializing OAILoader")
     required = ['OAISource', 'OAIMetaDataPrefix']
     try:
         Utils.validateRequired(opts, required)
     except ValueError as err:
         raise RSLoaderError("Missing required parameter.", err,
                             self.logger)
     identifyURL = str(self.OAISource) + "?verb=Identify"
     try:
         Utils.checkURI(identifyURL)
     except Exception as err:
         raise RSLoaderError("OAISource url did not validate. ", err,
                             self.logger)
コード例 #13
0
 def __init__(self, sourceNamespace, setNamespace, opts):
     """Initialise the base loader, then validate the ``ListSource`` option.

     Raises ``RSLoaderError`` when ``ListSource`` is missing from ``opts``
     (signalled by ``ValueError``) or when ``Utils.checkURI`` raises for
     it.
     """
     super().__init__(sourceNamespace, setNamespace, opts)
     self.logger.info("initializing ListLoader")
     try:
         Utils.validateRequired(opts, ['ListSource'])
     except ValueError as err:
         raise RSLoaderError("Missing required parameter.", err,
                             self.logger)
     sourceURL = str(self.ListSource)
     try:
         Utils.checkURI(sourceURL)
     except Exception as err:
         raise RSLoaderError("ListSource url did not validate. ", err,
                             self.logger)
コード例 #14
0
def addFromBatch(datum,attempts=1):
    """Post one batch entry to the target's resource endpoint.

    ``datum`` is a ``||``-separated string whose first four fields are the
    resource URI, source namespace, set namespace and batch tag. A response
    that decodes to something truthy but lacks an ``ID`` key is retried
    after a 3 second pause until ``attempts`` exceeds 3. The outcome is
    printed and sent to ``Utils.postToLog``. Always returns ``True``.
    """
    fields = datum.split("||")
    resourceURI = str(appConfig.targetURI) + "/resource"
    namespace = str(fields[1]) + "/" + str(fields[2])
    while True:
        payload = {'uri': fields[0], 'sourceNamespace' : fields[1], 'setNamespace' : fields[2], 'batchTag': fields[3]}
        response = Utils.postRSData(resourceURI,payload)
        respJ = Utils.getJSONFromResponse(response)
        if respJ and 'ID' in respJ:
            print("Posted " + str(fields[0]))
            Utils.postToLog({"LEVEL":"INFO",
                             "MSG": "Posted " + str(fields[0]) + " to resource/" + str(respJ['ID']),
                             "NAMESPACE": namespace,
                             "BATCHTAG": str(fields[3])})
            return True
        # An empty response is final; a response without an ID is final
        # only once the attempt budget is spent.
        if not respJ or attempts > 3:
            print("Failed to post " + str(fields[0]))
            Utils.postToLog({"LEVEL":"WARNING",
                             "MSG": "Failed to post " + str(fields[0]),
                             "NAMESPACE": namespace,
                             "BATCHTAG": str(fields[3]),
                             "RESP": respJ})
            return True
        attempts = attempts + 1
        time.sleep(3)
コード例 #15
0
 def __init__(self, sourceNamespace, setNamespace, opts):
     """Initialise the base loader, then validate the OAI options.

     A failure in the base initialiser is re-raised as a plain
     ``Exception``. ``Utils.validateRequired`` errors propagate unchanged.
     When ``Utils.checkURI`` raises for the source's ``Identify`` URL the
     problem is logged and re-raised as ``ValueError``.
     """
     try:
         super().__init__(sourceNamespace, setNamespace, opts)
     except Exception as err:
         raise Exception("Could not start RSLoader. " + str(err))
     self.logger.info("initializing OAILoader")
     Utils.validateRequired(opts, ['OAISource', 'OAIMetaDataPrefix'])
     identifyURL = str(self.OAISource) + "?verb=Identify"
     try:
         Utils.checkURI(identifyURL)
     except Exception as err:
         self.logger.critical(
             self.msg("OAISource url did not validate. " + str(err)))
         raise ValueError("OAISource url did not validate. " + str(err))
コード例 #16
0
 def initRecord(self, resID):
     """Load resource ``resID`` and parse its OAI-PMH ``GetRecord`` payload.

     Sets ``self.resource``, ``self.logger``, ``self.mapper`` and
     ``self.header``; sets ``self.prefix`` when the map configuration has
     one and ``self.metadata`` when the record carries metadata. A record
     without metadata must have a header status of ``deleted``, in which
     case ``self.deleted`` is set to True. For records that are not
     deleted, ``self.resultRecord`` is initialised from
     ``MOHUBBaseRecord``.

     Returns the ``record`` element of the parsed document.

     Raises ``BadOAIRecord`` when the record element, its header, or the
     header status cannot be found, or when a metadata-less record has a
     status other than ``deleted``. Raises ``Exception`` when the content
     cannot be converted from XML.
     """
     resURL = str(self.config.targetURI) + "/resource/" + str(resID)
     resourceJSON = Utils.getContent(resURL)
     self.resource = Utils.jsonToDict(resourceJSON)
     self.logger = gl.gleanomaticLogger(self.resource['sourceNamespace'],
                                        self.resource['setNamespace'],
                                        'OAIMap')
     self.logger.info("Initializing record in OAIMap.")
     mapConfig = maps[self.resource["sourceNamespace"]][
         self.resource["setNamespace"]]
     if 'prefix' in mapConfig:
         self.prefix = mapConfig['prefix']
     self.mapper = self.getMapper(mapConfig["mapper"])
     url = str(self.config.targetURI) + "/content/" + str(resID)
     content = Utils.getContent(url)
     try:
         data = Utils.getDictFromXML(content)
     except Exception as e:
         self.logger.critical("Could not get dict from xml. ERROR: " +
                              str(e))
         raise Exception(str(e))
     # Look the record element up on its own. When this lookup shared a
     # try block with the header/metadata lookups, a missing element left
     # `record` unbound and the handler failed with UnboundLocalError
     # instead of BadOAIRecord.
     try:
         record = data["OAI-PMH"]["GetRecord"]["record"]
     except KeyError as e:
         self.logger.critical(
             "Could not find OAI-PMH GetRecord record in content.")
         raise BadOAIRecord(str(e))
     try:
         self.header = record["header"]
     except KeyError as e:
         self.logger.critical(
             "Could not find metadata or header in record.")
         raise BadOAIRecord(str(e))
     try:
         self.metadata = record["metadata"]
     except KeyError:
         # No metadata: only acceptable for a record flagged as deleted.
         try:
             status = self.header["status"]
         except KeyError:
             try:
                 status = self.header["@status"]
             except KeyError as e:
                 self.logger.critical("No status in header.")
                 raise BadOAIRecord(str(e))
         if status == 'deleted':
             self.deleted = True
         else:
             self.logger.critical("Unknown status type: " + str(status))
             raise BadOAIRecord("No metadata.  Unknown status: " +
                                str(status))
     # NOTE(review): self.deleted is read here but only ever set to True
     # in this method; presumably it is initialised elsewhere - confirm.
     if not self.deleted:
         mbr = MOHUBBaseRecord()
         self.resultRecord = mbr.getBaseRecord()
     return record
コード例 #17
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def addDump(self, batchTag, sourceNamespace, setNamespace):
     """Post a dump request for one batch to ``self.capabilityURI``.

     Returns the decoded response after passing it through
     ``self.convertToRSDomain``. A failure to post is re-raised as
     ``AddDumpException``.
     """
     payload = {
         'sourceNamespace': sourceNamespace,
         'setNamespace': setNamespace,
         'batchTag': batchTag
     }
     try:
         response = Utils.postRSData(self.capabilityURI, payload)
     except Exception as err:
         raise AddDumpException("Could not post dump.", err)
     decoded = Utils.getJSONFromResponse(response)
     return self.convertToRSDomain(decoded)
コード例 #18
0
 def __init__(self, sourceNamespace, setNamespace, opts, mode='latest'):
     """Set up the transformer: options, target loader, reader and logger.

     Every key in ``opts`` is copied onto the instance before ``opts`` is
     checked for ``transformName`` and ``targetSet``. ``targetSet`` is
     split on ``/`` into the target source and set namespaces. The reader
     shares the loader's batch tag and the instance reuses the loader's
     logger.
     """
     self.transformURI = appConfig.transformURI
     for optName, optValue in opts.items():
         setattr(self, optName, optValue)
     Utils.validateRequired(opts, ['transformName', 'targetSet'])
     # targetSet has the form "<sourceNamespace>/<setNamespace>".
     nsParts = self.targetSet.split('/')
     self.targetSourceNS = nsParts[0]
     self.targetSetNS = nsParts[1]
     self.loader = RSLoader(self.targetSourceNS, self.targetSetNS)
     readerOpts = {"batchTag": self.loader.batchTag}
     self.reader = RSReader(sourceNamespace, setNamespace, readerOpts, mode)
     self.logger = self.loader.logger
     self.logger.info("initializing Transformer")
コード例 #19
0
ファイル: OAILoader.py プロジェクト: dhenry314/gleanomatic
 def pullDynamicOAIByURL(self, url):
     """Harvest identifiers starting at ``url`` and submit them in batches.

     Each response is split on ``<identifier>`` tags; every identifier is
     turned into a ``GetRecord`` URL and the page's URLs are passed to
     ``self.addBatch``. Paging follows the resumption token reported by
     ``self.getResumptionToken`` until none is returned. A falsy ``url``
     does nothing.

     Raises ``RSLoaderError`` when the response carries an OAI error or
     when the same URL fails to load ``maxAttempts`` times in a row.
     """
     # Upper bound on consecutive fetch failures for one URL. Without a
     # bound a failing URL would be requested forever, because the URL is
     # not advanced on failure.
     maxAttempts = 3
     attempts = 0
     while url:
         self.logger.info("Pulling dynamic OAI from " + str(url))
         try:
             data = Utils.getContent(url)
         except Exception as e:
             self.logger.warning("Could not get content from " + str(url) +
                                 " ERROR: " + str(e))
             attempts += 1
             if attempts >= maxAttempts:
                 raise RSLoaderError(
                     "Could not get content from " + str(url), e,
                     self.logger)
             continue
         attempts = 0
         OAIerror = self.getError(data)
         if OAIerror:
             raise RSLoaderError(
                 "Could not pull OAI records. OAIError: " + str(OAIerror),
                 None, self.logger)
         rawIDs = data.split('<identifier>')
         # The first item is everything before the first identifier tag.
         del rawIDs[0]
         recordBase = str(
             self.OAISource) + "?verb=GetRecord&metadataPrefix=" + str(
                 self.OAIMetaDataPrefix) + "&identifier="
         records = [
             recordBase + str(rawID.split('</identifier>')[0])
             for rawID in rawIDs
         ]
         self.addBatch(records)
         rToken = self.getResumptionToken(data)
         if rToken:
             url = str(
                 self.OAISource
             ) + "?verb=ListIdentifiers&resumptionToken=" + str(rToken)
         else:
             url = None
コード例 #20
0
ファイル: RSLoader.py プロジェクト: dhenry314/gleanomatic
 def makeDump(self):
     """Request a dump for the current batch and register it as a capability.

     Returns ``False`` when ``self.createDump`` is falsy. Otherwise asks
     ``self.targetEndpoint.addDump`` for the dump, polls the value it
     returns with ``Utils.checkURI`` until the check succeeds, and returns
     the result of ``self.addCapability(zipURI, 'dump')``.

     Raises ``AddDumpException`` when the dump request fails or when the
     URI still does not check out after 60 retries.
     """
     if not self.createDump:
         return False
     try:
         contents = self.targetEndpoint.addDump(self.batchTag,
                                                self.sourceNamespace,
                                                self.setNamespace)
     except Exception as e:
         logger.critical(self.msg("Could not add dump."))
         raise AddDumpException("Could not add dump.", e)
     zipURI = contents
     # Allow up to 1 hour for zip creation: sleep 60 seconds between
     # checks and retry up to 60 times. The counter must live outside the
     # loop; resetting it on every pass would disable the limit.
     maxRetries = 60
     retries = 0
     while True:
         try:
             uriResponse = Utils.checkURI(zipURI)
         except Exception:
             uriResponse = None
         if uriResponse:
             logger.info("Found zipURI.")
             break
         # A falsy result counts as "not ready" just like an exception, so
         # the loop never spins without sleeping.
         retries = retries + 1
         if retries > maxRetries:
             logger.critical(
                 self.msg("Too many retries waiting for " + str(zipURI)))
             raise AddDumpException(
                 "Too many retries waiting for " + str(zipURI))
         time.sleep(60)
     result = self.addCapability(zipURI, 'dump')
     return result
コード例 #21
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def __init__(self, endpointURI):
     """Store the endpoint and derive the resource and capability URIs.

     A trailing slash is appended to ``endpointURI`` when missing. The
     resource URI is validated with ``Utils.checkURI``; if that raises,
     the failure is logged and re-raised as ``TargetURIException``.
     """
     logger.info("Initializing RSRestClient")
     # The derived URIs below are built by plain concatenation, so the
     # endpoint must end with a slash.
     hasTrailingSlash = endpointURI[-1] == "/"
     if not hasTrailingSlash:
         endpointURI = str(endpointURI) + "/"
     self.endpointURI = endpointURI
     self.resourceURI = str(self.endpointURI) + "resource"
     logger.info("Checking resourceURI: " + str(self.resourceURI))
     try:
         Utils.checkURI(self.resourceURI)
     except Exception as err:
         problem = "ResourceURI did not validate: " + str(self.resourceURI)
         logger.critical(problem + " ERROR:" + str(err))
         raise TargetURIException(problem, err)
     self.capabilityURI = str(self.endpointURI) + "capability"
コード例 #22
0
 def initRecord(self):
     """Load the resource ``self.resID`` and its JSON content.

     Sets ``self.resource`` and ``self.logger``, fetches the content
     record, and initialises ``self.resultRecord`` from ``MOHUBBaseRecord``
     unless ``self.deleted`` is truthy. Returns the content record. A
     failure to fetch or decode the content is re-raised as ``gError``.
     """
     base = str(self.config.targetURI)
     resourceJSON = Utils.getContent(base + "/resource/" + str(self.resID))
     self.resource = Utils.jsonToDict(resourceJSON)
     self.logger = gl.gleanomaticLogger(self.resource['sourceNamespace'],
                                        self.resource['setNamespace'],
                                        'MimsyMap')
     self.logger.info("Initializing record in MimsyMap.")
     contentURL = str(self.config.targetURI) + "/content/" + str(self.resID)
     try:
         record = Utils.getJSONFromResponse(Utils.getResponse(contentURL))
     except Exception as err:
         raise gError("Could not get data from url",err,self.logger)
     if not self.deleted:
         self.resultRecord = MOHUBBaseRecord().getBaseRecord()
     return record
コード例 #23
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def loadManifestIDs(self, sourceNamespace, setNamespace, batchTag):
     """Return the resource IDs listed in a batch manifest.

     Returns ``False`` when ``Utils.checkURI`` gives a falsy result for the
     manifest URL. Otherwise each non-blank manifest line contributes one
     ID: the text after the last ``><`` on the line, with ``/resource/``
     and ``>`` removed.
     """
     url = self.endpointURI + "/static/" + str(sourceNamespace) + "/" + str(
         setNamespace) + "/" + str(batchTag) + "/manifest"
     if not Utils.checkURI(url):
         return False
     ids = []
     contents = Utils.getContent(url)
     for line in contents.split("\n"):
         line = line.strip()
         # Blank lines (for example the one after a trailing newline)
         # carry no ID; keeping them would add an empty string to the
         # result.
         if not line:
             continue
         resourceID = line.split('><')[-1]
         resourceID = resourceID.replace('/resource/', '').replace('>', '')
         ids.append(resourceID)
     return ids
コード例 #24
0
 def __init__(self, sourceNamespace, setNamespace, opts):
     """Initialise the base loader and connect to the Elasticsearch index.

     Raises ``RSLoaderError`` when a required option is missing from
     ``opts`` (signalled by ``ValueError``) or when the index
     ``self.ESIndex`` does not exist. ``self.baseRecordURL`` is set to
     ``http://<host>:<port>/<index>/<type>/``.
     """
     super().__init__(sourceNamespace, setNamespace, opts)
     self.logger.info("initializing ESLoader")
     required = ['ESHost', 'ESPort', 'ESIndex', 'ESType', 'body']
     try:
         Utils.validateRequired(opts, required)
     except ValueError as err:
         raise RSLoaderError("Missing required parameter.", err,
                             self.logger)
     # NOTE(review): self.timeout is read here but is not among the
     # validated options; presumably supplied elsewhere - confirm.
     node = {'host': self.ESHost, 'port': self.ESPort}
     self.es = Elasticsearch([node], timeout=self.timeout)
     if not self.es.indices.exists(index=self.ESIndex):
         raise RSLoaderError("ES Index " + self.ESIndex + " not exist.")
     hostPort = str(self.ESHost) + ':' + str(self.ESPort)
     pathParts = [hostPort, str(self.ESIndex), str(self.ESType)]
     self.baseRecordURL = 'http://' + '/'.join(pathParts) + '/'
コード例 #25
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def loadResourceListIndex(self, sourceNamespace, setNamespace):
     """Return the JSON resource-list URLs from the resource list index.

     Returns ``False`` when ``Utils.checkURI`` gives a falsy result for the
     index URL. Otherwise collects the ``@href`` of every sitemap entry
     whose ``rs:ln`` link has an ``@type`` equal to ``application/json``
     (compared case-insensitively).
     """
     url = (self.endpointURI + "/RS/" + str(sourceNamespace) + "/" +
            str(setNamespace) + "/resourcelistindex.json")
     if not Utils.checkURI(url):
         return False
     data = Utils.getJSONFromResponse(Utils.getResponse(url))
     if 'sitemapindex' not in data or 'sitemap' not in data['sitemapindex']:
         return []
     jsonLinks = []
     for entry in data['sitemapindex']['sitemap']:
         if 'rs:ln' not in entry:
             continue
         link = entry['rs:ln']
         if '@type' not in link:
             continue
         if str(link['@type']).lower() == 'application/json':
             jsonLinks.append(link['@href'])
     return jsonLinks
コード例 #26
0
 def __init__(self,sourceNamespace,setNamespace,opts=None):
     """Set up the loader: batch tag, logger, target client and options.

     ``opts`` is an optional mapping; each of its keys is copied onto the
     instance as an attribute after the defaults are set, so an option
     can override any of the attributes assigned here.
     """
     self.batchTag = Utils.getCurrentBatchTimestamp()
     self.logger = gl.gleanomaticLogger(sourceNamespace,setNamespace,self.batchTag)
     self.logger.info("Initializing RSLoader")
     self.targetURI = appConfig.targetURI
     self.targetEndpoint = rc.RSRestClient(self.targetURI,self.logger)
     self.sourceNamespace = sourceNamespace
     self.setNamespace = setNamespace
     self.createDump = appConfig.createDump

     # The default is None rather than a shared mutable dict; a missing
     # or empty mapping simply sets nothing.
     if opts:
         for key, value in opts.items():
             setattr(self, key, value)
コード例 #27
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def loadResourceListIDs(self, url):
     """Return the resource IDs referenced by one resource list.

     ``url`` is first passed through ``self.convertToRSDomain``. Returns
     ``False`` when ``Utils.checkURI`` gives a falsy result for it.
     Otherwise collects, for every entry whose ``rs:ln`` link has
     ``rel`` equal to ``describedby`` (case-insensitive), the link's
     ``href`` with ``/resource/`` removed.
     """
     url = self.convertToRSDomain(url)
     if not Utils.checkURI(url):
         return False
     data = Utils.getJSONFromResponse(Utils.getResponse(url))
     if 'urlset' not in data or 'url' not in data['urlset']:
         return []
     found = []
     for entry in data['urlset']['url']:
         if 'rs:ln' not in entry:
             continue
         link = entry['rs:ln']
         if 'rel' not in link:
             continue
         if str(link['rel']).lower() == 'describedby':
             found.append(link['href'].replace('/resource/', ''))
     return found
コード例 #28
0
 def loadIDs(self, limit=None):
     """Fill ``self.resourceIDs`` according to ``self.mode``.

     ``all``: loads the resource list index into ``self.index`` and
     appends the IDs from ``self.getNextIDs()`` until it returns nothing
     or, when ``limit`` is given, until more than ``limit`` IDs are held.

     ``latest``: reads the capability list (truncated to ``limit`` entries
     when given), derives a numeric batch tag from each entry, and appends
     the manifest IDs of the highest tag. Nothing is appended when no tag
     can be derived or the manifest cannot be loaded.

     Any other mode prints a notice. Always returns ``True``.
     """
     if self.mode == 'all':
         self.index = self.targetEndpoint.loadResourceListIndex(
             self.sourceNamespace, self.setNamespace)
         while True:
             ids = self.getNextIDs()
             if not ids:
                 break
             self.resourceIDs.extend(ids)
             if limit and len(self.resourceIDs) > limit:
                 break
     elif self.mode == 'latest':
         batchTags = []
         capURLs = self.targetEndpoint.loadCapabilityList(
             self.sourceNamespace, self.setNamespace)
         # The capability list may come back falsy (False or empty); only
         # slice and iterate a real list.
         if capURLs and limit:
             capURLs = capURLs[:limit]
         for record in capURLs or []:
             thisTag = None
             if 'rs:md' in record:
                 if '@until' in record['rs:md']:
                     thisTag = Utils.getBatchTimestamp(
                         record['rs:md']['@until'])
             elif '.zip' in record['loc']:
                 # Otherwise treat it as a zip file named by batch tag.
                 filename = record['loc'].split('/')[-1]
                 thisTag = filename.replace('.zip', '')
             if thisTag:
                 batchTags.append(int(thisTag))
         # Without any batch tag there is no manifest to ask for.
         if batchTags:
             latestTag = max(batchTags)
             ids = self.targetEndpoint.loadManifestIDs(
                 self.sourceNamespace, self.setNamespace, latestTag)
             # The manifest loader may return a falsy value instead of a
             # list; in that case there is nothing to add.
             if ids:
                 self.resourceIDs.extend(ids)
     else:
         print("Unknown mode: " + str(self.mode))
     return True
コード例 #29
0
 def pullDynamicOAI(self):
     """Harvest identifiers from ``self.OAISource`` and submit them in batches.

     Starts with a ``ListIdentifiers`` request for
     ``self.OAIMetaDataPrefix`` (restricted to ``self.OAIset`` when set)
     and follows resumption tokens until none is returned. Each page's
     identifiers are turned into ``GetRecord`` URLs and passed to
     ``self.addBatch``. An OAI error in a response is logged and raised as
     ``ValueError``.
     """
     source = str(self.OAISource)
     url = source + "?verb=ListIdentifiers&metadataPrefix=" + str(
         self.OAIMetaDataPrefix)
     if self.OAIset:
         url = url + "&set=" + str(self.OAIset)
     while url:
         logger.info("Pulling dynamic OAI from " + str(url))
         data = Utils.getContent(url)
         OAIerror = self.getError(data)
         if OAIerror:
             logger.critical(
                 self.msg("Could not pull OAI records. Error: " +
                          str(OAIerror)))
             raise ValueError("Could not pull OAI records. ERROR:  " +
                              str(OAIerror))
         # Everything before the first identifier tag is dropped.
         chunks = data.split('<identifier>')[1:]
         records = []
         for chunk in chunks:
             identifier = chunk.split('</identifier>')[0]
             records.append(
                 str(self.OAISource) + "?verb=GetRecord&metadataPrefix=" +
                 str(self.OAIMetaDataPrefix) + "&identifier=" +
                 str(identifier))
         self.addBatch(records)
         rToken = self.getResumptionToken(data)
         if not rToken:
             url = None
             continue
         url = str(self.OAISource
                   ) + "?verb=ListIdentifiers&resumptionToken=" + str(rToken)
コード例 #30
0
ファイル: RSRestClient.py プロジェクト: dhenry314/gleanomatic
 def deleteResource(self, uri):
     """Delete the resource at ``uri`` and return the response body.

     Raises ``Exception`` when ``Utils.deleteContent`` returns a falsy
     response; otherwise returns the result of ``response.read()``.
     """
     response = Utils.deleteContent(uri)
     if response:
         return response.read()
     raise Exception("Could not delete resource at " + str(uri))