def setUp(self):
    """Create the on-disk fixture and the RepositoryStatus under test.

    Below ``self.tempdir`` a "state" and a "log" directory are filled with
    invalid-record diagnostics and ``*_invalid.ids`` files for the domain
    "adomain". A CallTrace observer named "HarvesterData" with stubbed
    repository/group lookups is attached to ``self.status``.
    """
    def writeFile(path, data):
        # Close each fixture file explicitly so its content is flushed
        # before the tests read it, instead of relying on garbage collection.
        with open(path, 'w') as fp:
            fp.write(data)

    SeecrTestCase.setUp(self)
    self.stateDir = join(self.tempdir, "state")
    self.logDir = join(self.tempdir, "log")
    self.domainId = "adomain"
    makedirs(join(self.stateDir, self.domainId))
    repoId1LogDir = join(self.logDir, self.domainId, "invalid", "repoId1")
    repoId2LogDir = join(self.logDir, self.domainId, "invalid", escapeFilename("repoId/2"))
    makedirs(repoId1LogDir)
    makedirs(repoId2LogDir)

    # Diagnostics for the invalid records, one file per record id.
    writeFile(join(repoId1LogDir, "invalidId1"), "<diagnostic>ERROR1</diagnostic>")
    writeFile(join(repoId1LogDir, "invalidId&2"), "<diagnostic>ERROR2</diagnostic>")
    writeFile(join(repoId2LogDir, escapeFilename("invalidId/3")), "<diagnostic>ERROR3</diagnostic>")

    # Per-repository lists of invalid ids; repoId3 has an empty list.
    domainStateDir = join(self.stateDir, self.domainId)
    writeFile(join(domainStateDir, "repoId1_invalid.ids"), "invalidId1\ninvalidId&2")
    writeFile(join(domainStateDir, escapeFilename("repoId/2_invalid.ids")), "invalidId/3")
    writeFile(join(domainStateDir, "repoId3_invalid.ids"), "")

    self.status = RepositoryStatus(self.logDir, self.stateDir)
    observer = CallTrace("HarvesterData")
    observer.returnValues["getRepositoryGroupIds"] = ["repoGroupId1", "repoGroupId2"]

    def getRepositoryIds(domainId, repositoryGroupId):
        if repositoryGroupId == "repoGroupId1":
            return ["repoId1", "repoId/2"]
        return ["repoId3", "anotherRepoId"]
    observer.methods["getRepositoryIds"] = getRepositoryIds

    def getRepositoryGroupId(domainId, repositoryId):
        return 'repoGroupId1' if repositoryId in ['repoId1', 'repoId/2'] else 'repoGroupId2'
    observer.methods["getRepositoryGroupId"] = getRepositoryGroupId
    self.status.addObserver(observer)
    def setUp(self):
        """Prepare the state/log fixture directories and a stubbed observer.

        Writes three diagnostic files and three ``*_invalid.ids`` files for
        domain "adomain", then creates ``self.status`` and attaches a
        CallTrace("HarvesterData") observer with canned lookup answers.
        """
        SeecrTestCase.setUp(self)
        self.domainId = "adomain"
        self.stateDir = mkdir(self.tempdir, "state")
        mkdir(self.stateDir, self.domainId)
        self.logDir = mkdir(self.tempdir, "log")
        firstRepoLogDir = mkdir(self.logDir, self.domainId, "invalid", "repoId1")
        secondRepoLogDir = mkdir(self.logDir, self.domainId, "invalid",
                                 escapeFilename("repoId/2"))

        # (directory, file name, content) of each invalid-record diagnostic.
        diagnostics = [
            (firstRepoLogDir, "invalidId1", "<diagnostic>ERROR1</diagnostic>"),
            (firstRepoLogDir, "invalidId&2", "<diagnostic>ERROR2</diagnostic>"),
            (secondRepoLogDir, escapeFilename("invalidId/3"),
             "<diagnostic>ERROR3</diagnostic>"),
        ]
        for directory, name, content in diagnostics:
            _writeFile(directory, name, data=content)

        # (file name, content) of each per-repository invalid-ids list.
        invalidIdLists = [
            ("repoId1_invalid.ids", "invalidId1\ninvalidId&2"),
            (escapeFilename("repoId/2_invalid.ids"), "invalidId/3"),
            ("repoId3_invalid.ids", ""),
        ]
        for name, content in invalidIdLists:
            _writeFile(self.stateDir, self.domainId, name, data=content)

        self.status = RepositoryStatus(self.logDir, self.stateDir)
        observer = CallTrace("HarvesterData")
        observer.returnValues["getRepositoryGroupIds"] = [
            "repoGroupId1",
            "repoGroupId2",
        ]

        def getRepositoryIds(domainId, repositoryGroupId):
            if repositoryGroupId != "repoGroupId1":
                return ["repoId3", "anotherRepoId"]
            return ["repoId1", "repoId/2"]

        def getRepositoryGroupId(domainId, repositoryId):
            if repositoryId in ['repoId1', 'repoId/2']:
                return 'repoGroupId1'
            return 'repoGroupId2'

        observer.methods["getRepositoryIds"] = getRepositoryIds
        observer.methods["getRepositoryGroupId"] = getRepositoryGroupId
        self.status.addObserver(observer)
Beispiel #3
0
 def assertName(self, name):
     """Assert that the escaped form of name can be created as a file in /tmp.

     The file is removed again whether or not the assertion holds.
     """
     path = join('/tmp', escapeFilename(name))
     with open(path, 'w'):
         pass  # only create (or truncate) the file
     try:
         self.assertTrue(isfile(path))
     finally:
         remove(path)
Beispiel #4
0
 def _getRepositoryJson(self, domainId, repositoryId):
     """Return the path of the repository's JSON file, or None if it is absent.

     The file name is the escaped form of "<domainId>.<repositoryId>.repository"
     inside /var/lib/meresco-harvester/data.
     """
     name = escapeFilename("%s.%s.repository" % (domainId, repositoryId))
     candidate = join('/var/lib/meresco-harvester/data', name)
     return candidate if isfile(candidate) else None
def prepareOaiPmh(dataDirs, tempDir, storage, batchSize):
    """Load OAI dump directories into a new OaiJazz and return an OaiPmh tree.

    dataDirs  -- directories that iterOaiData can read; each yields
                 (action, filename, setSpecs) with action 'ADD' or 'DEL'
    tempDir   -- directory passed to OaiJazz
    storage   -- receives addFile(filename, path) for every entry and is
                 observed by the returned OaiPmh
    batchSize -- passed through to OaiPmh

    Returns the tree built by be(): IllegalFromFix -> OaiPmh ->
    (oaiJazz, oaiSuspendRegister, storage).
    """
    # A formatted call prints the same text on Python 2 and Python 3; the
    # bare print statement is a SyntaxError on Python 3.
    print('DATADIRS %s' % (dataDirs,))
    oaiSuspendRegister = SuspendRegister()
    oaiJazz = OaiJazz(tempDir)
    oaiJazz.addObserver(oaiSuspendRegister)
    oaiJazzOperations = {
        'ADD': oaiJazz.addOaiRecord,
        'DEL': oaiJazz.deleteOaiRecord
    }
    for dataDir in dataDirs:
        for action, filename, setSpecs in iterOaiData(dataDir):
            # filename is "<identifier>.<metadataPrefix>"; split on the last dot.
            identifier, metadataPrefix = filename.rsplit('.', 1)
            oaiJazzOperations[action](
                identifier=identifier,
                setSpecs=setSpecs,
                metadataPrefixes=[metadataPrefix],
            )
            storage.addFile(filename, join(dataDir, escapeFilename(filename)))
            # NOTE(review): presumably keeps consecutive record stamps distinct - confirm.
            sleep(0.000001)
    oaiJazz.commit()

    oaiPmh = be(
        (IllegalFromFix(),
            (OaiPmh(repositoryName='Mock', adminEmail='*****@*****.**', supportXWait=True, batchSize=batchSize),
                (oaiJazz,),
                (oaiSuspendRegister,),
                (storage,),
            )
        )
    )
    return oaiPmh
Beispiel #6
0
def dumpOai(port,
            path,
            oaiDumpDir,
            metadataPrefix,
            set_=None,
            host=None,
            limit=None,
            append=False):
    """Harvest an OAI-PMH endpoint into a dump directory.

    port, path, host -- form the base URL http://<host>:<port><path>;
                        host defaults to 127.0.0.1
    oaiDumpDir       -- target directory; removed and recreated unless append
    metadataPrefix   -- prefix to harvest, also the dumped files' suffix
    set_             -- optional set passed to iterateOaiPmh
    limit            -- maximum number of items (None for all)
    append           -- keep the existing directory and append to 'oai.ids'

    Each item adds a line "<ADD|DEL> <identifier>.<prefix> |<setSpecs>|" to
    'oai.ids'; metadata of non-deleted items is written to its own file.
    """
    host = host or '127.0.0.1'
    baseurl = 'http://%s:%s%s' % (host, port, path)
    if not append:
        if isdir(oaiDumpDir):
            rmtree(oaiDumpDir)
        makedirs(oaiDumpDir)
    with open(join(oaiDumpDir, 'oai.ids'), 'a') as ids:
        oaiItems = iterateOaiPmh(baseurl=baseurl,
                                 metadataPrefix=metadataPrefix,
                                 set=set_)
        for oaiItem in islice(oaiItems, limit):
            filename = '%s.%s' % (oaiItem.identifier, metadataPrefix)
            action = 'DEL' if oaiItem.deleted else 'ADD'
            ids.write('%s %s |%s|\n' %
                      (action, filename, '|'.join(sorted(oaiItem.setSpecs))))
            if not oaiItem.deleted:
                # Close each record file right away; a long harvest would
                # otherwise accumulate open handles.
                with open(join(oaiDumpDir, escapeFilename(filename)), 'w') as recordFile:
                    recordFile.write(
                        lxmltostring(oaiItem.metadata, pretty_print=True))
    print("Oai dump created in %s" % oaiDumpDir)
Beispiel #7
0
 def add(self, uploadid):
     """Store the escaped uploadid in memory and in the ids file, at most once."""
     escapedId = escapeFilename(uploadid)
     if escapedId not in self._ids:
         self._ids.append(escapedId)
         self._idsfile.write(escapedId + "\n")
         # Flush immediately so the id is handed to the file after every add.
         self._idsfile.flush()
 def getRunningStatesForDomain(self, domainId):
     """Return the running states of all repositories in a domain.

     For every repository of every group in domainId whose
     "<repoId>.running" state file exists, the JSON content is merged with
     {'repositoryId': repoId}. The result is sorted on 'changedate',
     newest first.
     """
     states = []
     for groupId in self.call.getRepositoryGroupIds(domainId=domainId):
         for repoId in self.call.getRepositoryIds(domainId=domainId, repositoryGroupId=groupId):
             filepath = join(self._statePath, domainId, escapeFilename("%s.running" % repoId))
             if not isfile(filepath):
                 continue
             # Close each state file as soon as it is parsed instead of
             # leaving one open handle per repository.
             with open(filepath) as fp:
                 state = jsonLoad(fp)
             states.append(mergeDicts(state, {'repositoryId': repoId}))
     return sorted(states, key=lambda d: d['changedate'], reverse=True)
    def _invalidCount(self, domainId, repositoryId):
        """Return the number of lines in the repository's invalid-ids file.

        Returns 0 when "<repositoryId>_invalid.ids" does not exist.
        """
        idsFilename = escapeFilename("%s_invalid.ids" % repositoryId)
        idsPath = join(self._statePath, domainId, idsFilename)
        if isfile(idsPath):
            with open(idsPath) as fp:
                return len(fp.readlines())
        return 0
Beispiel #10
0
def writeIds(filename, ids):
    """Write every id, filename-escaped, on its own line to filename.

    The file is truncated first and closed afterwards, also on error.
    """
    with open(filename, 'w') as out:
        for anId in ids:
            out.write(escapeFilename(anId))
            out.write('\n')
 def _getStorage(self, name, mayCreate=False):
     """Return the SequentialStorage for name, creating the object on demand.

     A storage not yet cached is only instantiated when its directory exists
     or mayCreate is true; otherwise KeyError(name) is raised.
     """
     cached = self._storage.get(name)
     if cached is not None:
         return cached
     directory = join(self._directory, escapeFilename(name))
     if not isdir(directory) and not mayCreate:
         raise KeyError(name)
     created = SequentialStorage(directory)
     self._storage[name] = created
     return created
 def invalidRecords(self, domainId, repositoryId):
     """Return the invalid record ids of a repository, last listed first.

     Reads "<repositoryId>_invalid.ids" from the domain's state directory,
     skips blank lines, unescapes each line and drops its trailing newline.
     Returns an empty list when the file does not exist, otherwise a
     reversed iterator over the ids.
     """
     invalidFile = join(self._statePath, domainId, escapeFilename("%s_invalid.ids" % repositoryId))
     if not isfile(invalidFile):
         return []
     # Read everything while the file is open so the handle is closed
     # deterministically; the result only walks the in-memory list.
     with open(invalidFile) as fp:
         unescaped = [unescapeFilename(line) for line in fp if line.strip()]
     # endswith() also copes with an empty string, unlike indexing x[-1].
     return reversed([x[:-1] if x.endswith('\n') else x for x in unescaped])
Beispiel #13
0
def writeIds(filename, ids):
    """Persist ids to filename, one escaped id per line.

    An empty or None ids removes the file; a file that is already absent is
    not an error. Otherwise the ids are written to "<filename>.new", which
    then replaces filename, so a partially written file never takes the
    place of the previous one.
    """
    path = pathlib.Path(filename)
    if not ids:
        try:
            path.unlink()
        except FileNotFoundError:
            # Nothing stored yet: the desired end state is already reached.
            pass
        return
    idfilenew = path.with_name(path.name + '.new')
    with idfilenew.open('w') as fp:
        for anId in ids:
            fp.write('{}\n'.format(escapeFilename(anId)))
    # replace() overwrites an existing target on every platform, whereas
    # rename() fails on Windows when the target exists.
    idfilenew.replace(path)
 def invalidRecords(self, domainId, repositoryId):
     """Return the repository's invalid record ids, last listed first.

     Blank lines in "<repositoryId>_invalid.ids" are skipped; every other
     line is unescaped and stripped of one trailing newline. Returns an
     empty list when the file is missing, otherwise a reversed iterator.
     """
     idsFilename = escapeFilename("%s_invalid.ids" % repositoryId)
     idsPath = join(self._statePath, domainId, idsFilename)
     if not isfile(idsPath):
         return []
     recordIds = []
     with open(idsPath) as fp:
         for line in fp:
             if not line.strip():
                 continue
             recordId = unescapeFilename(line)
             if recordId[-1] == '\n':
                 recordId = recordId[:-1]
             recordIds.append(recordId)
     return reversed(recordIds)
Beispiel #15
0
 def delete(self, anUpload):
     """Process the deletion of anUpload on the file target.

     With an OAI envelope the output of _createOutput is written to the
     record's file. Without one the record's file is removed if present and
     the escaped id is appended to 'deleted_records' in the target path.
     The deletion is logged in both cases.
     """
     filename = self._filenameFor(anUpload)
     if self._target.oaiEnvelope:
         xmlResult = self._createOutput(anUpload)
         with open(filename, 'w') as fd:
             fd.write(lxmltostring(xmlResult))
     else:
         if os.path.isfile(filename):
             os.remove(filename)
         deletedRecordsPath = os.path.join(self._target.path, 'deleted_records')
         with open(deletedRecordsPath, 'a') as f:
             f.write('%s\n' % escapeFilename(anUpload.id))
     self._logDelete(anUpload.id)
    def getRunningStatesForDomain(self, domainId):
        """Return the running states of a domain's repositories, newest first.

        Each existing "<repoId>.running" file is parsed as JSON and merged
        with {'repositoryId': repoId}; the list is ordered on 'changedate',
        descending.
        """
        def runningStates():
            for groupId in self.call.getRepositoryGroupIds(domainId=domainId):
                repoIds = self.call.getRepositoryIds(
                    domainId=domainId, repositoryGroupId=groupId)
                for repoId in repoIds:
                    runningFile = join(self._statePath, domainId,
                                       escapeFilename("%s.running" % repoId))
                    if not isfile(runningFile):
                        continue
                    with open(runningFile) as fp:
                        state = jsonLoad(fp)
                    yield mergeDicts(state, {'repositoryId': repoId})

        states = list(runningStates())
        states.sort(key=lambda d: d['changedate'], reverse=True)
        return states
    def delete(self, anUpload):
        """Handle the deletion of anUpload and log it.

        Without an OAI envelope the record's file is removed when it exists
        and the escaped upload id is appended to "deleted_records" in the
        target path. With an envelope the result of _createOutput is written
        to the record's file instead.
        """
        filename = self._filenameFor(anUpload)
        if not self._target.oaiEnvelope:
            if os.path.isfile(filename):
                os.remove(filename)
            deletedLogPath = os.path.join(self._target.path, "deleted_records")
            with open(deletedLogPath, "a") as deletedLog:
                deletedLog.write("%s\n" % escapeFilename(anUpload.id))
        else:
            xmlResult = self._createOutput(anUpload)
            with open(filename, "w") as recordFile:
                recordFile.write(lxmltostring(xmlResult))

        self._logDelete(anUpload.id)
Beispiel #18
0
def getRssLogger(repositoryId, logfileDir):
    """Return the logger named repositoryId, configuring it on first use.

    A logger that already has handlers is returned unchanged. Otherwise its
    level is set to WARNING and a RotatingFileHandler is added that writes to
    the escaped repositoryId inside logfileDir, bounded by MAXLOGSIZE and
    BACKUPCOUNT, with lines formatted as "<timestamp> <message>".
    """
    logger = logging.getLogger(repositoryId)
    if logger.handlers:
        return logger

    # First request for this name: attach the rotating file handler.
    logger.setLevel(logging.WARNING)
    logPath = join(logfileDir, escapeFilename(repositoryId))
    handler = logging.handlers.RotatingFileHandler(
        logPath, maxBytes=MAXLOGSIZE, backupCount=BACKUPCOUNT)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(message)s", "%Y-%m-%dT%H:%M:%SZ"))
    logger.addHandler(handler)
    return logger
Beispiel #19
0
    def __init__(self, stateDir, logDir, name):
        """Set up the state and log locations for the repository called name.

        stateDir -- directory for the state files; created when missing
        logDir   -- directory for the logs; created when missing
        name     -- repository name; its escaped form prefixes every state file
        """
        self._statePath = pathlib.Path(stateDir)
        self.logPath = pathlib.Path(logDir)
        for directory in (self._statePath, self.logPath):
            directory.mkdir(parents=True, exist_ok=True)
        esc_name = escapeFilename(name)

        def stateFile(suffix):
            # Path of "<escaped name><suffix>" inside the state directory.
            return self._statePath / '{}{}'.format(esc_name, suffix)

        self.invalidLogPath = self.logPath / INVALID_DATA_MESSAGES_DIR / esc_name
        self._name = name

        self._ids = Ids(stateFile('.ids'))
        self._invalidIds = Ids(stateFile('_invalid.ids'))
        self._oldIds = Ids(stateFile('.ids.old'))

        self._statsfilepath = stateFile('.stats')
        self._forceFinalNewlineOnStatsFile()
        self._resumptionFilepath = stateFile('.next')
        self._runningFilepath = stateFile('.running')
        self._countFilepath = stateFile('.count')

        # Defaults that are in place before _readState() runs.
        self.from_ = None
        self.token = None
        self._counts = None
        self.lastSuccessfulHarvest = None
        self._readState()
        self._statsfile = None
Beispiel #20
0
 def _invalidDataMessageFilePath(self, uploadid):
     """Return the invalid-data message path for "<repositoryId>:<recordId>".

     Only the record id (the part after the first colon) is used, escaped,
     below the state's invalidLogPath.
     """
     _, recordId = uploadid.split(":", 1)
     invalidLogPath = self._state.invalidLogPath
     return invalidLogPath / escapeFilename(recordId)
Beispiel #21
0
 def add(self, uploadid):
     """Remember uploadid and append its escaped form to the ids file, once."""
     if uploadid not in self._ids:
         self._ids.append(uploadid)
         line = '{}\n'.format(escapeFilename(uploadid))
         self._idsfile.write(line)
         self._idsfile.flush()
 def getInvalidRecord(self, domainId, repositoryId, recordId):
     """Parse and return the stored invalid-data message of a record.

     The message is read from
     <logPath>/<domainId>/<INVALID_DATA_MESSAGES_DIR>/<repositoryId>/<recordId>,
     with repositoryId and recordId filename-escaped.
     """
     recordPath = join(self._logPath, domainId, INVALID_DATA_MESSAGES_DIR,
                       escapeFilename(repositoryId),
                       escapeFilename(recordId))
     with open(recordPath) as fp:
         return parse(fp)
 def getInvalidRecord(self, domainId, repositoryId, recordId):
     """Parse and return the stored invalid-data message of a record.

     The message is read from
     <logPath>/<domainId>/<INVALID_DATA_MESSAGES_DIR>/<repositoryId>/<recordId>,
     with repositoryId and recordId filename-escaped.
     """
     invalidDir = join(self._logPath, domainId, INVALID_DATA_MESSAGES_DIR)
     # Parse inside a with-block so the file handle is closed right away.
     with open(join(invalidDir, escapeFilename(repositoryId), escapeFilename(recordId))) as fp:
         return parse(fp)
Beispiel #24
0
 def _invalidDataMessageFilePath(self, uploadid):
     """Return the invalid-data message path for "<repositoryId>:<recordId>".

     Both parts are filename-escaped and placed below
     <logDir>/<INVALID_DATA_MESSAGES_DIR>.
     """
     repositoryId, recordId = uploadid.split(":", 1)
     pathParts = [
         self._logDir,
         INVALID_DATA_MESSAGES_DIR,
         escapeFilename(repositoryId),
         escapeFilename(recordId),
     ]
     return join(*pathParts)
Beispiel #25
0
 def remove(self, uploadid):
     """Drop the escaped uploadid from the in-memory ids when it is present.

     After a removal close() and open() are called.
     NOTE(review): presumably this rewrites the ids file - confirm in close()/open().
     """
     escapedId = escapeFilename(uploadid)
     if escapedId not in self._ids:
         return
     self._ids.remove(escapedId)
     self.close()
     self.open()
 def _invalidCount(self, domainId, repositoryId):
     """Return the number of lines in the repository's invalid-ids file.

     Returns 0 when "<repositoryId>_invalid.ids" does not exist.
     """
     invalidFile = join(self._statePath, domainId, escapeFilename("%s_invalid.ids" % repositoryId))
     if not isfile(invalidFile):
         return 0
     # Count inside a with-block so the handle is closed deterministically.
     with open(invalidFile) as fp:
         return len(fp.readlines())
Beispiel #27
0
def dna(reactor, portNumber, config, tempDir, batchSize):
    """Build the observable tree of a mock OAI-PMH HTTP server.

    reactor, portNumber -- passed to ObservableHttpServer
    config    -- iterable of dicts with a 'path' (URL path served) and
                 'dirs' (dump directories readable by iterOaiData)
    tempDir   -- parent directory for one OaiJazz per config entry
    batchSize -- passed to every OaiPmh

    Every config entry gets its own OaiJazz/OaiPmh pair behind a PathFilter
    for its path (excluding '/ready'); all entries share one DataStorage.
    The returned tree also answers '/ready' with the text 'yes'.
    """
    # A formatted call prints the same text on Python 2 and Python 3; the
    # bare print statement is a SyntaxError on Python 3.
    print('Config %s' % (config,))
    root = HandleRequestLog()

    storage = DataStorage()
    for data in config:
        # Directory name for this entry: its URL path without the slashes.
        oaiName = ''.join(data['path'].split('/'))
        oaiSuspendRegister = SuspendRegister()
        try:
            oaiJazz = OaiJazz(join(tempDir, oaiName), preciseDatestamp=True)  # needed for backwards compatibility with meresco-oai versions preceding 5.16
        except TypeError:
            oaiJazz = OaiJazz(join(tempDir, oaiName))
        oaiJazz = be(
            (oaiJazz,
                (oaiSuspendRegister,)
            )
        )
        oaiJazzOperations = {
            'ADD': oaiJazz.addOaiRecord,
            'DEL': oaiJazz.deleteOaiRecord
        }

        for directory in data['dirs']:
            for action, filename, setSpecs in iterOaiData(directory):
                # filename is "<identifier>.<metadataPrefix>"; split on the last dot.
                identifier, metadataPrefix = filename.rsplit('.', 1)
                oaiJazzOperations[action](
                    identifier=identifier,
                    setSpecs=setSpecs,
                    metadataPrefixes=[metadataPrefix],
                )
                storage.addFile(filename, join(directory, escapeFilename(filename)))
                # NOTE(review): presumably keeps consecutive record stamps distinct - confirm.
                sleep(0.000001)
        oaiJazz.commit()

        try:
            oaiPmh = OaiPmh(repositoryName='Mock', adminEmail='*****@*****.**', supportXWait=True, batchSize=batchSize, preciseDatestamp=True)
        except TypeError:
            oaiPmh = OaiPmh(repositoryName='Mock', adminEmail='*****@*****.**', supportXWait=True, batchSize=batchSize)  # needed for backwards compatibility with meresco-oai versions preceding 5.16
        tree = be(
            (PathFilter(data['path'], excluding=['/ready']),
                (IllegalFromFix(),
                    (oaiPmh,
                        (oaiJazz,),
                        (oaiSuspendRegister,),
                        (storage,),
                    )
                )
            )
        )
        root.addObserver(tree)

    return \
        (Observable(),
            (ObservableHttpServer(reactor, portNumber),
                (LogCollector(),
                    (ApacheLogWriter(stdout),),
                    (root,
                        (PathFilter("/ready"),
                            (StringServer('yes', ContentTypePlainText),)
                        )
                    )
                )
            )
        )