import logging

# SwiftBackend, Extractor and the other helpers referenced below are assumed to
# be importable from the surrounding project.
logging.basicConfig(
    format='%(asctime)s - %(module)s - %(levelname)s ##\t  %(message)s')

containerName = 'Hillary\'s emails'
#swift_url = "http://192.168.209.204:8080/auth/v1.0"

#swift_user = "******"
#swift_pw = "testing"
swift_auth_url = "http://192.168.209.204:8080/auth/v1.0"
swift_store_url = "http://192.168.209.204:8080/v1/AUTH_{}"
swift_user = "******"
swift_pw = "testing"

swift_url = swift_auth_url
log = logging.getLogger()
sb = SwiftBackend(swift_url=swift_url,
                  swift_user=swift_user,
                  swift_pw=swift_pw)
conn = sb._getConnection()

if __name__ == '__main__':
    log.error('starting metadata extractor tester')
    ex = Extractor(containerName=containerName,
                   swift_url=swift_url,
                   swift_user=swift_user,
                   swift_pw=swift_pw)
    #ex.runIdentifierForWholeContainer()
    #ex.runFilterForWholeContainer()
    #ex.replicateMetaData()
    #Replicator.replicateMetadata(conn=conn,containerName=self.containerName,objectName=objName,objectType=objType)
    #Replicator.replicateMetadata(conn=conn,containerName=containerName,objectName="DSC00712.JPG",objectType="image/jpeg")
    ex.runReplicateMetadataForWholeContainer()
class Extractor(object):
	'''
	Runs per-object jobs (content-type identification, metadata extraction,
	retention disposal) against every object in a Swift container using a
	pool of worker connections.
	'''

	def __init__(self, containerName, swift_url=None, swift_user=None, swift_pw=None, storage_url=None, token=None):
		'''
		Constructor
		'''
		self.log = logging.getLogger()
		self.containerName = containerName
		self.log.info('initializing...')
		if storage_url and token:
			self.sb = SwiftBackend(storage_url=storage_url, token=token)
		else:
			self.sb = SwiftBackend(swift_url=swift_url, swift_user=swift_user, swift_pw=swift_pw)
		self.numWorkers = 20

	def dummyLoad(self, conn, objType, objName):
		print(objName)
		thisObjBlob = self.sb.getObjBlob(conn, self.containerName, objName)

	def getDataAndIdentifyContentType(self, conn, objType, objName):
		thisObjBlob = self.sb.getObjBlob(conn, self.containerName, objName)
		ctype = ContentTypeIdentifier().identifyContentType(thisObjBlob)
		if objType == ctype:
			return "same same..."
		return self.sb.updateObjContentType(conn, containerName=self.containerName, objName=objName,
		                                    newContentType=ctype)

	def getDataAndRunFilter(self, conn, objType, objName):
		thisObjBlob = self.sb.getObjBlob(conn, self.containerName, objName)
		try:
			thisFilter = ImportFilter.getFilterForObjType(objType)
		except Exception:
			raise NoFilterFoundException("{}-{}".format(objName, objType))
		r = thisFilter.extractMetaData(thisObjBlob)
		return self.sb.updateMetaDataFields(conn=conn, containerName=self.containerName, objName=objName, metaDict=r)

	def getMetadataAndRunDisposal(self, conn, objType, objName):
		return RetentionChecker.checkRetentionDate(conn=conn, containerName=self.containerName, objectName=objName)

	def runForWholeContainer(self, functionOnObject):
		with swiftclient.multithreading.ConnectionThreadPoolExecutor(self.sb._getConnection,
		                                                             max_workers=self.numWorkers) as executor:
			objs = self.sb.get_object_list(self.containerName)
			future_results = []

			# first go through all objs in the container and spawn a thread to run the filter
			self.log.error('committing {} jobs for {}'.format(len(objs), functionOnObject.__name__))
			for thisObj in objs:
				try:
					thisObjType = thisObj['content_type']
					thisObjName = thisObj['name']

					self.log.info('running {} for type: {} on obj: {}'.format(functionOnObject.__name__, thisObjType,
					                                                          thisObjName))
					future_results.append(executor.submit(functionOnObject, thisObjType, thisObjName))
				except Exception as exc:
					self.log.warning('could not create job for obj: {}. Exc: {}'.format(thisObj, exc))

			# try to get the individual results from the filters
			self.log.error('Starting {} worker threads...'.format(self.numWorkers))
			numFailedJobs = 0
			numNoFilter = 0
			numNoRetentionDate = 0
			numRetentionInFuture = 0
			numOkJobs = 0
			for future in concurrent.futures.as_completed(future_results):
				try:
					data = future.result()
				except NoFilterFoundException as exc:
					self.log.info('no filter found: {}'.format(exc))
					numNoFilter += 1
				except NoRetentionDateException as exc:
					self.log.info('no retention date on obj: {}'.format(exc))
					numNoRetentionDate += 1
				except RetentionDateInFutureException as exc:
					self.log.info('retention date in future on obj: {}'.format(exc))
					numRetentionInFuture += 1
				except Exception as exc:
					self.log.info('worker failed with exception: {}'.format(exc))
					numFailedJobs += 1
				else:
					numOkJobs += 1
					self.log.info('worker succeeded on obj: {}'.format(data))
			self.log.warning('Workers done!')

			if functionOnObject == self.getDataAndRunFilter:
				msg = self.__msg_for_extractor(numFailedJobs, numNoFilter, numOkJobs, objs)
			elif functionOnObject == self.getMetadataAndRunDisposal:
				msg = self.__msg_for_disposal(numFailedJobs, numNoRetentionDate, numRetentionInFuture, numOkJobs, objs)
			else:
				msg = self.__msg_for_generic(numFailedJobs, numOkJobs, objs)
			self.log.warning(msg)
			return msg

	def __msg_for_extractor(self, numFailedJobs, numNoFilter, numOkJobs, objs):
		total = numOkJobs + numFailedJobs + numNoFilter
		attempted = numOkJobs + numFailedJobs
		failRate = (100 / attempted) * numFailedJobs if attempted > 0 else 0
		msg = 'OK: {}, Failed: {}, No filter: {} -- Total: {}, Fail rate: {}%, Missing: {} '.format(
			numOkJobs, numFailedJobs, numNoFilter, total, failRate, len(objs) - total)
		return msg

	def __msg_for_disposal(self, numFailedJobs, numNoRetentionDate, numRetentionInFuture, numOkJobs, objs):
		total = numOkJobs + numRetentionInFuture + numNoRetentionDate + numFailedJobs
		failRate = (100 / total) * numFailedJobs if total > 0 else 0
		msg = 'Deleted: {}, No retention: {}, Retention in future: {}, Failed: {} -- Total: {}, Fail rate: {}%, Missing: {} '.format(
			numOkJobs, numNoRetentionDate, numRetentionInFuture, numFailedJobs, total, failRate, len(objs) - total)
		return msg

	def __msg_for_generic(self, numFailedJobs, numOkJobs, objs):
		total = numOkJobs + numFailedJobs
		failRate = (100 / total) * numFailedJobs if total > 0 else 0
		msg = 'OK: {}, Failed: {} -- Total: {}, Fail rate: {}%, Missing: {} '.format(
			numOkJobs, numFailedJobs, total, failRate, len(objs) - total)
		return msg

	def runFilterForWholeContainer(self):
		return self.runForWholeContainer(functionOnObject=self.getDataAndRunFilter)

	def runIdentifierForWholeContainer(self):
		return self.runForWholeContainer(functionOnObject=self.getDataAndIdentifyContentType)

	def runDisposalForWholeContainer(self):
		return self.runForWholeContainer(functionOnObject=self.getMetadataAndRunDisposal)

	def runDummyLoad(self):
		return self.runForWholeContainer(functionOnObject=self.dummyLoad)
Example #5
class Extractor(object):
    '''
    Runs per-object jobs (content-type identification, metadata extraction,
    retention disposal, metadata replication) against every object in a Swift
    container using a pool of worker connections.
    '''
    def __init__(self,
                 containerName,
                 swift_url=None,
                 swift_user=None,
                 swift_pw=None,
                 storage_url=None,
                 token=None):
        '''
		Constructor
		'''
        self.log = logging.getLogger()
        self.containerName = containerName
        self.log.info('initializing...')
        self.swift_user = swift_user
        if storage_url and token:
            self.sb = SwiftBackend(storage_url=storage_url, token=token)
        else:
            self.sb = SwiftBackend(swift_url=swift_url,
                                   swift_user=swift_user,
                                   swift_pw=swift_pw)
        self.numWorkers = 20

    def dummyLoad(self, conn, objType, objName):
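        # Load-test helper: downloads the object body and simply discards it.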
        print(objName)
        thisObjBlob = self.sb.getObjBlob(conn, self.containerName, objName)

    def getDataAndIdentifyContentType(self, conn, objType, objName):
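        # Downloads the object, sniffs its actual content type and, if it
        # differs from the type recorded in Swift, writes the detected type back.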
        thisObjBlob = self.sb.getObjBlob(conn, self.containerName, objName)
        ctype = ContentTypeIdentifier().identifyContentType(thisObjBlob)
        if objType == ctype:
            return "same same..."
        return self.sb.updateObjContentType(conn,
                                            containerName=self.containerName,
                                            objName=objName,
                                            newContentType=ctype)

    def getDataAndRunFilter(self, conn, objType, objName):
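        # Downloads the object, looks up the import filter registered for its
        # content type, extracts metadata and stores it as object metadata.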
        thisObjBlob = self.sb.getObjBlob(conn, self.containerName, objName)
        try:
            thisFilter = ImportFilter.getFilterForObjType(objType)
        except Exception:
            raise NoFilterFoundException("{}-{}".format(objName, objType))
        r = thisFilter.extractMetaData(thisObjBlob)
        return self.sb.updateMetaDataFields(conn=conn,
                                            containerName=self.containerName,
                                            objName=objName,
                                            metaDict=r)

    def getMetadataAndRunDisposal(self, conn, objType, objName):
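        # Delegates to RetentionChecker, which disposes of the object once its
        # retention date has passed.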
        return RetentionChecker.checkRetentionDate(
            conn=conn, containerName=self.containerName, objectName=objName)

    def getMetadataAndReplicate(self, conn, objType, objName):
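        # Borrows a PostgreSQL connection from the pool, replicates the object's
        # metadata into the warehouse and always returns the connection to the pool.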
        postgreConn = self.postgreConnPool.getconn()
        try:
            Replicator.replicateMetadata(conn=conn,
                                         containerName=self.containerName,
                                         objectName=objName,
                                         objectType=objType,
                                         postgreConn=postgreConn)
        finally:
            self.postgreConnPool.putconn(postgreConn)

    def runForWholeContainer(self, functionOnObject):
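        # Fan-out/fan-in: list every object in the container, submit one job per
        # object to a pool of numWorkers Swift connections, then tally the
        # results per outcome and return a one-line summary.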
        with swiftclient.multithreading.ConnectionThreadPoolExecutor(
                self.sb._getConnection,
                max_workers=self.numWorkers) as executor:
            objs = self.sb.get_object_list(self.containerName)
            future_results = []

            # first go through all objs in the container and spawn a thread to run the filter
            self.log.error('committing {} jobs for {}'.format(
                len(objs), functionOnObject.__name__))
            for thisObj in objs:
                try:
                    thisObjType = thisObj['content_type']
                    thisObjName = thisObj['name']

                    self.log.info('running {} for type: {} on obj: {}'.format(
                        functionOnObject.__name__, thisObjType, thisObjName))
                    future_results.append(
                        executor.submit(functionOnObject, thisObjType,
                                        thisObjName))
                except Exception as exc:
                    self.log.warning(
                        'could not create job for obj: {}. Exc: {}'.format(
                            thisObj, exc))

            # try to get the individual results from the filters
            self.log.error('Starting {} worker threads...'.format(
                self.numWorkers))
            numFailedJobs = 0
            numNoFilter = 0
            numNoRetentionDate = 0
            numRetentionInFuture = 0
            numOkJobs = 0
            for future in concurrent.futures.as_completed(future_results):
                try:
                    data = future.result()
                except NoFilterFoundException as exc:
                    self.log.info('no filter found: {}'.format(exc))
                    numNoFilter += 1
                except NoRetentionDateException as exc:
                    self.log.info('no retention date on obj: {}'.format(exc))
                    numNoRetentionDate += 1
                except RetentionDateInFutureException as exc:
                    self.log.info(
                        'retention date in future on obj: {}'.format(exc))
                    numRetentionInFuture += 1
                except Exception as exc:
                    self.log.exception('worker failed with exception')
                    numFailedJobs += 1
                else:
                    numOkJobs += 1
                    self.log.info('worker succeeded on obj: {}'.format(data))
            self.log.warning('Workers done!')

            if functionOnObject == self.getDataAndRunFilter:
                msg = self.__msg_for_extractor(numFailedJobs, numNoFilter,
                                               numOkJobs, objs)
            elif functionOnObject == self.getMetadataAndRunDisposal:
                msg = self.__msg_for_disposal(numFailedJobs,
                                              numNoRetentionDate,
                                              numRetentionInFuture, numOkJobs,
                                              objs)
            else:
                msg = self.__msg_for_generic(numFailedJobs, numOkJobs, objs)
            self.log.warning(msg)
            return msg

    def __msg_for_extractor(self, numFailedJobs, numNoFilter, numOkJobs, objs):
        total = numOkJobs + numFailedJobs + numNoFilter
        msg = 'OK: {}, Failed: {}, No filter: {} -- Total: {}, Fail rate: {}%, Missing: {} '.format(
            numOkJobs, numFailedJobs, numNoFilter, total,
            ((100 / (numOkJobs + numFailedJobs)) * numFailedJobs) if
            ((numOkJobs + numFailedJobs) > 0) else 0,
            len(objs) - total)
        return msg

    def __msg_for_disposal(self, numFailedJobs, numNoRetentionDate,
                           numRetentionInFuture, numOkJobs, objs):
        total = numOkJobs + numRetentionInFuture + numNoRetentionDate + numFailedJobs
        msg = 'Deleted: {}, No retention: {}, Retention in future: {}, Failed: {} -- Total: {}, Fail rate: {}%, Missing: {} '.format(
            numOkJobs, numNoRetentionDate, numRetentionInFuture, numFailedJobs,
            total, ((100 / total) * numFailedJobs) if (total > 0) else 0,
            len(objs) - total)
        return msg

    def __msg_for_generic(self, numFailedJobs, numOkJobs, objs):
        total = numOkJobs + numFailedJobs
        msg = 'OK: {}, Failed: {} -- Total: {}, Fail rate: {}%, Missing: {} '.format(
            numOkJobs, numFailedJobs, total, ((100 / total) * numFailedJobs) if
            (total > 0) else 0,
            len(objs) - total)
        return msg

    def runFilterForWholeContainer(self):
        return self.runForWholeContainer(
            functionOnObject=self.getDataAndRunFilter)

    def runIdentifierForWholeContainer(self):
        return self.runForWholeContainer(
            functionOnObject=self.getDataAndIdentifyContentType)

    def runDisposalForWholeContainer(self):
        return self.runForWholeContainer(
            functionOnObject=self.getMetadataAndRunDisposal)

    def runReplicateMetadataForWholeContainer(self):
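        # Sets up the PostgreSQL connection pool and the warehouse tables before
        # fanning the replication job out over the whole container.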

        # create connection pool
        self.postgreConnPool = psycopg2.pool.ThreadedConnectionPool(
            5, self.numWorkers, None,
            **configuration.metadata_warehouse_endpoint)
        # create tables if not exist
        postgreConn = self.postgreConnPool.getconn()
        try:
            Replicator.createTablesIfAbsent(postgreConn)
        finally:
            self.postgreConnPool.putconn(postgreConn)

        return self.runForWholeContainer(
            functionOnObject=self.getMetadataAndReplicate)

    def runDummyLoad(self):
        return self.runForWholeContainer(functionOnObject=self.dummyLoad)
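
# Minimal usage sketch for the pre-authenticated constructor path (storage_url +
# token). The container name, storage URL account suffix and token below are
# hypothetical placeholders, not values taken from the examples above.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    extractor = Extractor(containerName='sample-container',                        # hypothetical
                          storage_url='http://192.168.209.204:8080/v1/AUTH_demo',  # hypothetical account
                          token='AUTH_tk_example')                                 # hypothetical token
    print(extractor.runIdentifierForWholeContainer())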