def __init__(self, persistence_object=None, avro_schema_object=None):
    """Private API. Unstable. Use it at your own risk.

    :param persistence_object: The implementation of a persistence_object;
        for example a dynamo table instance
    :param avro_schema_object: An Avro schema object that describes what's
        allowed in each RedshiftCluster
    :type avro_schema_object: Schema
    """
    # Wrap the raw persistence layer in a schema-validated Records collection.
    self._clusters = Records(
        persistence_object=persistence_object,
        avro_schema_object=avro_schema_object,
    )
def __init__(self, persistence_object=None, avro_schema_object=None):
    """Private API. Unstable. Use it at your own risk.

    :param persistence_object: The implementation of a persistence_object;
        for example a dynamo table instance
    :param avro_schema_object: An Avro schema object that describes what's
        allowed in each ScheduledJob
    :type avro_schema_object: Schema
    """
    # Schema-validated record store backed by the given persistence object.
    self._records = Records(
        persistence_object=persistence_object,
        avro_schema_object=avro_schema_object,
    )
    # Capture the invoking user so later mutations can be attributed.
    self.username = os.getenv('LOGNAME')
def __init__(self, persistence_object=None, avro_schema_object=None):
    """Private API. Unstable. Use it at your own risk.

    :param persistence_object: The implementation of a persistence_object;
        for example a dynamo table instance
    :param avro_schema_object: An Avro schema object that describes what's
        allowed in each ETLRecord
    :type avro_schema_object: Schema
    """
    # All reads/writes go through a Records collection that enforces the
    # Avro schema on every item.
    self._records = Records(
        persistence_object=persistence_object,
        avro_schema_object=avro_schema_object,
    )
def __init__(self, persistence_object=None, avro_schema_object=None):
    """Private API. Unstable. Use it at your own risk.

    :param persistence_object: The implementation of a persistence_object;
        for example a dynamo table instance
    :param avro_schema_object: An Avro schema object that describes what's
        allowed in each ScheduledJob
    :type avro_schema_object: Schema
    """
    # Delegate storage to a schema-checked Records collection.
    self._records = Records(
        persistence_object=persistence_object,
        avro_schema_object=avro_schema_object,
    )
    # Username of the caller, read from the environment; used to attribute
    # destructive operations.
    self.username = os.getenv('LOGNAME')
class ScheduledJobs(object):
    """Collection of ScheduledJob records backed by a persistence object.

    Provides CRUD-style access plus index-backed queries on et_status and
    on (log_name, log_schema_version).
    """

    INDEX_ET_STATUS = 'ETStatusIndex'
    INDEX_LOAD_STATUS = 'LoadStatusIndex'
    INDEX_LOG_NAME_AND_LOG_SCHEMA_VERSION = 'LogNameLogSchemaVersionIndex'

    def __init__(self, persistence_object=None, avro_schema_object=None):
        """Private API. Unstable. Use it at your own risk.

        :param persistence_object: The implementation of a
            persistence_object; for example a dynamo table instance
        :param avro_schema_object: An Avro schema object that describes
            what's allowed in each ScheduledJob
        :type avro_schema_object: Schema
        """
        self._records = Records(
            persistence_object=persistence_object,
            avro_schema_object=avro_schema_object,
        )
        # Caller's login name, used to attribute deletes.
        self.username = os.getenv('LOGNAME')

    def get(self, **kwargs):
        """Return the ScheduledJob matching the given keys.

        :param kwargs: all kwargs are used together to look up the job.
            At least one valid kwarg is required; an unknown kwarg, or no
            kwargs at all, results in ValueError
        :returns: ScheduledJob that matches the given keys
        :rtype: :class:`.ScheduledJob`
        :raises KeyError: if the requested record is not found
        :raises PrimaryKeyError: if the requested record does not have a
            conforming primary key
        """
        return ScheduledJob(self._records.get(**kwargs))

    def put(self, **kwargs):
        """Persist a new ScheduledJob built from the given key/value pairs.

        :param kwargs: each kwarg becomes a key/value pair in the job.
            An unknown kwarg, or a duplicate of an existing item, results
            in ValueError
        :returns: True if the job was successfully persisted
        :rtype: boolean
        :raises ValueError: if an unknown kwarg is given or a duplicate
            record already exists
        :raises PrimaryKeyError: if the given record does not have a
            conforming primary key
        """
        return self._records.put(**kwargs)

    def delete(self, **kwargs):
        """Look up a job by the given keys and delete it.

        :param kwargs: all kwargs are used together to fetch the job
            before deleting it; at least one valid kwarg is required
        :returns: True if the ScheduledJob was successfully deleted
        :rtype: boolean
        :raises KeyError: if the requested record is not found
        :raises PrimaryKeyError: if the requested record does not have a
            conforming primary key
        """
        target = self.get(**kwargs)
        return target.delete(self.username, 'user request')

    def get_jobs_with_et_status(self, et_status_value):
        """Return an iterable of ScheduledJob matching the given et_status.

        :param et_status_value: value of et_status
        :type et_status_value: string
        :returns: a generator of ScheduledJob matching the given et_status
        """
        matches = self._records.query_by_index(
            index=self.INDEX_ET_STATUS, et_status=et_status_value)
        return (ScheduledJob(record=match) for match in matches)

    def get_jobs_with_log_name(self, log_name, log_schema_version=None):
        """Return an iterable of ScheduledJob for a log_name (and version).

        :param log_name: value of log_name
        :type log_name: string
        :param log_schema_version: optional value of log_schema_version
        :type log_schema_version: string or None
        :returns: a generator of ScheduledJob matching the given values
        """
        matches = self._records.query_by_index(
            index=self.INDEX_LOG_NAME_AND_LOG_SCHEMA_VERSION,
            log_name=log_name,
            log_schema_version=log_schema_version,
        )
        return (ScheduledJob(record=match) for match in matches)

    def __iter__(self):
        # Lazily wrap every underlying record in a ScheduledJob.
        return (ScheduledJob(record=item) for item in self._records)
class RedshiftClusters(object):
    """Collection of RedshiftCluster records backed by a persistence object."""

    def __init__(self, persistence_object=None, avro_schema_object=None):
        """Private API. Unstable. Use it at your own risk.

        :param persistence_object: The implementation of a
            persistence_object; for example a dynamo table instance
        :param avro_schema_object: An Avro schema object that describes
            what's allowed in each RedshiftCluster
        :type avro_schema_object: Schema
        """
        self._clusters = Records(
            persistence_object=persistence_object,
            avro_schema_object=avro_schema_object,
        )

    def get(self, **kwargs):
        """Return the RedshiftCluster matching the given keys.

        :param kwargs: all kwargs are used together to look up the
            cluster. At least one valid kwarg is required; an unknown
            kwarg, or no kwargs at all, results in ValueError
        :returns: RedshiftCluster that matches the given keys
        :rtype: :class:`.RedshiftCluster`
        :raises KeyError: if the requested cluster is not found
        :raises PrimaryKeyError: if the requested cluster does not have a
            conforming primary key
        """
        return RedshiftCluster(self._clusters.get(**kwargs))

    def put(self, **kwargs):
        """Persist a new RedshiftCluster built from the given pairs.

        :param kwargs: each kwarg becomes a key/value pair in the cluster.
            An unknown kwarg, or a duplicate of an existing item, results
            in ValueError
        :returns: True if the cluster was successfully persisted
        :rtype: boolean
        :raises ValueError: if an unknown kwarg is given or a duplicate
            cluster already exists
        :raises PrimaryKeyError: if the given cluster does not have a
            conforming primary key
        """
        return self._clusters.put(**kwargs)

    def __iter__(self):
        # Lazily wrap each stored item in a RedshiftCluster.
        return (RedshiftCluster(cluster=item) for item in self._clusters)
class RedshiftClusters(object):
    """Schema-validated store of RedshiftCluster entries."""

    def __init__(self, persistence_object=None, avro_schema_object=None):
        """Private API. Unstable. Use it at your own risk.

        :param persistence_object: The implementation of a
            persistence_object; for example a dynamo table instance
        :param avro_schema_object: An Avro schema object that describes
            what's allowed in each RedshiftCluster
        :type avro_schema_object: Schema
        """
        # All access is funnelled through a schema-checked Records store.
        self._clusters = Records(
            persistence_object=persistence_object,
            avro_schema_object=avro_schema_object,
        )

    def get(self, **kwargs):
        """Fetch a single RedshiftCluster by its keys.

        :param kwargs: used together to identify the cluster; at least one
            valid kwarg is required, and an unknown kwarg (or none) raises
            ValueError
        :returns: the matching RedshiftCluster
        :rtype: :class:`.RedshiftCluster`
        :raises KeyError: if the requested cluster is not found
        :raises PrimaryKeyError: if the requested cluster does not have a
            conforming primary key
        """
        raw = self._clusters.get(**kwargs)
        return RedshiftCluster(raw)

    def put(self, **kwargs):
        """Store a new RedshiftCluster.

        :param kwargs: each kwarg becomes a key/value pair in the cluster;
            unknown kwargs and duplicates raise ValueError
        :returns: True on successful persistence
        :rtype: boolean
        :raises ValueError: for an unknown kwarg or a duplicate cluster
        :raises PrimaryKeyError: if the given cluster does not have a
            conforming primary key
        """
        return self._clusters.put(**kwargs)

    def __iter__(self):
        # Generator-style iteration: wrap each raw entry on demand.
        for entry in self._clusters:
            yield RedshiftCluster(cluster=entry)
class ScheduledJobs(object):
    """Schema-validated store of ScheduledJob entries with index queries."""

    INDEX_ET_STATUS = 'ETStatusIndex'
    INDEX_LOAD_STATUS = 'LoadStatusIndex'
    INDEX_LOG_NAME_AND_LOG_SCHEMA_VERSION = 'LogNameLogSchemaVersionIndex'

    def __init__(self, persistence_object=None, avro_schema_object=None):
        """Private API. Unstable. Use it at your own risk.

        :param persistence_object: The implementation of a
            persistence_object; for example a dynamo table instance
        :param avro_schema_object: An Avro schema object that describes
            what's allowed in each ScheduledJob
        :type avro_schema_object: Schema
        """
        self._records = Records(
            persistence_object=persistence_object,
            avro_schema_object=avro_schema_object,
        )
        # Login name of the current user, for attributing deletions.
        self.username = os.getenv('LOGNAME')

    def get(self, **kwargs):
        """Fetch a single ScheduledJob by its keys.

        :param kwargs: used together to identify the job; at least one
            valid kwarg is required, and an unknown kwarg (or none) raises
            ValueError
        :returns: the matching ScheduledJob
        :rtype: :class:`.ScheduledJob`
        :raises KeyError: if the requested record is not found
        :raises PrimaryKeyError: if the requested record does not have a
            conforming primary key
        """
        raw = self._records.get(**kwargs)
        return ScheduledJob(raw)

    def put(self, **kwargs):
        """Store a new ScheduledJob.

        :param kwargs: each kwarg becomes a key/value pair in the job;
            unknown kwargs and duplicates raise ValueError
        :returns: True on successful persistence
        :rtype: boolean
        :raises ValueError: for an unknown kwarg or a duplicate record
        :raises PrimaryKeyError: if the given record does not have a
            conforming primary key
        """
        return self._records.put(**kwargs)

    def delete(self, **kwargs):
        """Fetch the job identified by kwargs and delete it.

        :param kwargs: used together to identify the job; at least one
            valid kwarg is required
        :returns: True if the ScheduledJob was successfully deleted
        :rtype: boolean
        :raises KeyError: if the requested record is not found
        :raises PrimaryKeyError: if the requested record does not have a
            conforming primary key
        """
        # Resolve the job first so the delete carries user attribution.
        return self.get(**kwargs).delete(self.username, 'user request')

    def get_jobs_with_et_status(self, et_status_value):
        """Yield every ScheduledJob whose et_status matches.

        :param et_status_value: value of et_status
        :type et_status_value: string
        :returns: a generator of matching ScheduledJob instances
        """
        hits = self._records.query_by_index(
            index=self.INDEX_ET_STATUS, et_status=et_status_value)
        for hit in hits:
            yield ScheduledJob(record=hit)

    def get_jobs_with_log_name(self, log_name, log_schema_version=None):
        """Yield every ScheduledJob for a log_name (and optional version).

        :param log_name: value of log_name
        :type log_name: string
        :param log_schema_version: optional value of log_schema_version
        :type log_schema_version: string or None
        :returns: a generator of matching ScheduledJob instances
        """
        hits = self._records.query_by_index(
            index=self.INDEX_LOG_NAME_AND_LOG_SCHEMA_VERSION,
            log_name=log_name,
            log_schema_version=log_schema_version,
        )
        for hit in hits:
            yield ScheduledJob(record=hit)

    def __iter__(self):
        # Generator-style iteration over all stored jobs.
        for entry in self._records:
            yield ScheduledJob(record=entry)
class ETLRecords(object):
    """Collection of ETLRecord run entries backed by a persistence object."""

    INDEX_JOB_ID_AND_DATA_DATE = 'ETLRecordByJobIdAndDataDate'

    def __init__(self, persistence_object=None, avro_schema_object=None):
        """Private API. Unstable. Use it at your own risk.

        :param persistence_object: The implementation of a
            persistence_object; for example a dynamo table instance
        :param avro_schema_object: An Avro schema object that describes
            what's allowed in each ETLRecord
        :type avro_schema_object: Schema
        """
        self._records = Records(
            persistence_object=persistence_object,
            avro_schema_object=avro_schema_object,
        )

    def get(self, **kwargs):
        """Return the ETLRecord matching the given keys.

        :param kwargs: all kwargs are used together to look up the record.
            At least one valid kwarg is required; an unknown kwarg, or no
            kwargs at all, results in ValueError
        :returns: ETLRecord that matches the given keys
        :rtype: :class:`.ETLRecord`
        :raises KeyError: if the requested record is not found
        :raises PrimaryKeyError: if the requested record does not have a
            conforming primary key
        """
        return ETLRecord(self._records.get(**kwargs))

    def put(self, **kwargs):
        """Persist a new ETLRecord built from the given key/value pairs.

        :param kwargs: each kwarg becomes a key/value pair in the record.
            An unknown kwarg, or a duplicate of an existing item, results
            in ValueError
        :returns: True if the record was successfully persisted
        :rtype: boolean
        :raises ValueError: if an unknown kwarg is given or a duplicate
            record already exists
        :raises PrimaryKeyError: if the given record does not have a
            conforming primary key
        """
        return self._records.put(**kwargs)

    def get_runs_with_job_id(self, job_id, data_date=None):
        """Return an iterable of ETLRecord for the given job_id.

        :param job_id: id of the job
        :type job_id: string
        :param data_date: optional data_date to narrow the query
        :returns: a generator of ETLRecord matching the given job_id
        """
        matches = self._records.query_by_index(
            index=self.INDEX_JOB_ID_AND_DATA_DATE,
            job_id=job_id,
            data_date=data_date,
        )
        return (ETLRecord(record=match) for match in matches)

    def delete_job_runs(self, job_id):
        """Attempt to delete all runs for a job id.

        :param job_id: id of the job
        :type job_id: string
        :returns: True if all runs were successfully deleted
        :rtype: boolean
        """
        doomed = self.get_runs_with_job_id(job_id)
        self._records.batch_delete(doomed, hash_key=job_id, data_date=None)
        # Re-query to verify that nothing survived the batch delete.
        leftovers = self.get_runs_with_job_id(job_id)
        return sum(1 for _ in leftovers) == 0

    def __iter__(self):
        # Lazily wrap every stored item in an ETLRecord.
        return (ETLRecord(record=item) for item in self._records)
class ETLRecords(object):
    """Schema-validated store of ETLRecord run entries."""

    INDEX_JOB_ID_AND_DATA_DATE = "ETLRecordByJobIdAndDataDate"

    def __init__(self, persistence_object=None, avro_schema_object=None):
        """Private API. Unstable. Use it at your own risk.

        :param persistence_object: The implementation of a
            persistence_object; for example a dynamo table instance
        :param avro_schema_object: An Avro schema object that describes
            what's allowed in each ETLRecord
        :type avro_schema_object: Schema
        """
        # Every read/write is routed through a schema-checked Records store.
        self._records = Records(
            persistence_object=persistence_object,
            avro_schema_object=avro_schema_object,
        )

    def get(self, **kwargs):
        """Fetch a single ETLRecord by its keys.

        :param kwargs: used together to identify the record; at least one
            valid kwarg is required, and an unknown kwarg (or none) raises
            ValueError
        :returns: the matching ETLRecord
        :rtype: :class:`.ETLRecord`
        :raises KeyError: if the requested record is not found
        :raises PrimaryKeyError: if the requested record does not have a
            conforming primary key
        """
        raw = self._records.get(**kwargs)
        return ETLRecord(raw)

    def put(self, **kwargs):
        """Store a new ETLRecord.

        :param kwargs: each kwarg becomes a key/value pair in the record;
            unknown kwargs and duplicates raise ValueError
        :returns: True on successful persistence
        :rtype: boolean
        :raises ValueError: for an unknown kwarg or a duplicate record
        :raises PrimaryKeyError: if the given record does not have a
            conforming primary key
        """
        return self._records.put(**kwargs)

    def get_runs_with_job_id(self, job_id, data_date=None):
        """Yield every ETLRecord run for the given job_id.

        :param job_id: id of the job
        :type job_id: string
        :param data_date: optional data_date to narrow the query
        :returns: a generator of matching ETLRecord instances
        """
        hits = self._records.query_by_index(
            index=self.INDEX_JOB_ID_AND_DATA_DATE,
            job_id=job_id,
            data_date=data_date,
        )
        for hit in hits:
            yield ETLRecord(record=hit)

    def delete_job_runs(self, job_id):
        """Attempt to delete all runs for a job id.

        :param job_id: id of the job
        :type job_id: string
        :returns: True if all runs were successfully deleted
        :rtype: boolean
        """
        self._records.batch_delete(
            self.get_runs_with_job_id(job_id),
            hash_key=job_id,
            data_date=None,
        )
        # Success means a fresh query comes back empty.
        remaining = self.get_runs_with_job_id(job_id)
        return sum(1 for _ in remaining) == 0

    def __iter__(self):
        # Generator-style iteration over all stored records.
        for entry in self._records:
            yield ETLRecord(record=entry)