Example #1
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        constants.set(settings.constants)

        with startup.SingleInstance(flavor_id=settings.args.filename):
            with aws.s3.Bucket(settings.destination) as bucket:

                if settings.param.debug and settings.source.durable:
                    Log.error("Cannot run in debug mode with a durable queue")  # Log.error RAISES, ABORTING STARTUP

                synch = SynchState(bucket.get_key(SYNCHRONIZATION_KEY, must_exist=False))
                if not settings.param.debug and settings.source.durable:
                    synch.startup()

                queue = PersistentQueue(settings.param.queue_file)
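                # RESUME THE PULSE STREAM JUST PAST THE LAST MESSAGE ALREADY ON THE LOCAL QUEUE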
                if queue:
                    last_item = queue[len(queue) - 1]
                    synch.source_key = last_item._meta.count + 1

                with pulse.Consumer(settings=settings.source, target=None, target_queue=queue, start=synch.source_key):
                    Thread.run("pulse log loop", log_loop, settings, synch, queue, bucket)
                    Thread.wait_for_shutdown_signal(allow_exit=True)
                    Log.warning("starting shutdown")

                queue.close()
                Log.note("write shutdown state to S3")
                synch.shutdown()

    except Exception as e:
        Log.error("Problem with etl", cause=e)
Example #2
class Storage(object):
    @use_settings
    def __init__(
        self,
        bucket,  # NAME OF THE BUCKET
        aws_access_key_id=None,  # CREDENTIAL
        aws_secret_access_key=None,  # CREDENTIAL
        region=None,  # NAME OF AWS REGION, REQUIRED FOR SOME BUCKETS
        public=False,
        debug=False,
        settings=None
    ):
        self.uid = None
        self.bucket = s3.Bucket(settings=settings)
        Log.alert("Using {{bucket}} for S3 storage", bucket=self.bucket.name)
        self.temp_queue = PersistentQueue(bucket + "_queue.txt")
        self._figure_out_start_point()
        self.push_to_s3 = Thread.run("pushing to " + bucket, self._worker)

    def _figure_out_start_point(self):
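        # WORK OUT THE NEXT uid: PREFER LEFTOVER RECORDS IN THE LOCAL QUEUE, ELSE SCAN TODAY'S KEYS IN S3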
        # RECOVER FROM THE QUEUE
        acc = []
        while True:
            d = self.temp_queue.pop(timeout=ZERO)
            if d:
                acc.append(d)
            else:
                break
        self.temp_queue.rollback()

        if acc:
            # WAS IN THE MIDDLE OF A BATCH, FIND count
            data = acc[-1]
            today_ = data[UID_PATH].split(".")[0]
            todays_batch_count = int(data[UID_PATH].split(".")[1])
            count = todays_batch_count * BATCH_SIZE + data.etl.id + 1
            if DEBUG:
                Log.note(
                    "Next uid from queue is {{uid}}.{{count}}",
                    count=count % BATCH_SIZE,
                    uid=today_ + "." + unicode(todays_batch_count)
                )
            self.uid = UID(count)
            return

        # FIND LAST WHOLE BATCH FROM TODAY
        today_ = unicode(today())
        todays_keys = self.bucket.keys(prefix=today_)
        if not todays_keys:
            if DEBUG:
                Log.note("Next uid is {{uid}}.{{count}}", count=0, uid=today_+".0")
            self.uid = UID()
            return

        todays_batch_count = jx.sort(int(k.split(".")[1]) for k in todays_keys).last() + 1
        max_key = today_ + "." + unicode(todays_batch_count)

        if DEBUG:
            Log.note("Next uid is {{uid}}", uid=max_key)
        count = todays_batch_count * BATCH_SIZE
        self.uid = UID(count)

    def add(self, data):
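        # ASSIGN THE NEXT uid, QUEUE THE RECORD FOR UPLOAD, AND RETURN THE LINK IT WILL GET IN S3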
        data = wrap(data)
        uid, count = self.uid.advance()
        link = expand_template(
            LINK_PATTERN,
            {
                "region": self.bucket.settings.region,
                "bucket": self.bucket.settings.bucket,
                "uid": uid
            }
        )
        data.etl.id = count
        data.etl.source.href = link
        data[UID_PATH] = uid
        self.temp_queue.add(data)
        return link, count

    def _worker(self, please_stop):
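        # BACKGROUND WORKER: DRAIN temp_queue AND WRITE BATCHES OF LINES TO THE S3 BUCKET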
        curr = "0.0"
        acc = []
        last_count_written = -1
        next_write = Date.now()

        while not please_stop:
            d = self.temp_queue.pop(timeout=MINUTE)
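            # None MEANS pop() TIMED OUT WITHOUT NEW DATA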
            if d == None:
                if not acc:
                    continue
                # WRITE THE INCOMPLETE DATA TO S3, BUT NOT TOO OFTEN
                next_write = Date.now() + MINUTE
                try:
                    if last_count_written != len(acc):
                        if DEBUG:
                            Log.note("write incomplete data ({{num}} lines) to {{uid}} in S3 next (time = {{next_write}})", uid=curr, next_write=next_write, num=len(acc))
                        self.bucket.write_lines(curr, (convert.value2json(a) for a in acc))
                        last_count_written = len(acc)
                except Exception as e:
                    Log.note("Problem with write to S3", cause=e)
            elif d[UID_PATH] != curr:
                # WRITE acc TO S3 IF WE ARE MOVING TO A NEW KEY
                try:
                    if acc:
                        if DEBUG:
                            Log.note("write complete data ({{num}} lines) to {{curr}} in S3", num=len(acc), curr=curr)
                        self.bucket.write_lines(curr, (convert.value2json(a) for a in acc))
                        last_count_written = 0
                    curr = d[UID_PATH]
                    acc = [d]
                except Exception as e:
                    Log.warning("Can not store data", cause=e)
                    Thread.sleep(30 * MINUTE)
            else:
                # SAME uid AS THE CURRENT BATCH: KEEP ACCUMULATING
                acc.append(d)
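
For context, a hedged usage sketch of this class; the bucket name, credentials, and record below are placeholders, not values from the source:

storage = Storage(
    bucket="example-etl-bucket",            # PLACEHOLDER BUCKET NAME
    aws_access_key_id="AKIAEXAMPLE",        # PLACEHOLDER CREDENTIAL
    aws_secret_access_key="examplesecret",  # PLACEHOLDER CREDENTIAL
    region="us-west-2"                      # PLACEHOLDER REGION
)
link, count = storage.add({"name": "example record"})
Log.note("record {{count}} will appear at {{link}}", count=count, link=link)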