Example #1
def queue_consumer(pull_queue, please_stop=None):
    # REPLAY QUEUED REQUESTS, PRESERVING THE PACING RECORDED IN request.meta.request_time
    queue = aws.Queue(pull_queue)
    time_offset = None
    request_count = 0

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue

        if SKIP_TRY_REQUESTS and 'try' in request.where['and'].eq.branch:
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}",
                     wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()
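
The consumer above also illustrates the at-least-once contract all of these examples rely on: queue.pop() only hides a message, and queue.commit() acknowledges it, so anything popped but never committed returns to the queue. A minimal sketch of that contract, using only the aws.Queue, Till, and Signal calls that appear in these examples (handle_message is a hypothetical placeholder):

def minimal_consumer(pull_queue, please_stop):
    queue = aws.Queue(pull_queue)
    while not please_stop:
        message = queue.pop(till=please_stop)       # BLOCK UNTIL A MESSAGE ARRIVES OR THE SIGNAL TRIPS
        if not message:
            (please_stop | Till(seconds=5)).wait()  # INTERRUPTIBLE PAUSE BEFORE POLLING AGAIN
            continue
        handle_message(message)                     # HYPOTHETICAL APPLICATION LOGIC
        queue.commit()                              # ACKNOWLEDGE; UNCOMMITTED MESSAGES RETURN TO THE QUEUE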
Example #2
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id (prefix, really) to process",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        queries.config.default = {
            "type": "elasticsearch",
            "settings": settings.elasticsearch.copy()
        }

        if settings.args.id:
            work_queue = Queue("local work queue")
            work_queue.extend(parse_id_argument(settings.args.id))
        else:
            work_queue = aws.Queue(settings=settings.work_queue)

        Log.note("Listen to queue {{queue}}, and read off of {{s3}}",
                 queue=settings.work_queue.name,
                 s3=settings.source.bucket)

        es = MultiDayIndex(settings.elasticsearch, queue_size=100000)

        threads = []
        please_stop = Signal()
        for _ in range(settings.threads):
            p = Thread.run("copy to es",
                           copy2es,
                           es,
                           settings,
                           work_queue,
                           please_stop=please_stop)
            threads.append(p)

        def monitor_progress(please_stop):
            while not please_stop:
                Log.note("Remaining: {{num}}", num=len(work_queue))
                Thread.sleep(seconds=10)

        Thread.run(name="monitor progress",
                   target=monitor_progress,
                   please_stop=please_stop)

        aws.capture_termination_signal(please_stop)
        Thread.wait_for_shutdown_signal(please_stop=please_stop,
                                        allow_exit=True)
        please_stop.go()
        Log.note("Shutdown started")
    except Exception as e:
        Log.error("Problem with etl", e)
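
Example #2 also shows the cooperative-shutdown idiom used throughout: one Signal is shared with every worker, aws.capture_termination_signal trips it when the instance is terminated, and please_stop.go() trips it explicitly; every loop testing the signal then exits on its next check. A stripped-down sketch, assuming the same Signal and Thread interfaces (do_one_unit_of_work is hypothetical):

def worker(please_stop):
    while not please_stop:             # A Signal TESTS FALSE UNTIL go() IS CALLED
        do_one_unit_of_work()          # HYPOTHETICAL WORK FUNCTION

please_stop = Signal()
Thread.run("worker", worker, please_stop=please_stop)
please_stop.go()                       # EVERY LOOP TESTING THE SIGNAL STOPS ON ITS NEXT CHECK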
Example #3
    def required_utility(self):
        # NUMBER OF MACHINES REQUIRED: ONE PER 30 PENDING MESSAGES, WITH A TIME-OF-DAY FLOOR
        queue = aws.Queue(self.settings.work_queue)
        pending = len(queue)

        tod_minimum = 0  # ZERO, NOT None, SO THE max() BELOW IS WELL-DEFINED
        if Date.now().hour not in [4, 5, 6, 7, 8, 9, 10, 11]:
            tod_minimum = 100

        return max(self.settings.minimum_utility, tod_minimum,
                   Math.ceiling(pending / 30))
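
To make the sizing rule concrete: at 14:00 (outside the 04:00-11:00 window) with 250 messages pending and, say, minimum_utility of 8, this returns max(8, 100, Math.ceiling(250 / 30)) = max(8, 100, 9) = 100, so the time-of-day floor dominates; during the quiet morning window the same backlog would require only 9 units.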
Example #4
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)

        source = get_container(settings.source)
        destination = get_container(settings.destination)

        work_queue = aws.Queue(settings.work_queue)
        backfill(source, destination, work_queue, settings)
    except Exception as e:
        Log.error("Problem with backfill", e)
Example #5
class ETL(Thread):
    @use_settings
    def __init__(self,
                 name,
                 work_queue,
                 workers,
                 resources,
                 please_stop,
                 wait_forever=False,
                 settings=None):
        # FIND THE WORKERS METHODS
        settings.workers = []
        for w in workers:
            w = deepcopy(w)

            for existing_worker in settings.workers:
                try:
                    fuzzytestcase.assertAlmostEqual(existing_worker.source,
                                                    w.source)
                    fuzzytestcase.assertAlmostEqual(
                        existing_worker.transformer, w.transformer)
                    # SAME SOURCE AND TRANSFORMER, SO MERGE THE DESTINATIONS
                except Exception:
                    # NOT A MATCH; TRY THE NEXT EXISTING WORKER
                    continue
                destination = get_container(w.destination)
                existing_worker._destination = Split(
                    existing_worker._destination, destination)
                break
            else:
                t_name = w.transformer
                w._transformer = dot.get_attr(sys.modules, t_name)
                if not w._transformer:
                    Log.error(
                        "Can not find {{path}} to transformer (are you sure you are pointing to a function?)",
                        path=t_name)
                w._source = get_container(w.source)
                w._destination = get_container(w.destination)
                settings.workers.append(w)

            w._notify = []
            for notify in listwrap(w.notify):
                w._notify.append(aws.Queue(notify))

        self.resources = resources
        self.settings = settings
        if isinstance(work_queue, Mapping):
            self.work_queue = aws.Queue(work_queue)
        else:
            self.work_queue = work_queue

        Thread.__init__(self, name, self.loop, please_stop=please_stop)
        self.start()
Example #6
    def __init__(self, endpoint, push_queue=None, timeout=30, db=None, kwargs=None):
        self.enabled = True
        self.num_bad_requests = 0
        self.endpoint = endpoint
        self.timeout = timeout
        self.push_queue = aws.Queue(push_queue) if push_queue else None
        self.config = kwargs

        self.db = Sqlite(filename=coalesce(db.filename, "tuid_client.sqlite"), kwargs=db)

        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table';").data:
            with self.db.transaction() as transaction:
                self._setup(transaction)
Example #7
def diff(settings, please_stop=None):
    # EVERYTHING FROM ELASTICSEARCH
    es = MultiDayIndex(settings.elasticsearch, queue_size=100000)

    in_es = get_all_in_es(es)
    in_s3 = get_all_s3(in_es, settings)

    # IGNORE THE 500 MOST RECENT BLOCKS, BECAUSE THEY ARE PROBABLY NOT DONE
    in_s3 = in_s3[500:500 + settings.limit]

    Log.note(
        "Queueing {{num}} keys (from {{min}} to {{max}}) for insertion to ES",
        num=len(in_s3),
        min=Math.MIN(in_s3),
        max=Math.MAX(in_s3))
    work_queue = aws.Queue(settings=settings.work_queue)
    work_queue.extend(in_s3)
Example #8
def queue_consumer(client, pull_queue, please_stop=None, kwargs=None):
    # CONSUME TUID REQUESTS, CHECKING EACH REVISION AGAINST hg.mozilla.org BEFORE POSTING
    queue = aws.Queue(pull_queue)
    client = TuidClient(client)
    try_revs = {}
    test_try_revs = True

    #while len(queue) > 0:
    #    request = queue.pop(till=please_stop)
    #    if request:
    #        Log.note("Popping request from {{time}}", time=request.meta.request_time)
    #        queue.commit()

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue
        Log.note("Found something in queue")
        repo = 'mozilla-central'

        and_op = request.where['and']

        revision = None
        files = None
        for a in and_op:
            if a.eq.revision:
                revision = a.eq.revision
            elif a['in'].path:
                files = a['in'].path
            elif a.eq.path:
                files = [a.eq.path]

        if not files:
            Log.warning("No files in the given request: {{request}}",
                        request=request)
            continue

        if revision[:12] in try_revs and not test_try_revs:
            Log.warning(
                "Revision {{cset}} does not exist in the {{branch}} branch",
                cset=revision[:12],
                branch='mozilla-central')
            queue.commit()
            continue

        clog_url = HG_URL / 'mozilla-central' / 'json-log' / revision[:12]
        clog_obj = http.get_json(clog_url, retry=RETRY)
        if isinstance(clog_obj, (text_type, str)):
            Log.warning(
                "Revision {{cset}} does not exist in the {{branch}} branch",
                cset=revision[:12],
                branch='mozilla-central')
            try_revs[revision[:12]] = True
            if not test_try_revs:
                queue.commit()
                continue
            else:
                json_rev_url = 'https://hg.mozilla.org/try/json-rev/' + revision[:12]
                clog_obj = http.get_json(json_rev_url, retry=RETRY)
                if 'phase' not in clog_obj:
                    Log.warning(
                        "Revision {{cset}} does not exist in the try branch",
                        cset=revision[:12],
                        branch='mozilla-central')
                    queue.commit()
                    continue

                if clog_obj['phase'] == 'draft':
                    repo = 'try'

        else:
            Log.note("Revision {{cset}} exists on mozilla-central.",
                     cset=revision[:12])

        request.branch = repo
        with Timer("Make TUID request from {{timestamp|date}}",
                   {"timestamp": request.meta.request_time}):
            client.enabled = True  # ENSURE THE REQUEST IS MADE
            result = http.post_json("http://localhost:5000/tuid",
                                    json=request,
                                    timeout=10000)
            if not client.enabled:
                Log.note("pausing consumer for {{num}}sec",
                         num=PAUSE_ON_FAILURE)
                Till(seconds=PAUSE_ON_FAILURE).wait()
            if result is None or len(result.data) != len(files):
                Log.warning("expecting response for every file requested")

        queue.commit()
Example #9
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.schema = SnowflakeSchema(self.settings.snowflake)
        self._extract = extract = kwargs.extract

        # SOME PREP
        get_git_revision()

        # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
        with MySQL(**kwargs.snowflake.database) as db:
            processes = None
            try:
                processes = jx.filter(
                    db.query("show processlist"), {
                        "and": [{
                            "neq": {
                                "Command": "Sleep"
                            }
                        }, {
                            "neq": {
                                "Info": "show processlist"
                            }
                        }]
                    })
            except Exception as e:
                Log.warning("no database", cause=e)

            if processes:
                if DEBUG:
                    Log.warning("Processes are running\n{{list|json}}",
                                list=processes)
                else:
                    Log.error("Processes are running\n{{list|json}}",
                              list=processes)

        extract.type = listwrap(extract.type)
        extract.start = listwrap(extract.start)
        extract.batch = listwrap(extract.batch)
        extract.field = listwrap(extract.field)
        if any(
                len(extract.type) != len(other)
                for other in [extract.start, extract.batch, extract.field]):
            Log.error(
                "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object"
            )
        for i, t in enumerate(extract.type):
            if t == "time":
                extract.start[i] = Date(extract.start[i])
                extract.batch[i] = Duration(extract.batch[i])
            elif t == "number":
                pass
            else:
                Log.error('Expecting `extract.type` to be "number" or "time"')

        extract.threads = coalesce(extract.threads, 1)
        self.done_pulling = Signal()
        self.queue = Queue("all batches",
                           max=2 * coalesce(extract.threads, 1),
                           silent=True)

        self.bucket = s3.Bucket(self.settings.destination)
        self.notify = aws.Queue(self.settings.notify)
        Thread.run("get records", self.pull_all_remaining)
Example #10
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--bucket"],
            "help": "bucket to reprocess",
            "type": str,
            "dest": "bucket",
            "required": True
        }, {
            "name": ["--begin", "--start"],
            "help": "lowest key (or prefix) to reprocess",
            "type": str,
            "dest": "start",
            "default": "1",
            "required": False
        }, {
            "name": ["--end", "--stop"],
            "help": "highest key (or prefix) to reprocess",
            "type": str,
            "dest": "end",
            "default": None,
            "required": False
        }, {
            "name": ["--file"],
            "help": "path to file with CR-delimited prefix list",
            "type": str,
            "dest": "file",
            "default": None,
            "required": False
        }])
        Log.start(settings.debug)

        with aws.Queue(settings.work_queue) as work_queue:
            source = Connection(settings.aws).get_bucket(settings.args.bucket)

            if settings.args.file:
                now = Date.now()
                for prefix in File(settings.args.file):
                    all_keys = source.keys(prefix=key_prefix(prefix))
                    for k in all_keys:
                        Log.note("Adding {{key}}", key=k)
                        work_queue.add({
                            "bucket": settings.args.bucket,
                            "key": k,
                            "timestamp": now.unix,
                            "date/time": now.format()
                        })
                return

            if settings.args.end and settings.args.start:
                up_to = str(int(settings.args.end) - 1)
                prefix = strings.common_prefix(settings.args.start, up_to)
            else:
                prefix = None
            start = Version(settings.args.start)
            end = Version(settings.args.end)

            all_keys = source.keys(prefix=prefix)
            with Timer("filtering {{num}} keys", {"num": len(all_keys)}):
                all_keys = [(k, Version(k)) for k in all_keys
                            if k.find("None") == -1]
                all_keys = [(k, p) for k, p in all_keys if start <= p < end]
            with Timer("sorting {{num}} keys", {"num": len(all_keys)}):
                all_keys = qb.sort(all_keys, 1)
            for k, p in all_keys:
                Log.note("Adding {{key}}", key=k)
                now = Date.now()
                work_queue.add({
                    "bucket": settings.args.bucket,
                    "key": k,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

    except Exception as e:
        Log.error("Problem with etl", e)
Example #11
def list_queue(settings, num=10):
    # PEEK AT THE NEXT num MESSAGES WITHOUT CONSUMING THEM
    queue = aws.Queue(settings)
    for _ in range(num):
        content = queue.pop()
        Log.note("{{content}}", content=content)
    queue.rollback()
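
Note that list_queue only peeks: the pops are never committed, and queue.rollback() returns every popped-but-unacknowledged message to the queue. The producer side of the same interface, as used in Examples #7 and #10, is add() for a single message and extend() for a batch; a minimal sketch assuming the same settings object (the payload fields and list_of_keys are hypothetical):

work_queue = aws.Queue(settings=settings.work_queue)
work_queue.add({"bucket": "my-bucket", "key": "some/key"})  # ONE MESSAGE (HYPOTHETICAL PAYLOAD)
work_queue.extend(list_of_keys)                             # MANY MESSAGES AT ONCE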