def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings)
    except Exception as e:
        Log.error("Problems exist", e)
Example 2
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        backfill(settings)
    except Exception as e:
        Log.error("Problem with backfill", e)
Example 3
def main():

    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id(s) to process.  Use \"..\" for a range.",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.id:
            etl_one(settings)
            return

        hg = HgMozillaOrg(settings=settings.hg)
        resources = Dict(hg=dictwrap(hg))
        stopper = Signal()
        for i in range(coalesce(settings.param.threads, 1)):
            ETL(name="ETL Loop " + unicode(i),
                work_queue=settings.work_queue,
                resources=resources,
                workers=settings.workers,
                settings=settings.param,
                please_stop=stopper)

        Thread.wait_for_shutdown_signal(stopper, allow_exit=True)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 4
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        constants.set(settings.constants)

        with startup.SingleInstance(flavor_id=settings.args.filename):
            with aws.s3.Bucket(settings.destination) as bucket:

                if settings.param.debug:
                    if settings.source.durable:
                        Log.error("Can not run in debug mode with a durable queue")
                    synch = SynchState(bucket.get_key(SYNCHRONIZATION_KEY, must_exist=False))
                else:
                    synch = SynchState(bucket.get_key(SYNCHRONIZATION_KEY, must_exist=False))
                    if settings.source.durable:
                        synch.startup()

                queue = PersistentQueue(settings.param.queue_file)
                if queue:
                    last_item = queue[len(queue) - 1]
                    synch.source_key = last_item._meta.count + 1

                with pulse.Consumer(settings=settings.source, target=None, target_queue=queue, start=synch.source_key):
                    Thread.run("pulse log loop", log_loop, settings, synch, queue, bucket)
                    Thread.wait_for_shutdown_signal(allow_exit=True)
                    Log.warning("starting shutdown")

                queue.close()
                Log.note("write shutdown state to S3")
                synch.shutdown()

    except Exception as e:
        Log.error("Problem with etl", e)
Example 5
def main():

    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id(s) to process.  Use \"..\" for a range.",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.id:
            etl_one(settings)
            return

        hg = HgMozillaOrg(settings=settings.hg)
        resources = Dict(hg=dictwrap(hg))
        stopper = Signal()
        for i in range(coalesce(settings.param.threads, 1)):
            ETL(
                name="ETL Loop " + unicode(i),
                work_queue=settings.work_queue,
                resources=resources,
                workers=settings.workers,
                settings=settings.param,
                please_stop=stopper
            )

        Thread.wait_for_shutdown_signal(stopper, allow_exit=True)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 6
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    settings = startup.read_settings(defs=[
        {
            "name": ["--bucket"],
            "help": "bucket to scan",
            "type": str,
            "dest": "bucket",
            "required": True
        }
    ])
    Log.start(settings.debug)

    source = Connection(settings.aws).get_bucket(settings.args.bucket)

    for k in qb.sort(source.keys()):
        try:
            data = source.read_bytes(k)
            if convert.ascii2unicode(data).find("2e2834fa7ecd8d3bb1ad49ec981fdb89eb4df95e18") >= 0:
                Log.note("Found at {{key}}", key=k)
        except Exception as e:
            Log.warning("Problem with {{key}}", key=k, cause=e)
Example 8
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        aws_args = dict(
            region_name=settings.aws.region,
            aws_access_key_id=unwrap(settings.aws.aws_access_key_id),
            aws_secret_access_key=unwrap(settings.aws.aws_secret_access_key)
        )
        ec2_conn = boto_ec2.connect_to_region(**aws_args)

        instances = _get_managed_instances(ec2_conn, settings.name)

        for i in instances:
            Log.note("Reset {{instance_id}} ({{name}}) at {{ip}}", insance_id=i.id, name=i.tags["Name"], ip=i.ip_address)
            _config_fabric(settings.fabric, i)
            try:
                _refresh_etl()  # TODO: UPON FAILURE, TERMINATE INSTANCE AND SPOT REQUEST
            except Exception as e:
                ec2_conn.terminate_instances([i.id])
                Log.warning("Problem resetting {{instance}}, terminated", instance=i.id, cause=e)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 9
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        some_failures = http.post_json("http://activedata.allizom.org/query", data={
            "from": "unittest",
            "select": [
                {"name": "branch", "value": "build.branch"},
                {"name": "revision", "value": "build.revision12"},
                {"name": "suite", "value": "run.suite"},
                {"name": "chunk", "value": "run.chunk"},
                {"name": "test", "value": "result.test"}
            ],
            "where": {"and": [
                {"eq": {"result.ok": False}},
                {"gt": {"run.timestamp": Date.today() - WEEK}},
                {"missing": "treeherder.job.note"}
            ]},
            "format": "list",
            "limit": 10
        })


        th = TreeHerder(settings={})

        # th.get_job_classification("mozilla-inbound", "7380457b8ba0")
        for f in some_failures.data:
            th.get_job_classification(f.branch, f.revision)

    except Exception as e:
        Log.error("Problem with etl", e)
Example 10
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        some_failures = http.post_json("http://activedata.allizom.org/query", data={
            "from": "unittest",
            "select": [
                {"name": "branch", "value": "build.branch"},
                {"name": "revision", "value": "build.revision12"},
                {"name": "suite", "value": "run.suite"},
                {"name": "chunk", "value": "run.chunk"},
                {"name": "test", "value": "result.test"}
            ],
            "where": {"and": [
                {"eq": {"result.ok": False}},
                {"gt": {"run.timestamp": Date.today() - WEEK}},
                {"missing": "treeherder.job.note"}
            ]},
            "format": "list",
            "limit": 10
        })

        th = TreeHerder(settings={})

        # th.get_job_classification("mozilla-inbound", "7380457b8ba0")
        for f in some_failures.data:
            th.get_job_classification(f.branch, f.revision)

    except Exception as e:
        Log.error("Problem with etl", e)
Example 11
def main():
    settings = startup.read_settings()
    Log.start(settings.debug)
    try:
        with Multithread(update_repo, threads=10, outbound=False) as multi:
            for repo in Random.combination(settings.param.repos):
                multi.execute([{"repos": repo, "settings": settings}])
    finally:
        Log.stop()
Example 13
def main():
    settings = startup.read_settings()
    constants.set(settings.constants)
    Log.start(settings.debug)
    with startup.SingleInstance(flavor_id=settings.args.filename):
        try:
            full_etl(settings)
        finally:
            Log.stop()
Example 14
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        please_stop = Signal("main stop signal")
        Thread.wait_for_shutdown_signal(please_stop)
    except Exception as e:
        Log.error("Problem with etl", cause=e)
Example 15
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id (prefix, really) to process",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        queries.config.default = {
            "type": "elasticsearch",
            "settings": settings.elasticsearch.copy()
        }

        if settings.args.id:
            work_queue = Queue("local work queue")
            work_queue.extend(parse_id_argument(settings.args.id))
        else:
            work_queue = aws.Queue(settings=settings.work_queue)

        Log.note("Listen to queue {{queue}}, and read off of {{s3}}",
                 queue=settings.work_queue.name,
                 s3=settings.source.bucket)

        es = MultiDayIndex(settings.elasticsearch, queue_size=100000)

        threads = []
        please_stop = Signal()
        for _ in range(settings.threads):
            p = Thread.run("copy to es",
                           copy2es,
                           es,
                           settings,
                           work_queue,
                           please_stop=please_stop)
            threads.append(p)

        def monitor_progress(please_stop):
            while not please_stop:
                Log.note("Remaining: {{num}}", num=len(work_queue))
                Thread.sleep(seconds=10)

        Thread.run(name="monitor progress",
                   target=monitor_progress,
                   please_stop=please_stop)

        aws.capture_termination_signal(please_stop)
        Thread.wait_for_shutdown_signal(please_stop=please_stop,
                                        allow_exit=True)
        please_stop.go()
        Log.note("Shutdown started")
    except Exception as e:
        Log.error("Problem with etl", e)
Example 16
def setup(settings=None):
    global config

    try:
        config = startup.read_settings(
            defs={
                "name": ["--process_num", "--process"],
                "help": "Additional port offset (for multiple Flask processes)",
                "type": int,
                "dest": "process_num",
                "default": 0,
                "required": False
            },
            filename=settings
        )
        constants.set(config.constants)
        Log.start(config.debug)

        if config.args.process_num and config.flask.port:
            config.flask.port += config.args.process_num

        # PIPE REQUEST LOGS TO ES DEBUG
        if config.request_logs:
            request_logger = elasticsearch.Cluster(
                config.request_logs).get_or_create_index(config.request_logs)
            active_data.request_log_queue = request_logger.threaded_queue(
                max_size=2000)

        # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
        containers.config.default = {
            "type": "elasticsearch",
            "settings": config.elasticsearch.copy()
        }

        # TURN ON /exit FOR WINDOWS DEBUGGING
        if config.flask.debug or config.flask.allow_exit:
            config.flask.allow_exit = None
            Log.warning("ActiveData is in debug mode")
            app.add_url_rule('/exit', 'exit', _exit)

        # TRIGGER FIRST INSTANCE
        FromESMetadata(config.elasticsearch)
        if config.saved_queries:
            setattr(save_query, "query_finder",
                    SaveQueries(config.saved_queries))
        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        if config.flask.ssl_context:
            if config.args.process_num:
                Log.error(
                    "can not serve ssl and multiple Flask instances at once")
            setup_ssl()

        return app
    except Exception as e:
        Log.error(
            "Serious problem with ActiveData service construction!  Shutdown!",
            cause=e)
Example 17
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        hg = HgMozillaOrg(settings.hg)
        th = TreeHerder(settings=settings)
        find_some_work(th)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 18
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        big_data.MAX_STRING_SIZE = 100 * 1000 * 1000

        # get_active_data(settings)
        get_bugs(settings)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 19
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)

        source = get_container(settings.source)
        destination = get_container(settings.destination)

        work_queue = aws.Queue(settings.work_queue)
        backfill(source, destination, work_queue, settings)
    except Exception as e:
        Log.error("Problem with backfill", e)
Example 22
def main():
    settings = startup.read_settings(defs={
       "name": ["--restart", "--reset", "--redo"],
       "help": "force a reprocessing of all data",
       "action": "store_true",
       "dest": "restart"
    })
    Log.start(settings.debug)

    try:
        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                reviews = Cluster(settings.destination).create_index(settings.destination)
            else:
                reviews = Cluster(settings.destination).get_proto(settings.destination)

            bugs = Cluster(settings.source).get_index(settings.source)

            with FromES(bugs) as esq:
                es_max_bug = esq.query({
                    "from": "private_bugs",
                    "select": {"name": "max_bug", "value": "bug_id", "aggregate": "maximum"}
                })

            # PROBE WHAT RANGE OF BUGS IS LEFT TO DO (IN EVENT OF FAILURE)
            with FromES(reviews) as esq:
                es_min_bug = esq.query({
                    "from": "reviews",
                    "select": {"name": "min_bug", "value": "bug_id", "aggregate": "minimum"}
                })

            batch_size = coalesce(bugs.settings.batch_size, settings.size, 1000)
            threads = coalesce(settings.threads, 4)
            Log.note(str(settings.min_bug))
            min_bug = int(coalesce(settings.min_bug, 0))
            max_bug = int(coalesce(settings.max_bug, Math.min(es_min_bug + batch_size * threads, es_max_bug)))

            with ThreadedQueue(reviews, batch_size=coalesce(reviews.settings.batch_size, 100)) as sink:
                func = functools.partial(full_etl, settings, sink)
                with Multithread(func, threads=threads) as m:
                    m.inbound.silent = True
                    Log.note("bugs from {{min}} to {{max}}, step {{step}}", {
                        "min": min_bug,
                        "max": max_bug,
                        "step": batch_size
                    })
                    m.execute(reversed([{"bugs": range(s, e)} for s, e in qb.intervals(min_bug, max_bug, size=1000)]))

            if settings.args.restart:
                reviews.add_alias()
                reviews.delete_all_but_self()
    finally:
        Log.stop()
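
The qb.intervals(min_bug, max_bug, size=1000) call above splits the bug-id range into fixed-size batches for Multithread to work through in reverse. A standalone sketch of such an interval helper (the name only mimics the call above; this is not pyLibrary's implementation):

def intervals(start, stop, size):
    # yield (lo, hi) pairs covering [start, stop) in chunks of `size`
    for lo in range(start, stop, size):
        yield lo, min(lo + size, stop)

# batches of bug ids, submitted largest-first as in m.execute(reversed([...]))
batches = [{"bugs": range(lo, hi)} for lo, hi in intervals(0, 3500, 1000)]
for job in reversed(batches):
    print(job)
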
Example 23
def start():
    global hg
    global config
    _ = wrap

    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        if config.hg:
            hg = HgMozillaOrg(config.hg)
        main()
    except Exception as e:
        Log.error("Problems exist", e)
Example 24
def main():
    global all_creds
    global config

    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        all_creds = config.users

        app.run(**config.flask)
    except Exception as e:
        Log.error("Serious problem with MoDataSubmission service!  Shutdown completed!", cause=e)
Example 25
def main():
    try:
        settings = startup.read_settings(defs={
            "name": ["--num"],
            "help": "number to show",
            "type": int,
            "dest": "num",
            "default": '10',
            "required": False
        })
        Log.start(settings.debug)
        list_queue(settings.source, settings.args.num)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 26
def main():
    try:
        settings = startup.read_settings(defs={
            "name": ["--filter", "--where"],
            "help": "ES filter",
            "type": str,
            "dest": "filter",
            "default": '{"match_all":{}}',
            "required": True
        })
        Log.start(settings.debug)
        list_s3(settings.source, convert.json2value(convert.ascii2unicode(settings.args.filter)))
    except Exception as e:
        Log.error("Problem with etl", e)
Example 27
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id (prefix, really) to process",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        diff(settings)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 28
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(settings=settings.hg.branches).get_or_create_index(settings=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
Example 29
def main():
    global config

    try:
        config = startup.read_settings()
        with startup.SingleInstance(flavor_id=config.args.filename):
            constants.set(config.constants)
            Log.start(config.debug)

            es = elasticsearch.Cluster(config.destination).get_or_create_index(config.destination)

            please_stop = Signal()
            Thread.run("aggregator", loop_all_days, es, please_stop=please_stop)
            Thread.wait_for_shutdown_signal(please_stop=please_stop, allow_exit=True)
    except Exception as e:
        Log.error("Serious problem with Test Failure Aggregator service!  Shutdown completed!", cause=e)
Example 30
def main():
    try:
        settings = startup.read_settings(
            defs={
                "name": ["--filter", "--where"],
                "help": "ES filter",
                "type": str,
                "dest": "filter",
                "default": '{"match_all":{}}',
                "required": True
            })
        Log.start(settings.debug)
        list_s3(
            settings.source,
            convert.json2value(convert.ascii2unicode(settings.args.filter)))
    except Exception as e:
        Log.error("Problem with etl", e)
Example 31
def main():
    try:
        settings = startup.read_settings(defs=[
            {
                "name": ["--id"],
                "help": "id (prefix, really) to process",
                "type": str,
                "dest": "id",
                "required": False
            }
        ])
        constants.set(settings.constants)
        Log.start(settings.debug)

        diff(settings)
    except Exception as e:
        Log.error("Problem with etl", e)
Example 32
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = get_branches(settings.hg)

        es = elasticsearch.Cluster(
            settings=settings.hg.branches).get_or_create_index(
                settings=settings.hg.branches)
        es.add_alias()
        es.extend({
            "id": b.name + " " + b.locale,
            "value": b
        } for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
Example 33
def main():
    try:
        config = startup.read_settings(defs=[{
            "name": ["--file"],
            "help": "file to save backup",
            "type": str,
            "dest": "file",
            "required": True
        }])
        constants.set(config.constants)
        Log.start(config.debug)

        sq = elasticsearch.Index(settings=config.saved_queries)
        result = sq.search({"query": {"match_all": {}}, "size": 200000})

        File(config.args.file).write("".join(
            map(convert.value2json, result.hits.hits)))

    except Exception as e:
        Log.error("Problem with etl", e)
Example 34
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        aws_args = dict(
            region_name=settings.aws.region,
            aws_access_key_id=unwrap(settings.aws.aws_access_key_id),
            aws_secret_access_key=unwrap(settings.aws.aws_secret_access_key)
        )
        ec2_conn = boto_ec2.connect_to_region(**aws_args)

        instances = _get_managed_instances(ec2_conn, settings.name)

        for i in instances:
            Log.note("Reset {{instance_id}} ({{name}}) at {{ip}}", insance_id=i.id, name=i.tags["Name"], ip=i.ip_address)
            _config_fabric(settings.fabric, i)
            _refresh_indexer()
    except Exception as e:
        Log.error("Problem with etl", e)
Example 35
def main():
    try:
        config = startup.read_settings()
        with startup.SingleInstance(flavor_id=config.args.filename):
            constants.set(config.constants)
            Log.start(config.debug)

            please_stop = Signal("main stop signal")
            coverage_index = elasticsearch.Cluster(config.source).get_index(settings=config.source)
            config.destination.schema = coverage_index.get_schema()
            coverage_summary_index = elasticsearch.Cluster(config.destination).get_or_create_index(read_only=False, settings=config.destination)
            coverage_summary_index.add_alias(config.destination.index)
            Thread.run(
                "processing loop",
                loop,
                config.source,
                coverage_summary_index,
                config,
                please_stop=please_stop
            )
            Thread.wait_for_shutdown_signal(please_stop)
    except Exception as e:
        Log.error("Problem with code coverage score calculation", cause=e)
Example 36
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        aws_args = dict(
            region_name=settings.aws.region,
            aws_access_key_id=unwrap(settings.aws.aws_access_key_id),
            aws_secret_access_key=unwrap(settings.aws.aws_secret_access_key))
        ec2_conn = boto_ec2.connect_to_region(**aws_args)

        instances = _get_managed_instances(ec2_conn, settings.name)

        for i in instances:
            Log.note("Reset {{instance_id}} ({{name}}) at {{ip}}",
                     instance_id=i.id,
                     name=i.tags["Name"],
                     ip=i.ip_address)
            _config_fabric(settings.fabric, i)
            _refresh_indexer()
    except Exception as e:
        Log.error("Problem with etl", e)
Example 37
@app.route('/', defaults={'path': ''}, methods=['GET', 'POST'])
@app.route('/<path:path>', methods=['GET', 'POST'])
def catch_all(path):
    return Response(
        b"",
        status=400,
        headers={
            "access-control-allow-origin": "*",
            "content-type": "text/html"
        }
    )

if __name__ == "__main__":
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SETUP TREEHERDER CACHE
        hg = HgMozillaOrg(use_cache=True, settings=config.hg)
        th = TreeherderService(hg, settings=config.treeherder)
        app.add_url_rule('/treeherder', None, th.get_treeherder_job, methods=['GET'])

        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        app.run(**config.flask)
    except Exception as e:
        Log.error("Serious problem with service construction!  Shutdown!", cause=e)
    finally:
        Log.stop()
Example 38
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    try:
        settings = startup.read_settings(defs=[
            {
                "name": ["--bucket"],
                "help": "bucket to reprocess",
                "type": str,
                "dest": "bucket",
                "required": True
            },
            {
                "name": ["--begin", "--start"],
                "help": "lowest key (or prefix) to reprocess",
                "type": str,
                "dest": "start",
                "default": "1",
                "required": False
            },
            {
                "name": ["--end", "--stop"],
                "help": "highest key (or prefix) to reprocess",
                "type": str,
                "dest": "end",
                "default": None,
                "required": False
            },
            {
                "name": ["--file"],
                "help": "path to file with CR-delimited prefix list",
                "type": str,
                "dest": "file",
                "default": None,
                "required": False
            }
        ])
        Log.start(settings.debug)

        with aws.Queue(settings.work_queue) as work_queue:
            source = Connection(settings.aws).get_bucket(settings.args.bucket)

            if settings.args.file:
                now = Date.now()
                for prefix in File(settings.args.file):
                    all_keys = source.keys(prefix=key_prefix(prefix))
                    for k in all_keys:
                        Log.note("Adding {{key}}", key=k)
                        work_queue.add({
                            "bucket": settings.args.bucket,
                            "key": k,
                            "timestamp": now.unix,
                            "date/time": now.format()
                        })
                return

            if settings.args.end and settings.args.start:
                up_to = str(int(settings.args.end) - 1)
                prefix = strings.common_prefix(settings.args.start, up_to)
            else:
                prefix = None
            start = Version(settings.args.start)
            end = Version(settings.args.end)

            all_keys = source.keys(prefix=prefix)
            with Timer("filtering {{num}} keys", {"num": len(all_keys)}):
                all_keys = [(k, Version(k)) for k in all_keys if k.find("None") == -1]
                all_keys = [(k, p) for k, p in all_keys if start <= p < end]
            with Timer("sorting {{num}} keys", {"num": len(all_keys)}):
                all_keys = qb.sort(all_keys, 1)
            for k, p in all_keys:
                Log.note("Adding {{key}}",  key= k)
                now = Date.now()
                work_queue.add({
                    "bucket": settings.args.bucket,
                    "key": k,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

    except Exception as e:
        Log.error("Problem with etl", e)
Example 39
def setUp(self):
    config = startup.read_settings(filename=CONFIG_FILE)
    Log.start(config.debug)
    constants.set(config.constants)
    app.config = config
Example 40
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--bucket"],
            "help": "bucket to reprocess",
            "type": str,
            "dest": "bucket",
            "required": True
        }, {
            "name": ["--begin", "--start"],
            "help": "lowest key (or prefix) to reprocess",
            "type": str,
            "dest": "start",
            "default": "1",
            "required": False
        }, {
            "name": ["--end", "--stop"],
            "help": "highest key (or prefix) to reprocess",
            "type": str,
            "dest": "end",
            "default": None,
            "required": False
        }, {
            "name": ["--file"],
            "help": "path to file with CR-delimited prefix list",
            "type": str,
            "dest": "file",
            "default": None,
            "required": False
        }])
        Log.start(settings.debug)

        with aws.Queue(settings.work_queue) as work_queue:
            source = Connection(settings.aws).get_bucket(settings.args.bucket)

            if settings.args.file:
                now = Date.now()
                for prefix in File(settings.args.file):
                    all_keys = source.keys(prefix=key_prefix(prefix))
                    for k in all_keys:
                        Log.note("Adding {{key}}", key=k)
                        work_queue.add({
                            "bucket": settings.args.bucket,
                            "key": k,
                            "timestamp": now.unix,
                            "date/time": now.format()
                        })
                return

            if settings.args.end and settings.args.start:
                up_to = str(int(settings.args.end) - 1)
                prefix = strings.common_prefix(settings.args.start, up_to)
            else:
                prefix = None
            start = Version(settings.args.start)
            end = Version(settings.args.end)

            all_keys = source.keys(prefix=prefix)
            with Timer("filtering {{num}} keys", {"num": len(all_keys)}):
                all_keys = [(k, Version(k)) for k in all_keys
                            if k.find("None") == -1]
                all_keys = [(k, p) for k, p in all_keys if start <= p < end]
            with Timer("sorting {{num}} keys", {"num": len(all_keys)}):
                all_keys = qb.sort(all_keys, 1)
            for k, p in all_keys:
                Log.note("Adding {{key}}", key=k)
                now = Date.now()
                work_queue.add({
                    "bucket": settings.args.bucket,
                    "key": k,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

    except Exception as e:
        Log.error("Problem with etl", e)
Example 41

@app.route('/', defaults={'path': ''}, methods=['GET', 'POST'])
@app.route('/<path:path>', methods=['GET', 'POST'])
def catch_all(path):
    return Response(b"",
                    status=400,
                    headers={
                        "access-control-allow-origin": "*",
                        "content-type": "text/html"
                    })


if __name__ == "__main__":
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SETUP TREEHERDER CACHE
        hg = HgMozillaOrg(use_cache=True, settings=config.hg)
        th = TreeherderService(hg, settings=config.treeherder)
        app.add_url_rule('/treeherder',
                         None,
                         th.get_treeherder_job,
                         methods=['GET'])

        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        app.run(**config.flask)
    except Exception as e:
        Log.error("Serious problem with service construction!  Shutdown!", cause=e)
    finally:
        Log.stop()