def start():
    try:
        settings=startup.read_settings()
        Log.start(settings.debug)
        main(settings)
    except Exception, e:
        Log.error("Problems exist", e)
Example #2
0
 def setUp(self):
     # Load the leak-check test settings and open the three ES connections
     # (private, public, public comments) used by the tests.
     settings = startup.read_settings(filename="leak_check_settings.json")
     Log.start(settings.debug)
     self.private = ElasticSearch(settings.private)
     self.public = ElasticSearch(settings.public)
     self.public_comments = ElasticSearch(settings.public_comments)
     self.settings = settings
Example #3
0
def start():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--quick", "--fast"],
            "help":
            "use this to process the first and last block, useful for testing the config settings before doing a full run",
            "action": "store_true",
            "dest": "quick"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help":
            "use this to force a reprocessing of all data",
            "action":
            "store_true",
            "dest":
            "restart"
        }])

        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                for l in struct.listwrap(settings.debug.log):
                    if l.filename:
                        File(l.filename).parent.delete()
                File(settings.param.first_run_time).delete()
                File(settings.param.last_run_time).delete()

            Log.start(settings.debug)
            main(settings)
    except Exception, e:
        Log.fatal("Can not start", e)
    def test_private_etl(self):
        """
        ENSURE IDENTIFIABLE INFORMATION DOES NOT EXIST ON ANY BUGS
        """
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()
        self.settings.param.allow_private_bugs = True

        database.make_test_instance(self.settings.bugzilla)
        es = elasticsearch.make_test_instance("candidate",
                                              self.settings.fake.bugs)
        es_comments = elasticsearch.make_test_instance(
            "candidate_comments", self.settings.fake.comments)
        bz_etl.main(self.settings, es, es_comments)

        ref = elasticsearch.open_test_instance(
            "reference", self.settings.private_bugs_reference)
        compare_both(es, ref, self.settings, self.settings.param.bugs)

        #DIRECT COMPARE THE FILE JSON
        can = File(self.settings.fake.comments.filename).read()
        ref = File(self.settings.private_comments_reference.filename).read()
        if can != ref:
            # BUGFIX: initialize found ONCE, before the scan; the original
            # reset it inside the loop, discarding the recorded position on
            # every iteration (sibling test_public_etl has it outside)
            found = -1
            for i, c in enumerate(can):
                if can[i] != ref[i]:
                    found = i
                    break
            # NOTE(review): MIN([0, found - 100]) always yields a value <= 0,
            # so the slice start can wrap from the end; a MAX clamp to 0 was
            # probably intended -- TODO confirm against project MIN semantics
            Log.error("Comments do not match reference\n{{sample}}",
                      {"sample": can[MIN([0, found - 100]):found + 100]})
def get_pending(es, since):
    """
    RETURN Multiset OF bug_id (WITH VERSION COUNTS) MODIFIED SINCE GIVEN TIME
    """
    # facet size limit; more changed bugs than this is treated as an error
    MAX_BUGS = 200000

    result = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {
            "range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": MAX_BUGS}}}
    })

    if len(result.facets.default.terms) >= MAX_BUGS:
        Log.error("Can not handle more than 200K bugs changed")

    # each facet term carries (bug_id, count of modified versions)
    pending_bugs = Multiset(
        result.facets.default.terms,
        key_field="term",
        count_field="count"
    )
    Log.note("Source has {{num}} bug versions for updating", {
        "num": len(pending_bugs)
    })
    return pending_bugs
def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)
Example #7
0
    def test_ambiguous_whiteboard_screened(self):
        """
        A bug belonging to BOTH a screened group and a non-screened group
        must still have its whiteboard screened in the ETL output.
        """
        GOOD_BUG_TO_TEST=1046

        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

            #MARK BUG AS ONE OF THE SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
            #MARK BUG AS ONE OF THE *NOT* SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST, "not screened")
            db.flush()

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([GOOD_BUG_TO_TEST]) # bug 1046 sees lots of whiteboard, and other field, changes
            param.allow_private_bugs = True

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

            # every version must show a screened (or empty) whiteboard
            for v in versions:
                if v.status_whiteboard not in (None, "", "[screened]"):
                    Log.error("Expecting whiteboard to be screened")
Example #8
0
def get_max_bug_id(es):
    try:
        results = es.search({
            "query": {
                "filtered": {
                    "query": {
                        "match_all": {}
                    },
                    "filter": {
                        "script": {
                            "script": "true"
                        }
                    }
                }
            },
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {
                "0": {
                    "statistical": {
                        "field": "bug_id"
                    }
                }
            }
        })

        if results.facets["0"].count == 0:
            return 0
        return results.facets["0"].max
    except Exception, e:
        Log.error("Can not get_max_bug from {{host}}/{{index}}", {
            "host": es.settings.host,
            "index": es.settings.index
        }, e)
Example #9
0
def start():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--quick", "--fast"],
            "help": "use this to process the first and last block, useful for testing the config settings before doing a full run",
            "action": "store_true",
            "dest": "quick"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help": "use this to force a reprocessing of all data",
            "action": "store_true",
            "dest": "restart"
        }])

        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                for l in struct.listwrap(settings.debug.log):
                    if l.filename:
                        File(l.filename).parent.delete()
                File(settings.param.first_run_time).delete()
                File(settings.param.last_run_time).delete()

            Log.start(settings.debug)
            main(settings)
    except Exception, e:
        Log.fatal("Can not start", e)
Example #10
0
def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER
    """
    # NOTE(review): only the incremental and resume branches are visible in
    # this chunk; the starting-over branch and the final return appear to be
    # truncated -- confirm against the full source
    current_run_time = get_current_time(db)

    if File(settings.param.first_run_time).exists and File(settings.param.last_run_time).exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(File(settings.param.last_run_time).read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif File(settings.param.first_run_time).exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(File(settings.param.first_run_time).read())
            if not es:
                if not settings.es.alias:
                    # swap index/alias so we attach to the newest prototype index
                    temp = ElasticSearch(settings.es).get_proto(settings.es.index)
                    settings.es.alias = settings.es.index
                    settings.es.index = temp.last()
                es = ElasticSearch(settings.es)
                es.set_refresh_interval(1)  #REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY

                if not settings.es_comments.alias:
                    # same index/alias swap for the comments index
                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            # any failure to resume means wipe the marker and start over
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
    def test_public_etl(self):
        """
        ENSURE ETL GENERATES WHAT'S IN THE REFERENCE FILE
        """
        # start from a clean slate: no run-time markers, public mode
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()
        self.settings.param.allow_private_bugs = Null

        database.make_test_instance(self.settings.bugzilla)
        es = elasticsearch.make_test_instance("candidate",
                                              self.settings.fake.bugs)
        es_comments = elasticsearch.make_test_instance(
            "candidate_comments", self.settings.fake.comments)
        bz_etl.main(self.settings, es, es_comments)

        ref = elasticsearch.open_test_instance(
            "reference", self.settings.public_bugs_reference)
        compare_both(es, ref, self.settings, self.settings.param.bugs)

        #DIRECT COMPARE THE FILE JSON
        can = File(self.settings.fake.comments.filename).read()
        ref = File(self.settings.public_comments_reference.filename).read()
        if can != ref:
            # locate the first differing character so a context window can be shown
            found = -1
            for i, c in enumerate(can):
                if can[i] != ref[i]:
                    found = i
                    break
            # NOTE(review): MIN(0, found - 100) is always <= 0, so the slice
            # start can wrap from the end; MAX was probably intended -- confirm
            Log.error("Comments do not match reference\n{{sample}}",
                      {"sample": can[MIN(0, found - 100):found + 100:]})
 def setUp(self):
     # Load the leak-check settings file and open ES connections to the
     # private, public, and public-comments clusters used by the tests.
     settings = startup.read_settings(filename="leak_check_settings.json")
     Log.start(settings.debug)
     self.private = ElasticSearch(settings.private)
     self.public = ElasticSearch(settings.public)
     self.public_comments = ElasticSearch(settings.public_comments)
     self.settings = settings
def verify_public_bugs(es, private_bugs):
    """Fail unless every listed bug has at least one version in ES output."""
    #VERIFY BUGS ARE IN OUTPUT
    for bug_id in private_bugs:
        if not compare_es.get_all_bug_versions(es, bug_id):
            Log.error("Expecting versions for public bug {{bug_id}}",
                      {"bug_id": bug_id})
    def test_whiteboard_screened(self):
        """
        A bug in a screened group must have its whiteboard field screened
        in every ETL'd version.
        """
        GOOD_BUG_TO_TEST = 1046

        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate",
                                                  self.settings.candidate)

            #MARK BUG AS ONE OF THE SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST,
                                   SCREENED_WHITEBOARD_BUG_GROUPS[0])
            db.flush()

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([
                GOOD_BUG_TO_TEST
            ])  # bug 1046 sees lots of whiteboard, and other field, changes
            param.allow_private_bugs = True

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

            # every version must show a screened (or empty) whiteboard
            for v in versions:
                if v.status_whiteboard not in (None, "", "[screened]"):
                    Log.error("Expecting whiteboard to be screened")
Example #15
0
def get_recent_private_comments(db, param):
    """
    GET COMMENTS THAT HAVE HAD THEIR PRIVACY INDICATOR CHANGED
    """
    if param.allow_private_bugs:
        return []

    param.field_id = PRIVATE_COMMENTS_FIELD_ID

    try:
        comments = db.query(
            """
            SELECT
                a.comment_id,
                a.bug_id
            FROM
                bugs_activity a
            WHERE
                bug_when >= {{start_time_str}} AND
                fieldid={{field_id}}
            """, param)

        return comments
    except Exception, e:
        Log.error("problem getting recent private attachments", e)
def get_comments_by_id(db, comments, param):
    """
    GET SPECIFIC COMMENTS
    """
    if param.allow_private_bugs:
        return []

    # only non-private comments among the requested ids
    param.comments_filter = esfilter2sqlwhere(db, {"and": [
        {"term": {"isprivate": 0}},
        {"terms": {"c.comment_id": comments}}
    ]})

    try:
        comments = db.query("""
            SELECT
                c.comment_id,
                c.bug_id,
                p.login_name modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(bug_when, 'US/Pacific','UTC'))*1000 AS modified_ts,
                c.thetext comment,
                c.isprivate
            FROM
                longdescs c
            LEFT JOIN
                profiles p ON c.who = p.userid
            LEFT JOIN
                longdescs_tags t ON t.comment_id=c.comment_id AND t.tag <> 'deleted'
            WHERE
                {{comments_filter}}
            """, param)
            -- NOTE(review): the longdescs_tags join is not referenced in the
            -- SELECT or WHERE; presumably meant to exclude 'deleted' comments
            -- (this note is a Python comment in spirit; see review notes)

        return comments
    except Exception, e:
        Log.error("can not get comment data", e)
Example #17
0
def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
    # keys present in only one of the records, plus keys whose values differ
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set([k for k, v in old_record.items() if v != new_record[k]])

    # field names outside the bugs table are prefixed with the table name
    if table != u"bugs":
        prefix = table + u"."
    else:
        prefix = u""

    for c in changed:
        fieldid=db.query("SELECT id FROM fielddefs WHERE name={{field_name}}", {"field_name": prefix + c})[0].id

        # NOTE(review): `== None` (not `is None`) may be deliberate -- the
        # project's Null type likely compares equal to None -- confirm
        if fieldid == None:
            Log.error("Expecting a valid field name")

        # record one bugs_activity row per changed field
        activity = Struct(
            bug_id=old_record.bug_id,
            who=1,
            bug_when=now,
            fieldid=fieldid,
            removed=old_record[c],
            added=new_record[c],
            attach_id=old_record.attach_id,
            comment_id=old_record.comment_id
        )
        db.insert("bugs_activity", activity)

    # touch the bug's delta_ts so downstream incremental ETL picks it up
    db.execute("UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
        "now":now,
        "where":esfilter2sqlwhere(db, {"term":{"bug_id":old_record.bug_id}})
    })
Example #18
0
    def test_private_etl(self):
        """
        ENSURE IDENTIFIABLE INFORMATION DOES NOT EXIST ON ANY BUGS
        """
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()
        self.settings.param.allow_private_bugs = True

        database.make_test_instance(self.settings.bugzilla)
        es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
        es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
        bz_etl.main(self.settings, es, es_comments)

        ref = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)
        compare_both(es, ref, self.settings, self.settings.param.bugs)

        #DIRECT COMPARE THE FILE JSON
        can = File(self.settings.fake.comments.filename).read()
        ref = File(self.settings.private_comments_reference.filename).read()
        if can != ref:
            # BUGFIX: found must be initialized before the loop; the original
            # reset it on every iteration, losing the first-difference index
            # (the sibling test_public_etl initializes it outside the loop)
            found = -1
            for i, c in enumerate(can):
                if can[i] != ref[i]:
                    found = i
                    break
            # NOTE(review): MIN([0, found - 100]) is always <= 0; MAX clamp
            # to 0 was probably intended -- TODO confirm
            Log.error("Comments do not match reference\n{{sample}}", {"sample": can[MIN([0, found - 100]):found + 100]})
def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)
Example #20
0
    def test_public_etl(self):
        """
        ENSURE ETL GENERATES WHAT'S IN THE REFERENCE FILE
        """
        # clean slate: delete run markers, run in public (screened) mode
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()
        self.settings.param.allow_private_bugs = Null

        database.make_test_instance(self.settings.bugzilla)
        es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
        es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
        bz_etl.main(self.settings, es, es_comments)

        ref = elasticsearch.open_test_instance("reference", self.settings.public_bugs_reference)
        compare_both(es, ref, self.settings, self.settings.param.bugs)

        #DIRECT COMPARE THE FILE JSON
        can = File(self.settings.fake.comments.filename).read()
        ref = File(self.settings.public_comments_reference.filename).read()
        if can != ref:
            # find the first differing character for the error context window
            found = -1
            for i, c in enumerate(can):
                if can[i] != ref[i]:
                    found = i
                    break
            # NOTE(review): MIN(0, found - 100) is always <= 0, so the slice
            # start may wrap from the end; MAX was probably intended -- confirm
            Log.error("Comments do not match reference\n{{sample}}", {"sample": can[MIN(0, found - 100):found + 100:]})
Example #21
0
    def test_incremental_etl_catches_tracking_flags(self):
        """
        An incremental run starting AFTER the tracking flags were added to
        bug 813650 must still pick up the 'fixed' flag values.
        """
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
            param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([813650])
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, 813650)

            # every version modified after the window start must carry all flags
            flags = ["cf_status_firefox18", "cf_status_firefox19", "cf_status_firefox_esr17", "cf_status_b2g18"]
            for v in versions:
                if v.modified_ts>param.start_time:
                    for f in flags:
                        if v[f] != "fixed":
                            Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})
def verify_no_private_bugs(es, private_bugs):
    """Fail if any private bug has versions present in the ES output."""
    #VERIFY BUGS ARE NOT IN OUTPUT
    for bug_id in private_bugs:
        if compare_es.get_all_bug_versions(es, bug_id):
            Log.error("Expecting no version for private bug {{bug_id}}",
                      {"bug_id": bug_id})
    def tearDown(self):
        #CLOSE THE CACHED DB CONNECTIONS
        bz_etl.close_db_connections()

        # any connection still tracked in all_db is a leak
        if all_db:
            Log.error("not all db connections are closed")

        Log.stop()
Example #24
0
    def tearDown(self):
        #CLOSE THE CACHED DB CONNECTIONS
        bz_etl.close_db_connections()

        # a non-empty all_db registry means some connection leaked
        if all_db:
            Log.error("not all db connections are closed")

        Log.stop()
Example #25
0
def verify_public_bugs(es, private_bugs):
    """Fail unless each listed bug has at least one ETL'd version in ES."""
    #VERIFY BUGS ARE IN OUTPUT
    for b in private_bugs:
        versions = compare_es.get_all_bug_versions(es, b)
        if versions:
            continue
        Log.error("Expecting versions for public bug {{bug_id}}", {
            "bug_id": b
        })
Example #26
0
def verify_no_private_bugs(es, private_bugs):
    """Fail if any of the private bugs leaked into the ES output."""
    #VERIFY BUGS ARE NOT IN OUTPUT
    for b in private_bugs:
        if not compare_es.get_all_bug_versions(es, b):
            continue
        Log.error("Expecting no version for private bug {{bug_id}}", {
            "bug_id": b
        })
Example #27
0
def verify_no_private_attachments(es, private_attachments):
    """Fail if any private attachment id appears in the ES bug versions."""
    # hoist the loop-invariant id lookup out of the triple-nested loop and
    # use a set for O(1) membership (was a fresh Q.select per attachment)
    private_attach_ids = set(Q.select(private_attachments, "attach_id"))

    #VERIFY ATTACHMENTS ARE NOT IN OUTPUT
    for b in Q.select(private_attachments, "bug_id"):
        versions = compare_es.get_all_bug_versions(es, b)
        #WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
        #BELONGS TO, IF AT ALL
        for v in versions:
            for a in v.attachments:
                if a.attach_id in private_attach_ids:
                    Log.error("Private attachment should not exist")
def verify_no_private_attachments(es, private_attachments):
    """Fail if any private attachment id appears in the ES bug versions."""
    # compute the private attachment-id set ONCE; the original re-ran
    # Q.select inside the innermost loop for every attachment examined
    private_attach_ids = set(Q.select(private_attachments, "attach_id"))

    #VERIFY ATTACHMENTS ARE NOT IN OUTPUT
    for b in Q.select(private_attachments, "bug_id"):
        versions = compare_es.get_all_bug_versions(es, b)
        #WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
        #BELONGS TO, IF AT ALL
        for v in versions:
            for a in v.attachments:
                if a.attach_id in private_attach_ids:
                    Log.error("Private attachment should not exist")
def get_private_bugs_for_delete(db, param):
    if param.allow_private_bugs:
        return {0}  # NO BUGS TO DELETE

    try:
        with Timer("get all private bug ids"):
            private_bugs = db.query("SELECT DISTINCT bug_id FROM bug_group_map")
            return set(private_bugs.bug_id) | {0}
    except Exception, e:
        Log.error("problem getting private bugs", e)
Example #30
0
def get_private_bugs_for_delete(db, param):
    if param.allow_private_bugs:
        return {0}  # NO BUGS TO DELETE

    try:
        with Timer("get all private bug ids"):
            private_bugs = db.query(
                "SELECT DISTINCT bug_id FROM bug_group_map")
            return set(private_bugs.bug_id) | {0}
    except Exception, e:
        Log.error("problem getting private bugs", e)
Example #31
0
def run_both_etl(db, output_queue, es_comments, param):
    """Run the comment ETL and bug ETL concurrently; surface either failure."""
    comment_thread = Thread.run("etl comments", etl_comments, db, es_comments, param)
    process_thread = Thread.run("etl", etl, db, output_queue, param)

    # join in start order; any stored exception is fatal
    comment_outcome = comment_thread.join()
    if comment_outcome.exception:
        Log.error("etl_comments had problems", comment_outcome.exception)

    process_outcome = process_thread.join()
    if process_outcome.exception:
        Log.error("etl had problems", process_outcome.exception)
def test_replication():
    """Replicate one known bug (537285) between clusters, then stop logging."""
    try:
        settings = startup.read_settings(filename="replication_settings.json")
        Log.start(settings.debug)

        source = ElasticSearch(settings.source)
        destination = replicate.get_or_create_index(settings["destination"], source)

        # replicate everything for the bug since (effectively) forever
        since = CNV.string2datetime("19900101", "%Y%m%d")
        replicate.replicate(source, destination, [537285], since)
    finally:
        Log.stop()
    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        # index incoming records by id, merge into the backing data, persist
        incoming = {}
        for record in records:
            incoming[record["id"]] = record["value"]

        struct.unwrap(self.data).update(incoming)

        File(self.filename).write(CNV.object2JSON(self.data, pretty=True))
        Log.note("{{num}} items added", {"num": len(incoming)})
Example #34
0
def run_both_etl(db, output_queue, es_comments, param):
    """Start comment ETL and bug ETL in parallel threads; fail on either error."""
    comments_worker = Thread.run("etl comments", etl_comments, db, es_comments,
                                 param)
    bugs_worker = Thread.run("etl", etl, db, output_queue, param)

    outcome = comments_worker.join()
    if outcome.exception:
        Log.error("etl_comments had problems", outcome.exception)

    outcome = bugs_worker.join()
    if outcome.exception:
        Log.error("etl had problems", outcome.exception)
Example #35
0
def main(settings, es=None, es_comments=None):
    """
    Top-level ETL driver: decide incremental vs full run, stream bug versions
    into ES, then finalize aliases and record the run time.
    """
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # first_run_time exists but last_run_time does not => the initial fill
    # was interrupted and should be resumed
    resume_from_last_run = File(
        settings.param.first_run_time).exists and not File(
            settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(
                settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(
                    settings.param.look_back,
                    5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(
                    db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                # last_run_time > 0 only for the incremental branch of setup_es
                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments,
                                        output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es,
                                 es_comments, output_queue)

                output_queue.add(Thread.STOP)

        # point the alias at the freshly built index and drop older ones
        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias,
                              settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        # persist the run time so the next invocation is incremental
        File(settings.param.last_run_time).write(
            unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
Example #36
0
def open_test_instance(name, settings):
    """
    Return a file-backed Fake_ES when settings.filename is given, otherwise
    a real ElasticSearch connection.
    """
    if settings.filename:
        # BUGFIX: the moustache placeholder was corrupted ("{(unknown)}");
        # the parameter dict supplies "filename", matching the {{host}} form
        # used in the other branch
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        return ElasticSearch(settings)
def loadAliases(settings):
    # Load the email-alias JSON file into the module-level `aliases` dict;
    # a missing file is tolerated and treated as an empty alias set.
    # NOTE(review): the outer try has no matching except/finally in this
    # chunk -- the function appears truncated; confirm against full source.
    # NOTE(review): the "{(unknown)}" fragments in the log templates look
    # like corrupted {{filename}} placeholders -- confirm before fixing.
    try:
        try:
            with Timer("load alias file at {(unknown)}", {"filename":nvl(settings.param.alias_file.path, settings.param.alias_file)}):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            # best-effort: absence of the alias file is not fatal
            Log.warning("No alias file found (looking at {(unknown)}", {"filename":nvl(settings.param.alias_file.path, settings.param.alias_file)})
            alias_json = "{}"
            #self.aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
Example #38
0
def test_replication():
    """Replicate bug 537285 from source to destination; always stop logging."""
    try:
        settings = startup.read_settings(filename="replication_settings.json")
        Log.start(settings.debug)

        source = ElasticSearch(settings.source)
        destination = replicate.get_or_create_index(settings["destination"],
                                                    source)

        # start date far in the past so the whole bug history is copied
        start = CNV.string2datetime("19900101", "%Y%m%d")
        replicate.replicate(source, destination, [537285], start)
    finally:
        Log.stop()
Example #39
0
def get_comments(db, param):
    """
    Return comments for the bugs in param.bug_list that changed since
    param.start_time_str.  In private mode the comment text is screened.
    """
    if not param.bug_list:
        return []

    if param.allow_private_bugs:
        # private mode: return ALL comments but replace text with a marker
        param.comment_field = SQL("'[screened]' comment")
        param.bug_filter = esfilter2sqlwhere(
            db, {"and": [{
                "terms": {
                    "bug_id": param.bug_list
                }
            }]})
    else:
        # public mode: real text, but only non-private comments
        param.comment_field = SQL("c.thetext comment")
        param.bug_filter = esfilter2sqlwhere(
            db, {
                "and": [{
                    "terms": {
                        "bug_id": param.bug_list
                    }
                }, {
                    "term": {
                        "isprivate": 0
                    }
                }]
            })

    try:
        comments = db.query(
            """
            SELECT
                c.comment_id,
                c.bug_id,
                p.login_name modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(bug_when, 'US/Pacific','UTC'))*1000 AS modified_ts,
                {{comment_field}},
                c.isprivate
            FROM
                longdescs c
            LEFT JOIN
                profiles p ON c.who = p.userid
            LEFT JOIN
                longdescs_tags t ON t.comment_id=c.comment_id AND t.tag <> 'deleted'
            WHERE
                {{bug_filter}} AND
                bug_when >= {{start_time_str}}
            """, param)

        return comments
    except Exception, e:
        Log.error("can not get comment data", e)
def analysis(settings, last_run, please_stop):
    """
    Iteratively resolve email aliases until no more confident matches are
    found, then persist the alias table.

    Reads/writes the module-level globals `bugs` (bug_id -> Multiset of
    email counts) and `aliases` -- assumes both are populated before this
    runs; TODO confirm against caller.

    settings    -- passed through to saveAliases()
    last_run    -- when truthy, lowers the confidence threshold and accepts
                   exact two-email matches
    please_stop -- cooperative cancellation flag checked each iteration
    """
    DIFF = 7
    if last_run:
        DIFF = 4      #ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    try_again = True

    while try_again and not please_stop:
        #FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for bug_id, agg in bugs.iteritems():
            #ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        # candidates: not explicitly ignored, and either negative enough
        # (count <= -DIFF/2) or we are on the final pass
        problems = Q.sort([
            {"email": e, "count": c}
            for e, c in problem_agg.dic.iteritems()
            if not aliases.get(e, Null).ignore and (c <= -(DIFF / 2) or last_run)
        ], ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            #FIND MOST LIKELY MATCH
            # aggregate only the bugs that exhibit this problem email
            solution_agg = Multiset(allow_negative=True)
            for bug_id, agg in bugs.iteritems():
                if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                    solution_agg += agg
            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])

            if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                #exact match
                pass
            elif len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                #not distinctive enough
                continue

            best_solution = solutions[0]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
            })
            # an alias was folded in, so counts changed -- scan again
            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
    def test_confidential_whiteboard_is_screened(self):
        """
        Ensure current bugs in the screened groups have their
        status_whiteboard replaced with "[screened]" (or blanked); any other
        value is a leak and fails the test.
        """
        leaked_whiteboard = get(
            self.private,
            {
                "and": [
                    {
                        "terms": {
                            "bug_group": SCREENED_WHITEBOARD_BUG_GROUPS
                        }
                    },
                    {
                        "exists": {
                            "field": "status_whiteboard"
                        }
                    },
                    {
                        "not": {
                            "terms": {
                                "status_whiteboard": ["", "[screened]"]
                            }
                        }
                    },
                    {
                        "range": {
                            "expires_on": {
                                "gte": NOW
                            }
                        }
                    },  #CURRENT RECORDS
                    {
                        "range": {
                            "modified_ts": {
                                "lt": A_WHILE_AGO
                            }
                        }
                    },  #OF A MINIMUM AGE
                ]
            },
            fields=[
                "bug_id", "product", "component", "status_whiteboard",
                "bug_group", "modified_ts"
            ],
            limit=100)

        if leaked_whiteboard:
            # make timestamps human-readable for the failure report
            for l in leaked_whiteboard:
                l.modified_ts = CNV.datetime2string(
                    CNV.milli2datetime(l.modified_ts))

            # BUG FIX: the template contained "\b" (backspace); a newline
            # was intended, matching the other leak messages in this file
            Log.error("Whiteboard leaking:\n{{leak}}",
                      {"leak": leaked_whiteboard})
Example #42
0
def etl_comments(db, es, param, please_stop):
    """
    Read the comments selected by param from the database and index them
    into es in batches of 500.  The DB connection is created once and kept
    in the module-level comment_db_cache.
    """
    # connections are expensive; lazily open one and cache it
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db_cache.append(DB(db))

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for batch_num, batch in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(batch)}):
            es.extend({"id": comment.comment_id, "value": comment} for comment in batch)
Example #43
0
def etl_comments(db, es, param, please_stop):
    """
    Read the comments selected by param from the database and index them
    into es in batches of 500.

    NOTE(review): this is a verbatim duplicate of an earlier
    etl_comments definition in this file; the later definition shadows
    the earlier one.  Consider removing one of them.
    """
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db = DB(db)
            comment_db_cache.append(comment_db)

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    # index in batches of 500 to bound request size
    for g, c in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
            es.extend({"id": cc.comment_id, "value": cc} for cc in c)
Example #44
0
def make_test_instance(db_settings):
    """
    (Re)build the test database schema: drop and recreate the schema,
    load it from db_settings.filename, then add the longdescs_tags table
    that the dump does not contain.  Does nothing when no filename is
    configured.
    """
    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            #CLEAR SCHEMA
            Log.note("Make empty {{schema}} schema", {"schema":db_settings.schema})
            # connect without selecting a schema so we can drop/create it
            no_schema=db_settings.copy()
            no_schema.schema = None
            with DB(no_schema) as db:
                db.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema":db.quote_column(db_settings.schema)})
                db.execute("CREATE DATABASE {{schema}}", {"schema":db.quote_column(db_settings.schema)})

            #FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema":db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            #ADD MISSING TABLES
            with DB(db_settings) as db:
                db.execute("""
                CREATE TABLE `longdescs_tags` (
                  `id` mediumint(9) NOT NULL AUTO_INCREMENT,
                  `comment_id` int(11) DEFAULT NULL,
                  `tag` varchar(24) NOT NULL,
                  PRIMARY KEY (`id`),
                  UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`),
                  CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id` FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`) ON DELETE CASCADE ON UPDATE CASCADE
                ) DEFAULT CHARSET=utf8""")
        except Exception, e:
            Log.error("Can not setup test database", e)
Example #45
0
    def test_incremental_has_correct_expires_on(self):
        """
        Run a full ETL up to 2013-01-01, then an incremental ETL from that
        point to now, and verify each bug ends with exactly one active
        (unexpired) bug_version record.
        """
        # 813650, 726635 BOTH HAVE CHANGES IN 2013
        bugs = struct.wrap([813650, 726635])
        start_incremental=CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        with DB(self.settings.bugzilla) as db:
            #SETUP FIRST RUN PARAMETERS
            param = Struct()
            param.end_time = start_incremental
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #SETUP INCREMENTAL RUN PARAMETERS
            # same bugs, but starting where the first run ended
            param = Struct()
            param.end_time = CNV.datetime2milli(datetime.utcnow())
            param.start_time = start_incremental
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

        # each bug must have exactly one record whose expires_on is in the future
        for b in bugs:
            results = es.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and":[
                        {"term":{"bug_id":b}},
                        {"range":{"expires_on":{"gte":CNV.datetime2milli(datetime.utcnow())}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id"]
            })

            if results.hits.total>1:
                Log.error("Expecting only one active bug_version record")
Example #46
0
def verify_no_private_comments(es, private_comments):
    """
    Fail (via Log.error) if any of the given private comment ids can be
    found in the es index.
    """
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"comment_id": private_comments}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    }
    result = es.search(query)

    if Q.select(result.hits.hits, "_source"):
        Log.error("Expecting no comments")
Example #47
0
    def test_private_comments_not_leaking(self):
        """
        Fail if any comment marked isprivate appears in the public comments
        index; optionally delete the leaked bugs' comments first.
        """
        leaks = get(
            self.public_comments,
            {"term": {"isprivate": "1"}},
            limit=20
        )
        if not leaks:
            return

        if self.settings.param.delete:
            # scrub every comment belonging to the leaked bugs
            self.public_comments.delete_record(
                {"terms":{"bug_id":leaks.bug_id}}
            )

        Log.error("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
            "num": len(leaks),
            "comments": leaks
        })
def open_test_instance(name, settings):
    """
    Return an ES-like sink for tests: a file-backed Fake_ES when
    settings.filename is configured, otherwise a freshly-created index on
    the configured ES cluster (existing index deleted first).
    """
    if settings.filename:
        # BUG FIX: template placeholder was garbled ("{(unknown)}");
        # the params dict supplies "filename", so interpolate that
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })

        # start from a clean slate
        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
def add_alias(lost, found):
    """
    Record that `lost` is an alias of `found`, then fold the counts in the
    module-level `bugs` aggregates and re-point any aliases that targeted
    `lost`'s old canonical email at `found`.

    Mutates the module-level globals `aliases` and `bugs`.
    NOTE(review): `found_record` is assigned but never used.
    NOTE(review): `lost_record.canonical` is set without a None check --
    assumes aliases.get() never truly returns None here (e.g. a null-safe
    Struct); confirm against the aliases container type.
    """
    found_record = aliases.get(found, None)
    lost_record = aliases.get(lost, None)

    new_canonical = found
    old_canonical = nvl(lost_record.canonical, lost)
    lost_record.canonical = new_canonical

    delete_list = []

    #FOLD bugs ON lost=found
    for bug_id, agg in bugs.iteritems():
        # move lost's count over to found
        v = agg.dic.get(lost, 0)
        if v != 0:
            agg.add(lost, -v)
            agg.add(found, v)

        if not agg:
            delete_list.append(bug_id)

    #FOLD bugs ON old_canonical=new_canonical
    if old_canonical != lost:
        for bug_id, agg in bugs.iteritems():
            v = agg.dic.get(old_canonical, 0)
            if v != 0:
                agg.add(old_canonical, -v)
                agg.add(new_canonical, v)

            if not agg:
                delete_list.append(bug_id)

    # drop bugs whose aggregate became empty
    for d in delete_list:
        del bugs[d]

    #FOLD ALIASES
    # anything that pointed at the old canonical now points at found
    for k, v in aliases.iteritems():
        if v.canonical == old_canonical:
            Log.note(
                "ALIAS REMAPPED: {{alias}}->{{old}} to {{alias}}->{{new}}", {
                    "alias": k,
                    "old": old_canonical,
                    "new": found
                })
            v.canonical = found
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda (x): {"id": x.id, "value": x},
                    map(
                        lambda(x): transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{host}})", {
                    "block": g,
                    "filename": filename
                }, e)
def get_or_create_index(destination_settings, source):
    """
    Return an ElasticSearch handle on the destination index.  When no index
    or alias with the configured name exists, create one using the source's
    schema; when the name is an alias, resolve it to the concrete index.
    """
    #CHECK IF INDEX, OR ALIAS, EXISTS
    destination = ElasticSearch(destination_settings)
    matches = [
        a
        for a in destination.get_aliases()
        if a.alias == destination_settings.index
    ]

    if not matches:
        #CREATE INDEX
        schema = source.get_schema()
        assert schema.settings
        assert schema.mappings
        ElasticSearch.create_index(destination_settings, schema, limit_replicas=True)
    elif len(matches) > 1:
        Log.error("do not know how to replicate to more than one index")
    elif matches[0].alias != None:
        # name is an alias; point the settings at the real index behind it
        destination_settings.alias = destination_settings.index
        destination_settings.index = matches[0].index

    return ElasticSearch(destination_settings)
    def test_private_comments_not_leaking(self):
        """
        Fail if any comment marked isprivate appears in the public comments
        index; optionally delete the leaked bugs' comments first.

        NOTE(review): this repeats an earlier test_private_comments_not_leaking
        definition in this file; the later definition shadows the earlier one.
        """
        leaked_comments = get(self.public_comments,
                              {"term": {
                                  "isprivate": "1"
                              }},
                              limit=20)
        if leaked_comments:
            if self.settings.param.delete:
                # scrub every comment belonging to the leaked bugs
                self.public_comments.delete_record(
                    {"terms": {
                        "bug_id": leaked_comments.bug_id
                    }})

            Log.error(
                "{{num}} comments marked private have leaked!\n{{comments|indent}}",
                {
                    "num": len(leaked_comments),
                    "comments": leaked_comments
                })
Example #53
0
def get_comments_by_id(db, comments, param):
    """
    GET SPECIFIC COMMENTS
    """
    if param.allow_private_bugs:
        return []

    param.comments_filter = esfilter2sqlwhere(db, {
        "and": [{
            "term": {
                "isprivate": 0
            }
        }, {
            "terms": {
                "c.comment_id": comments
            }
        }]
    })

    try:
        comments = db.query(
            """
            SELECT
                c.comment_id,
                c.bug_id,
                p.login_name modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(bug_when, 'US/Pacific','UTC'))*1000 AS modified_ts,
                c.thetext comment,
                c.isprivate
            FROM
                longdescs c
            LEFT JOIN
                profiles p ON c.who = p.userid
            LEFT JOIN
                longdescs_tags t ON t.comment_id=c.comment_id AND t.tag <> 'deleted'
            WHERE
                {{comments_filter}}
            """, param)

        return comments
    except Exception, e:
        Log.error("can not get comment data", e)
def compare_both(candidate, reference, settings, some_bugs):
    """
    Compare the bug version history in the candidate ES cluster against the
    reference cluster for each bug in some_bugs.  Differences are written as
    JSON files under settings.param.errors (try/ and ref/ subdirectories)
    and the run fails via Log.error if any were found.
    """
    # start with a clean error directory
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                # only compare reference versions up to candidate's coverage
                pre_ref_versions = get_all_bug_versions(
                    reference, bug_id, max_time)
                ref_versions = \
                    Q.sort(
                        #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                        [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                        "modified_ts"
                    )

                # compare canonical JSON renderings; write both on mismatch
                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                # keep comparing remaining bugs, but remember the failure
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}",
                            {"bug_id": bug_id}, e)

        if found_errors:
            Log.error("DIFFERENCES FOUND (Differences shown in {{path}})",
                      {"path": [try_dir, ref_dir]})
Example #55
0
def main(settings):
    """
    Report the alias mappings in settings.param.alias_file: log every
    lost->found (canonical) pair, then the inverted canonical->aliases map,
    highlighting canonicals with more than 3 aliases.
    """
    # BUG FIX: locals renamed -- `file` and `sorted` shadowed builtins
    alias_file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(alias_file.read())

    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    # every email that maps to a different canonical email
    data = [{
        "lost": n,
        "found": d.canonical
    } for n, d in aliases.items() if d.canonical != None and n != d.canonical]

    sorted_pairs = Q.sort(data, "found")
    for s in sorted_pairs:
        Log.note("{{found}} == {{lost}}", s)

    # same mapping, minus blank keys, as a plain dict
    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    # call out canonicals that absorbed many aliases
    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))