def verify_no_private_attachments(es, private_attachments):
    #VERIFY ATTACHMENTS ARE NOT IN OUTPUT
    for b in Q.select(private_attachments, "bug_id"):
        versions = compare_es.get_all_bug_versions(es, b)
        #WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
        #BELONGS TO, IF AT ALL
        for v in versions:
            for a in v.attachments:
                if a.attach_id in Q.select(private_attachments, "attach_id"):
                    Log.error("Private attachment should not exist")
Code Example #4
def analysis(settings, last_run, please_stop):
    DIFF = 7
    if last_run:
        DIFF = 4      #ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    try_again = True

    while try_again and not please_stop:
        #FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for bug_id, agg in bugs.iteritems():
            #ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        problems = Q.sort([
            {"email": e, "count": c}
            for e, c in problem_agg.dic.iteritems()
            if not aliases.get(e, Null).ignore and (c <= -(DIFF / 2) or last_run)
        ], ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            #FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for bug_id, agg in bugs.iteritems():
                if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                    solution_agg += agg
            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])

            if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                #exact match
                pass
            elif len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                #not distinctive enough
                continue

            best_solution = solutions[0]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
            })
            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
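
The gate in the middle of the loop is a margin test: the best candidate must beat the runner-up by more than DIFF occurrences before it is accepted as an alias target. A small sketch of that test, using a plain Counter in place of Multiset (the counts are hypothetical):

from collections import Counter

solution_agg = Counter({"a@x.org": 9, "b@x.org": 2})
solutions = solution_agg.most_common()  # sorted by count, descending
DIFF = 4
if len(solutions) <= 1 or solutions[1][1] + DIFF >= solutions[0][1]:
    print("not distinctive enough; skip")
else:
    print("accept %s as the alias target" % solutions[0][0])  # a@x.org wins: 9 beats 2+4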
Code Example #5
File: bz_etl.py Project: klahnakoski/Bugzilla-ETL
def incremental_etl(settings, param, db, es, es_comments, output_queue):
    ####################################################################
    ## ES TAKES TIME TO DELETE RECORDS, DO DELETE FIRST WITH HOPE THE
    ## INDEX GETS A REWRITE DURING ADD OF NEW RECORDS
    ####################################################################

    #REMOVE PRIVATE BUGS
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs})
    for g, delete_bugs in Q.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})
        if still_existing:
            Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": still_existing})
        es.delete_record({"terms": {"bug_id": delete_bugs}})
        es_comments.delete_record({"terms": {"bug_id": delete_bugs}})


    #RECENT PUBLIC BUGS
    possible_public_bugs = get_recent_private_bugs(db, param)
    if param.allow_private_bugs:
        #PRIVATE BUGS
        #    A CHANGE IN PRIVACY INDICATOR MEANS THE WHITEBOARD IS AFFECTED, REDO
        es.delete_record({"terms": {"bug_id": possible_public_bugs}})
    else:
        #PUBLIC BUGS
        #    IF ADDING GROUP THEN private_bugs ALREADY DID THIS
        #    IF REMOVING GROUP THEN NO RECORDS TO DELETE
        pass

    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
    es.delete_record({"terms": {"bug_id": bugs_to_refresh}})

    #REBUILD BUGS THAT GOT REMOVED
    bug_list = (possible_public_bugs | bugs_to_refresh) - private_bugs # REMOVE PRIVATE BUGS
    if bug_list:
        refresh_param = param.copy()
        refresh_param.bug_list = bug_list
        refresh_param.start_time = 0
        refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

        try:
            etl(db, output_queue, refresh_param.copy(), please_stop=None)
            etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
        except Exception, e:
            Log.error("Problem with etl using parameters {{parameters}}", {
                "parameters": refresh_param
            }, e)
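
Q.groupby(..., size=1000) evidently chunks the id list so each delete_record call carries a bounded terms filter. The same batching in plain Python, assuming an ordinary list of ids:

ids = list(range(2500))
for start in range(0, len(ids), 1000):
    batch = ids[start:start + 1000]  # plays the role of one Q.groupby(..., size=1000) group
    print("would delete %d bugs" % len(batch))  # e.g. es.delete_record({"terms": {"bug_id": batch}})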
Code Example #6
def main(settings):
    file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(file.read())

    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    data = [{
        "lost": n,
        "found": d.canonical
    } for n, d in aliases.items() if d.canonical != None and n != d.canonical]

    sorted = Q.sort(data, "found")
    for s in sorted:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))
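
struct.inverse evidently turns the many-to-one {lost: canonical} map into a {canonical: [lost, ...]} map, which is what the final length check iterates over. An equivalent stdlib sketch:

from collections import defaultdict

clean = {"old1@x.org": "me@x.org", "old2@x.org": "me@x.org"}
rev_clean = defaultdict(list)
for lost, canonical in clean.items():
    rev_clean[canonical].append(lost)
print(dict(rev_clean))  # {'me@x.org': ['old1@x.org', 'old2@x.org']}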
def replicate(source, destination, pending, last_updated):
    """
    COPY source RECORDS TO destination
    """
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"modified_ts":
                            {"gte": CNV.datetime2milli(last_updated)}
                        }}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            d2 = map(
                lambda(x): {"id": x.id, "value": x},
                map(
                    lambda(x): transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x._source), old_school=True),
                    data.hits.hits
                )
            )
            destination.extend(d2)
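
The search filter restricts each copy to the current id batch and to records touched since last_updated, so repeated runs only move new versions. A stdlib sketch of that watermark filter, with hypothetical records:

import datetime

last_updated = datetime.datetime(2013, 1, 1)
batch_ids = {1, 2}
records = [
    {"bug_id": 1, "modified_ts": datetime.datetime(2013, 2, 1)},
    {"bug_id": 2, "modified_ts": datetime.datetime(2012, 12, 1)},
]
pending = [r for r in records
           if r["bug_id"] in batch_ids and r["modified_ts"] >= last_updated]
print(len(pending))  # 1: only the record modified after the watermark is copied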
Code Example #10
def get(es, esfilter, fields=None, limit=None):
    query = struct.wrap({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": esfilter
        }},
        "from": 0,
        "size": nvl(limit, 200000),
        "sort": []
    })

    if fields:
        query.fields=fields
        results = es.search(query)
        return Q.select(results.hits.hits, "fields")
    else:
        results = es.search(query)
        return Q.select(results.hits.hits, "_source")
Code Example #12
def aggregator(data):
    """
    FLATTEN CC LISTS OVER TIME BY BUG
    MULTISET COUNTS THE NUMBER OF EMAIL AT BUG CREATION
    NEGATIVE MEANS THERE WAS AN ADD WITHOUT A REMOVE (AND NOT IN CURRENT LIST)
    """
    for d in data:
        new_emails = Q.map2set(split_email(d.new_value), alias)
        old_emails = Q.map2set(split_email(d.old_value), alias)

        for e in new_emails | old_emails:
            details = aliases.get(e, Struct())
            aliases[e] = details

        agg = bugs.get(d.bug_id, Multiset(allow_negative=True))
        agg = agg - new_emails
        agg = agg + old_emails
        bugs[d.bug_id] = agg
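
The signed arithmetic is the point: new emails are subtracted and old emails added back, so a count that stays negative marks an address that was added and never removed. The same bookkeeping with a plain dict, for one hypothetical CC change:

agg = {}

def signed_add(agg, email, amount):
    agg[email] = agg.get(email, 0) + amount

for e in {"a@x.org", "b@x.org"}:  # new_emails: subtract
    signed_add(agg, e, -1)
for e in {"a@x.org"}:             # old_emails: add back
    signed_add(agg, e, +1)
print(agg)  # a@x.org nets to 0; b@x.org stays at -1, an add without a remove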
Code Example #13
def get_screened_whiteboard(db):
    if not SCREENED_BUG_GROUP_IDS:
        groups = db.query("SELECT id FROM groups WHERE {{where}}", {
            "where": esfilter2sqlwhere(db, {"terms": {"name": SCREENED_WHITEBOARD_BUG_GROUPS}})
        })
        globals()["SCREENED_BUG_GROUP_IDS"] = Q.select(groups, "id")
Code Example #15
File: test_etl.py Project: klahnakoski/Bugzilla-ETL
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                ref_versions = \
                    Q.sort(
                        #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                        [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                        "modified_ts"
                    )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

        if found_errors:
            Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {
                "path": [try_dir, ref_dir]}
            )
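
The equality test hinges on CNV.object2JSON serializing equal structures identically; only on a mismatch are the try/ and ref/ files written for inspection. A stdlib sketch that makes the canonical serialization explicit with sort_keys:

import json

can = [{"bug_id": 1, "status": "NEW"}]
ref = [{"status": "NEW", "bug_id": 1}]  # same content, different key order
if json.dumps(can, sort_keys=True, indent=2) != json.dumps(ref, sort_keys=True, indent=2):
    raise AssertionError("versions differ")  # here the mismatch files would be written
print("identical after canonical serialization")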
Code Example #17
    def blocks_of_bugs(self):
        max_bug_id = self.private.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [{"match_all": {}}]}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "bug_id"}}}
        }).facets["0"].max

        return reversed(list(Q.intervals(0, max_bug_id, self.settings.param.increment)))
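
The statistical facet returns the maximum bug_id, and Q.intervals apparently yields [lo, hi) blocks of the configured increment, reversed so the newest block is scanned first. A stdlib sketch of that block generation (capping the last block at the maximum is an assumption):

increment = 10000
max_bug_id = 23456
blocks = [(lo, min(lo + increment, max_bug_id))
          for lo in range(0, max_bug_id, increment)]
print(list(reversed(blocks)))  # [(20000, 23456), (10000, 20000), (0, 10000)]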
Code Example #18
File: bz_etl.py Project: klahnakoski/Bugzilla-ETL
def etl_comments(db, es, param, please_stop):
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db = DB(db)
            comment_db_cache.append(comment_db)

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for g, c in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
            es.extend({"id": cc.comment_id, "value": cc} for cc in c)
Code Example #20
File: test_etl.py Project: klahnakoski/Bugzilla-ETL
def verify_no_private_comments(es, private_comments):
    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"comment_id": private_comments}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    if Q.select(data.hits.hits, "_source"):
        Log.error("Expecting no comments")
Code Example #21
def get_all_bug_versions(es, bug_id, max_time=None):
    max_time = nvl(max_time, datetime.max)

    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"term": {"bug_id": bug_id}},
                {"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    return Q.select(data.hits.hits, "_source")
Code Example #22
def old2new(bug, max_date):
    """
    CONVERT THE OLD ES FORMAT TO THE NEW
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION
    """
    if bug.everconfirmed != None:
        if bug.everconfirmed == "":
            bug.everconfirmed = None
        else:
            bug.everconfirmed = int(bug.everconfirmed)

    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

    if bug.expires_on > max_date:
        bug.expires_on = parse_bug_history.MAX_TIME
    if bug.votes != None:
        bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    if bug.votes == 0:
        del bug["votes"]
    # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
    #     bug.remaining_time = 0
    if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        )
    bug.changes = CNV.JSON2object(
        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
            .replace("\"field_value_removed\":", "\"old_value\":") \
            .replace("\"field_value\":", "\"new_value\":")
    )

    if bug.everconfirmed == 0:
        del bug["everconfirmed"]
    if bug.id == "692436_1336314345":
        bug.votes = 3

    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda (x): {"id": x.id, "value": x},
                    map(
                        lambda(x): transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{host}})", {
                    "block": g,
                    "filename": filename
                }, e)
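
A batch that fails to convert is written to an Error_<n>.txt quarantine file and the loop moves on, so one bad block does not stop the extract. The same pattern in stdlib form:

def convert(batch):
    if "bad" in batch:
        raise ValueError("can not convert")
    return [line.upper() for line in batch]

for g, batch in enumerate([["ok"], ["bad"], ["also ok"]]):
    try:
        convert(batch)
    except Exception:
        with open("Error_%d.txt" % g, "w") as f:  # quarantine the block, keep going
            f.write("\n".join(batch))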
Code Example #26
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) ARE EMAIL ADDRESSES THAT BELONG TO PEOPLE.
    SINCE THE EMAIL ADDRESS FOR A PERSON CAN CHANGE OVER TIME, THIS CODE
    WILL ASSOCIATE EACH PERSON WITH THE EMAIL ADDRESSES USED
    OVER THE LIFETIME OF THE BUGZILLA DATA.  'PERSON' IS ABSTRACT, AND SIMPLY
    ASSIGNED A CANONICAL EMAIL ADDRESS TO FACILITATE IDENTIFICATION
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    if bug_list:
        with DB(settings.bugzilla, readonly=True) as db:
            data = get_all_cc_changes(db, bug_list)
            aggregator(data)
            analysis(settings, True, please_stop)
        return

    with DB(settings.bugzilla, readonly=True) as db:
        start = nvl(settings.param.start, 0)
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)

        #Perform analysis on blocks of bugs, in case we crash partway through
        for s, e in Q.intervals(start, end, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {
                "start": s,
                "end": e
            })
            data = get_all_cc_changes(db, range(s, e))
            if please_stop:
                break
            aggregator(data)

            analysis(settings, e >= end, please_stop)
Code Example #28
def milli2datetime(r):
    """
    CONVERT ANY longs INTO TIME STRINGS
    """
    try:
        if r == None:
            return None
        elif isinstance(r, basestring):
            return r
        elif Math.is_number(r):
            if CNV.value2number(r) > 800000000000:
                return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
            else:
                return r
        elif isinstance(r, dict):
            output = {}
            for k, v in r.items():
                v = milli2datetime(v)
                if v != None:
                    output[k.lower()] = v
            return output
        elif hasattr(r, '__iter__'):
            output = []
            for v in r:
                v = milli2datetime(v)
                if v != None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                return output
        else:
            return r
    except Exception, e:
        Log.warning("Can not scrub: {{json}}", {"json": r}, e)
Code Example #30
File: bz_etl.py Project: klahnakoski/Bugzilla-ETL
def etl(db, output_queue, param, please_stop):
    """
    PROCESS RANGE, AS SPECIFIED IN param AND PUSH
    BUG VERSION RECORDS TO output_queue
    """

    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with db_cache_lock:
        if not db_cache:
            with Timer("open connections to db"):
                for f in get_stuff_from_bugzilla:
                    db = DB(db)
                    db_cache.append(db)

    db_results = Queue(max=2**30)
    with db_cache_lock:
        # ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
        with AllThread() as all:
            for i, f in enumerate(get_stuff_from_bugzilla):
                def process(target, db, param, please_stop):
                    db_results.extend(target(db, param))

                all.add(process, f, db_cache[i], param.copy())
    db_results.add(Thread.STOP)

    sorted = Q.sort(db_results, [
        "bug_id",
        "_merge_order",
        {"field": "modified_ts", "sort": -1},
        "modified_by"
    ])

    process = BugHistoryParser(param, output_queue)
    for s in sorted:
        process.processRow(s)
    process.processRow(struct.wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))
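
The compound sort is what lets BugHistoryParser consume each bug's rows as one contiguous run: bug_id groups the rows, _merge_order interleaves the different extract queries, and modified_ts runs newest-first within each group. The same key expressed with a stdlib sort:

rows = [
    {"bug_id": 1, "_merge_order": 2, "modified_ts": 100, "modified_by": "a"},
    {"bug_id": 1, "_merge_order": 1, "modified_ts": 200, "modified_by": "b"},
    {"bug_id": 1, "_merge_order": 1, "modified_ts": 100, "modified_by": "a"},
]
rows.sort(key=lambda r: (r["bug_id"], r["_merge_order"], -r["modified_ts"], r["modified_by"]))
print([(r["_merge_order"], r["modified_ts"]) for r in rows])  # [(1, 200), (1, 100), (2, 100)]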
Code Example #31
File: test_etl.py Project: klahnakoski/Bugzilla-ETL
    def test_changes_to_private_bugs_still_have_bug_group(self):
        self.settings.param.allow_private_bugs = True
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()

        private_bugs = set(Random.sample(self.settings.param.bugs, 3))

        Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

        database.make_test_instance(self.settings.bugzilla)

        #MARK SOME BUGS PRIVATE
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
        es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
        bz_etl.main(self.settings, es, es_c)

        # MAKE A CHANGE TO THE PRIVATE BUGS
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                old_bug = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": b})[0]
                new_bug = old_bug.copy()

                new_bug.bug_status = "NEW STATUS"
                diff(db, "bugs", old_bug, new_bug)


        #RUN INCREMENTAL
        bz_etl.main(self.settings, es, es_c)

        #VERIFY BUG GROUP STILL EXISTS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        now = datetime.utcnow()
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": private_bugs}},
                    {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })
        latest_bugs = Q.select(results.hits.hits, "_source")
        latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

        for bug_id in private_bugs:
            if latest_bugs_index[bug_id] == None:
                Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

            bug_group = latest_bugs_index[bug_id].bug_group
            if not bug_group:
                Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
            if BUG_GROUP_FOR_TESTING not in bug_group:
                Log.error("Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group", {
                    "bug_id": bug_id,
                    "bug_group": BUG_GROUP_FOR_TESTING
                })
Code Example #32
    def test_private_bugs_not_leaking(self):
        bad_news = False

        # FOR ALL BUG BLOCKS
        for min_id, max_id in self.blocks_of_bugs():
            results = get(
                self.private,
                {"and": [
                    {"match_all": {}},
                    {"and": [
                        {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                        {"exists": {"field": "bug_group"}},
                        {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
                        {"range": {"modified_ts": {"lt": A_WHILE_AGO}}}, #OF A MINIMUM AGE
                    ]}
                ]},
                ["bug_id", "bug_group", "modified_ts"]
            )

            private_ids = {b.bug_id: b.bug_group for b in results}

            Log.note("Ensure {{num}} bugs did not leak", {
                "num": len(private_ids.keys())
            })

            # VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"terms": {"bug_id": private_ids.keys()}},
                    {"range": {"expires_on": {"gte": NOW}}} # SOME BUGS WILL LEAK FOR A LITTLE WHILE
                ]}
            )

            if leaked_bugs:
                bad_news = True
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms":{"bug_id":leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} leaks!! {{bugs}}", {
                    "num": len(leaked_bugs),
                    "bugs": Q.run({
                        "from":leaked_bugs,
                        "select":["bug_id", "bug_version_num", {"name":"modified_ts", "value":lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}],
                        "sort":"bug_id"
                    })
                })
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "bug_group": private_ids[b.bug_id],
                        "version": milli2datetime(b)
                    })

            #CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
            leaked_comments = get(
                self.public_comments,
                {"terms": {"bug_id": private_ids.keys()}},
                limit=20
            )
            if leaked_comments:
                bad_news = True

                if self.settings.param.delete:
                    self.public_comments.delete_record(
                        {"terms":{"bug_id":leaked_comments.bug_id}}
                    )

                Log.warning("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
                    "num": len(leaked_comments),
                    "comments": leaked_comments
                })

        if bad_news:
            Log.error("Bugs have leaked!")
Code Example #34
    def test_private_attachments_not_leaking(self):
        for min_id, max_id in self.blocks_of_bugs():
            # FIND ALL PRIVATE ATTACHMENTS
            bugs_w_private_attachments = get(
                self.private,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
                    {"range": {"modified_ts": {"lt": A_WHILE_AGO}}}, #OF A MINIMUM AGE
                    {"nested": { #HAS ATTACHMENT.
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"exists": {"field":"attachments.attach_id"}}
                        }}
                    }},
                    {"or":[
                        {"nested": { #PRIVATE ATTACHMENT, OR...
                            "path": "attachments",
                            "query": {"filtered": {
                                "query": {"match_all": {}},
                                "filter": {"term": {"attachments.isprivate": 1}}
                            }}
                        }},
                        {"exists":{"field":"bug_group"}}  # ...PRIVATE BUG
                    ]}
                ]},
                fields=["bug_id", "bug_group", "attachments", "modified_ts"]
            )

            private_attachments = Q.run({
                "from": bugs_w_private_attachments,
                "select": "attachments.attach_id",
                "where": {"or": [
                    {"exists": "bug_group"},
                    {"terms": {"attachments.isprivate": ['1', True, 1]}}
                ]}
            })
            try:
                private_attachments = [int(v) for v in private_attachments]
            except Exception, e:
                private_attachments = Q.run({
                    "from": bugs_w_private_attachments,
                    "select": "attachments.attach_id",
                    "where": {"or": [
                        {"exists": "bug_group"},
                        {"terms": {"attachments.isprivate": ['1', True, 1]}}
                    ]}
                })

            Log.note("Ensure {{num}} attachments did not leak", {
                "num": len(private_attachments)
            })

            #VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}}, # CURRENT BUGS
                    {"nested": {
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"terms": {"attach_id": private_attachments}}
                        }}
                    }}
                ]}
                # fields=["bug_id", "attachments"]
            )

            #

            if leaked_bugs:
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms":{"bug_id":leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} bugs with private attachments have leaked!", {"num": len(leaked_bugs)})
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has private_attachment\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "version": b
                    })
                Log.error("Attachments have leaked!")
Code Example #35
def full_etl(resume_from_last_run, settings, param, db, es, es_comments,
             output_queue):
    with Thread.run("alias_analysis", alias_analysis.main, settings=settings):
        end = nvl(settings.param.end,
                  db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
        start = nvl(settings.param.start, 0)
        if resume_from_last_run:
            start = nvl(
                settings.param.start,
                Math.floor(get_max_bug_id(es), settings.param.increment))

        #############################################################
        ## MAIN ETL LOOP
        #############################################################

        #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD
        # with Multithread([run_both_etl, run_both_etl]) as workers:
        for min, max in Q.intervals(start, end, settings.param.increment):
            if settings.args.quick and min < end - settings.param.increment and min != 0:
                #--quick ONLY DOES FIRST AND LAST BLOCKS
                continue

            try:
                #GET LIST OF CHANGED BUGS
                with Timer("time to get {{min}}..{{max}} bug list", {
                        "min": min,
                        "max": max
                }):
                    if param.allow_private_bugs:
                        bug_list = Q.select(
                            db.query(
                                """
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}})
                        """, {
                                    "min": min,
                                    "max": max,
                                    "start_time_str": param.start_time_str
                                }), u"bug_id")
                    else:
                        bug_list = Q.select(
                            db.query(
                                """
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            LEFT JOIN
                                bug_group_map m ON m.bug_id=b.bug_id
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}}) AND
                                m.bug_id IS NULL
                        """, {
                                    "min": min,
                                    "max": max,
                                    "start_time_str": param.start_time_str
                                }), u"bug_id")

                if not bug_list:
                    continue

                param.bug_list = bug_list
                run_both_etl(**{
                    "db": db,
                    "output_queue": output_queue,
                    "es_comments": es_comments,
                    "param": param.copy()
                })

            except Exception, e:
                Log.error("Problem with dispatch loop in range [{{min}}, {{max}})", {
                    "min": min,
                    "max": max
                }, e)
Code Example #36
def normalize(bug, old_school=False):
    bug=bug.copy()
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    #ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags=Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments=CNV.JSON2object(CNV.object2JSON(bug.attachments).replace("attachments_", "attachments."))
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            for k,v in list(a.items()):
                if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")):
                    new_v=CNV.value2int(v)
                    new_k=k[12:]
                    a[k.replace(".", "\.")]=new_v
                    if not old_school:
                        a[new_k]=new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
            bug.changes=CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    #bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST


    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            del bug[f]
        else:
            bug[f]=CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None: continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Convert to "2012/01/01 00:00:00.000"
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v+"000", "%Y/%m/%d %H:%M%:S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Convert "2012/01/01 00:00:00.000" to "2012-01-01"
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception, e:
            Log.error("problem with converting date to milli (value={{value}})", {"value":bug[dateField]}, e)
    def test_private_attachments_not_leaking(self):
        for min_id, max_id in self.blocks_of_bugs():
            # FIND ALL PRIVATE ATTACHMENTS
            bugs_w_private_attachments = get(
                self.private,
                {
                    "and": [
                        {
                            "range": {
                                "bug_id": {
                                    "gte": min_id,
                                    "lt": max_id
                                }
                            }
                        },
                        {
                            "range": {
                                "expires_on": {
                                    "gte": NOW
                                }
                            }
                        },  #CURRENT RECORDS
                        {
                            "range": {
                                "modified_ts": {
                                    "lt": A_WHILE_AGO
                                }
                            }
                        },  #OF A MINIMUM AGE
                        {
                            "nested": {  #HAS ATTACHMENT.
                                "path": "attachments",
                                "query": {
                                    "filtered": {
                                        "query": {
                                            "match_all": {}
                                        },
                                        "filter": {
                                            "exists": {
                                                "field":
                                                "attachments.attach_id"
                                            }
                                        }
                                    }
                                }
                            }
                        },
                        {
                            "or": [
                                {
                                    "nested": {  #PRIVATE ATTACHMENT, OR...
                                        "path": "attachments",
                                        "query": {
                                            "filtered": {
                                                "query": {
                                                    "match_all": {}
                                                },
                                                "filter": {
                                                    "term": {
                                                        "attachments.isprivate":
                                                        1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                {
                                    "exists": {
                                        "field": "bug_group"
                                    }
                                }  # ...PRIVATE BUG
                            ]
                        }
                    ]
                },
                fields=["bug_id", "bug_group", "attachments", "modified_ts"])

            private_attachments = Q.run({
                "from": bugs_w_private_attachments,
                "select": "attachments.attach_id",
                "where": {
                    "or": [{
                        "exists": "bug_group"
                    }, {
                        "terms": {
                            "attachments.isprivate": ['1', True, 1]
                        }
                    }]
                }
            })
            try:
                private_attachments = [int(v) for v in private_attachments]
            except Exception, e:
                private_attachments = Q.run({
                    "from": bugs_w_private_attachments,
                    "select": "attachments.attach_id",
                    "where": {
                        "or": [{
                            "exists": "bug_group"
                        }, {
                            "terms": {
                                "attachments.isprivate": ['1', True, 1]
                            }
                        }]
                    }
                })

            Log.note("Ensure {{num}} attachments did not leak",
                     {"num": len(private_attachments)})

            #VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {
                    "and": [
                        {
                            "range": {
                                "bug_id": {
                                    "gte": min_id,
                                    "lt": max_id
                                }
                            }
                        },
                        {
                            "range": {
                                "expires_on": {
                                    "gte": NOW
                                }
                            }
                        },  # CURRENT BUGS
                        {
                            "nested": {
                                "path": "attachments",
                                "query": {
                                    "filtered": {
                                        "query": {
                                            "match_all": {}
                                        },
                                        "filter": {
                                            "terms": {
                                                "attach_id":
                                                private_attachments
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    ]
                }
                # fields=["bug_id", "attachments"]
            )

            #

            if leaked_bugs:
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms": {
                            "bug_id": leaked_bugs.bug_id
                        }})

                Log.note("{{num}} bugs with private attachments have leaked!",
                         {"num": len(leaked_bugs)})
                for b in leaked_bugs:
                    Log.note(
                        "{{bug_id}} has private_attachment\n{{version|indent}}",
                        {
                            "bug_id": b.bug_id,
                            "version": b
                        })
                Log.error("Attachments have leaked!")

    def test_changes_to_private_bugs_still_have_bug_group(self):
        self.settings.param.allow_private_bugs = True
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()

        private_bugs = set(Random.sample(self.settings.param.bugs, 3))

        Log.note("The private bugs for this test are {{bugs}}",
                 {"bugs": private_bugs})

        database.make_test_instance(self.settings.bugzilla)

        #MARK SOME BUGS PRIVATE
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        es = elasticsearch.make_test_instance("candidate",
                                              self.settings.real.bugs)
        es_c = elasticsearch.make_test_instance("candidate_comments",
                                                self.settings.real.comments)
        bz_etl.main(self.settings, es, es_c)

        # MAKE A CHANGE TO THE PRIVATE BUGS
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                old_bug = db.query(
                    "SELECT * FROM bugs WHERE bug_id={{bug_id}}",
                    {"bug_id": b})[0]
                new_bug = old_bug.copy()

                new_bug.bug_status = "NEW STATUS"
                diff(db, "bugs", old_bug, new_bug)

        #RUN INCREMENTAL
        bz_etl.main(self.settings, es, es_c)

        #VERIFY BUG GROUP STILL EXISTS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        now = datetime.utcnow()
        results = es.search({
            "query": {
                "filtered": {
                    "query": {
                        "match_all": {}
                    },
                    "filter": {
                        "and": [{
                            "terms": {
                                "bug_id": private_bugs
                            }
                        }, {
                            "range": {
                                "expires_on": {
                                    "gte": CNV.datetime2milli(now)
                                }
                            }
                        }]
                    }
                }
            },
            "from": 0,
            "size": 200000,
            "sort": []
        })
        latest_bugs = Q.select(results.hits.hits, "_source")
        latest_bugs_index = Q.unique_index(
            latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

        for bug_id in private_bugs:
            if latest_bugs_index[bug_id] == None:
                Log.error("Expecting to find the private bug {{bug_id}}",
                          {"bug_id": bug_id})

            bug_group = latest_bugs_index[bug_id].bug_group
            if not bug_group:
                Log.error(
                    "Expecting private bug ({{bug_id}}) to have a bug group",
                    {"bug_id": bug_id})
            if BUG_GROUP_FOR_TESTING not in bug_group:
                Log.error(
                    "Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group",
                    {
                        "bug_id": bug_id,
                        "bug_group": BUG_GROUP_FOR_TESTING
                    })
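The query above keeps only current versions: every bug version carries modified_ts and expires_on timestamps, and the current version is the one whose expires_on is still in the future. A minimal sketch of that convention (illustrative only; this helper is not part of the project):

def current_version(versions, now_milli):
    # MINIMAL SKETCH OF THE VERSIONING CONVENTION THE TESTS QUERY AGAINST
    # versions: dicts WITH "modified_ts" AND "expires_on" IN MILLISECONDS
    live = [v for v in versions if v["expires_on"] >= now_milli]
    return live[0] if live else None  # AT MOST ONE VERSION IS CURRENT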
Code example #39
File: bz_etl.py Project: klahnakoski/Bugzilla-ETL
def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
    with Thread.run("alias_analysis", alias_analysis.main, settings=settings):
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
        start = nvl(settings.param.start, 0)
        if resume_from_last_run:
            start = nvl(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))

        #############################################################
        ## MAIN ETL LOOP
        #############################################################

        #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD
        # with Multithread([run_both_etl, run_both_etl]) as workers:
        for min, max in Q.intervals(start, end, settings.param.increment):
            if settings.args.quick and min < end - settings.param.increment and min != 0:
                #--quick ONLY DOES FIRST AND LAST BLOCKS
                continue

            try:
                #GET LIST OF CHANGED BUGS
                with Timer("time to get {{min}}..{{max}} bug list", {"min":min, "max":max}):
                    if param.allow_private_bugs:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}})
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")
                    else:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            LEFT JOIN
                                bug_group_map m ON m.bug_id=b.bug_id
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}}) AND
                                m.bug_id IS NULL
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")

                if not bug_list:
                    continue

                param.bug_list = bug_list
                run_both_etl(**{
                    "db": db,
                    "output_queue": output_queue,
                    "es_comments": es_comments,
                    "param": param.copy()
                })

            except Exception, e:
                Log.error("Problem with dispatch loop in range [{{min}}, {{max}})", {
                    "min": min,
                    "max": max
                }, e)
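The dispatch loop above relies on Q.intervals to split [start, end) into fixed-size blocks, which is also what the --quick shortcut skips over. A stand-in with the assumed behavior (the real Q.intervals may differ in detail):

def intervals(start, end, size):
    # YIELD (min, max) PAIRS COVERING [start, end) IN BLOCKS OF AT MOST size
    i = start
    while i < end:
        yield i, min(i + size, end)
        i += size

# list(intervals(0, 2500, 1000)) == [(0, 1000), (1000, 2000), (2000, 2500)]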
Code example #40
File: bz_etl.py Project: klahnakoski/Bugzilla-ETL
        refresh_param = param.copy()
        refresh_param.bug_list = bug_list
        refresh_param.start_time = 0
        refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

        try:
            etl(db, output_queue, refresh_param.copy(), please_stop=None)
            etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
        except Exception, e:
            Log.error("Problem with etl using parameters {{parameters}}", {
                "parameters": refresh_param
            }, e)

    #REFRESH COMMENTS WITH PRIVACY CHANGE
    private_comments = get_recent_private_comments(db, param)
    comment_list = set(Q.select(private_comments, "comment_id")) | {0}
    es_comments.delete_record({"terms": {"comment_id": comment_list}})
    changed_comments = get_comments_by_id(db, comment_list, param)
    es_comments.extend({"id": c.comment_id, "value": c} for c in changed_comments)

    #GET LIST OF CHANGED BUGS
    with Timer("time to get changed bug list"):
        if param.allow_private_bugs:
            bug_list = Q.select(db.query("""
                SELECT
                    b.bug_id
                FROM
                    bugs b
                WHERE
                    delta_ts >= {{start_time_str}}
            """, {
    def test_private_bugs_not_leaking(self):
        bad_news = False

        # FOR ALL BUG BLOCKS
        for min_id, max_id in self.blocks_of_bugs():
            results = get(
                self.private,
                {
                    "and": [
                        {
                            "match_all": {}
                        },
                        {
                            "and": [
                                {
                                    "range": {
                                        "bug_id": {
                                            "gte": min_id,
                                            "lt": max_id
                                        }
                                    }
                                },
                                {
                                    "exists": {
                                        "field": "bug_group"
                                    }
                                },
                                {
                                    "range": {
                                        "expires_on": {
                                            "gte": NOW
                                        }
                                    }
                                },  #CURRENT RECORDS
                                {
                                    "range": {
                                        "modified_ts": {
                                            "lt": A_WHILE_AGO
                                        }
                                    }
                                },  #OF A MINIMUM AGE
                            ]
                        }
                    ]
                },
                ["bug_id", "bug_group", "modified_ts"])

            private_ids = {b.bug_id: b.bug_group for b in results}

            Log.note("Ensure {{num}} bugs did not leak",
                     {"num": len(private_ids.keys())})

            # VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {
                    "and": [
                        {
                            "terms": {
                                "bug_id": private_ids.keys()
                            }
                        },
                        {
                            "range": {
                                "expires_on": {
                                    "gte": NOW
                                }
                            }
                        }  # SOME BUGS WILL LEAK FOR A LITTLE WHILE
                    ]
                })

            if leaked_bugs:
                bad_news = True
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms": {
                            "bug_id": leaked_bugs.bug_id
                        }})

                Log.note("{{num}} leaks!! {{bugs}}", {
                    "num": len(leaked_bugs),
                    "bugs": Q.run({
                        "from": leaked_bugs,
                        "select": [
                            "bug_id",
                            "bug_version_num",
                            {
                                "name": "modified_ts",
                                "value": lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))
                            }
                        ],
                        "sort": "bug_id"
                    })
                })
                for b in leaked_bugs:
                    Log.note(
                        "{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}",
                        {
                            "bug_id": b.bug_id,
                            "bug_group": private_ids[b.bug_id],
                            "version": milli2datetime(b)
                        })

            #CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
            leaked_comments = get(self.public_comments,
                                  {"terms": {
                                      "bug_id": private_ids.keys()
                                  }},
                                  limit=20)
            if leaked_comments:
                bad_news = True

                if self.settings.param.delete:
                    self.public_comments.delete_record(
                        {"terms": {
                            "bug_id": leaked_comments.bug_id
                        }})

                Log.warning(
                    "{{num}} comments marked private have leaked!\n{{comments|indent}}",
                    {
                        "num": len(leaked_comments),
                        "comments": leaked_comments
                    })

        if bad_news:
            Log.error("Bugs have leaked!")
Code example #42
def get_screened_whiteboard(db):
    if not SCREENED_BUG_GROUP_IDS:
        groups = db.query("SELECT id FROM groups WHERE {{where}}", {
            "where": esfilter2sqlwhere(db, {"terms": {"name": SCREENED_WHITEBOARD_BUG_GROUPS}})
        })
        globals()["SCREENED_BUG_GROUP_IDS"] = Q.select(groups, "id")
Code example #43
        del bug["everconfirmed"]
    if bug.id == "692436_1336314345":
        bug.votes = 3

    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass

    bug = transform_bugzilla.rename_attachments(bug)
    for c in bug.changes:
        c.field_name = c.field_name.replace("attachments.", "attachments_")
        if c.attach_id == '':
            c.attach_id = None
        else:
            c.attach_id = CNV.value2int(c.attach_id)

    bug.attachments = Q.sort(bug.attachments, "attach_id")
    for a in bug.attachments:
        a.attach_id = CNV.value2int(a.attach_id)
        for k, v in list(a.items()):
            if k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate"):
                struct.unwrap(a)[k] = CNV.value2int(v) # PREVENT dot (.) INTERPRETATION
                a[k.split(".")[-1].split("_")[-1]] = CNV.value2int(v)

    bug = transform_bugzilla.normalize(bug)
    return bug
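The scrubbing above leans on CNV.value2int so that '1', 1, and True all normalize to the same integer before comparison. A sketch of the assumed behavior (the real converter may treat edge cases differently):

def value2int(v):
    # ASSUMED BEHAVIOR: None AND '' STAY None, EVERYTHING ELSE BECOMES int
    if v is None or v == '':
        return None
    return int(v)

# value2int('1') == 1, value2int(True) == 1, value2int(0) == 0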
Code example #44
def get_bugs(db, param):
    try:
        get_bugs_table_columns(db, db.settings.schema)
        get_screened_whiteboard(db)

        #TODO: CF_LAST_RESOLVED IS IN PDT, FIX IT
        def lower(col):
            if col.column_type.startswith("varchar"):
                return "lower(" + db.quote_column(col.column_name) + ") " + db.quote_column(col.column_name)
            else:
                return db.quote_column(col.column_name)

        param.bugs_columns = Q.select(bugs_columns, "column_name")
        param.bugs_columns_SQL = SQL(",\n".join([lower(c) for c in bugs_columns]))
        param.bug_filter = esfilter2sqlwhere(db, {"terms": {"b.bug_id": param.bug_list}})
        param.screened_whiteboard = esfilter2sqlwhere(db, {"and": [
            {"exists": "m.bug_id"},
            {"terms": {"m.group_id": SCREENED_BUG_GROUP_IDS}}
        ]})

        if param.allow_private_bugs:
            param.sensitive_columns = SQL("""
                '[screened]' short_desc,
                '[screened]' bug_file_loc
            """)
        else:
            param.sensitive_columns = SQL("""
                short_desc,
                bug_file_loc
            """)

        bugs = db.query("""
            SELECT
                b.bug_id,
                UNIX_TIMESTAMP(CONVERT_TZ(b.creation_ts, 'US/Pacific','UTC'))*1000 AS modified_ts,
                lower(pr.login_name) AS modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(b.creation_ts, 'US/Pacific','UTC'))*1000 AS created_ts,
                lower(pr.login_name) AS created_by,
                lower(pa.login_name) AS assigned_to,
                lower(pq.login_name) AS qa_contact,
                lower(prod.`name`) AS product,
                lower(comp.`name`) AS component,
                CASE WHEN {{screened_whiteboard}} AND b.status_whiteboard IS NOT NULL AND trim(b.status_whiteboard)<>'' THEN '[screened]' ELSE trim(lower(b.status_whiteboard)) END status_whiteboard,
                {{sensitive_columns}},
                {{bugs_columns_SQL}}
            FROM
                bugs b
            LEFT JOIN
                profiles pr ON b.reporter = pr.userid
            LEFT JOIN
                profiles pa ON b.assigned_to = pa.userid
            LEFT JOIN
                profiles pq ON b.qa_contact = pq.userid
            LEFT JOIN
                products prod ON prod.id = product_id
            LEFT JOIN
                components comp ON comp.id = component_id
            LEFT JOIN
                bug_group_map m ON m.bug_id = b.bug_id
            WHERE
                {{bug_filter}}
            """, param)

        #bugs IS LIST OF BUGS WHICH MUST BE CONVERTED TO THE DELTA RECORDS FOR ALL FIELDS
        output = []
        for r in bugs:
            flatten_bugs_record(r, output)

        return output
    except Exception, e:
        Log.error("can not get basic bug data", e)
Code example #45
    def search(self, query):
        query = wrap(query)
        f = CNV.esfilter2where(query.query.filtered.filter)
        filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
        if query.fields:
            return wrap({"hits": {"total": len(filtered), "hits": [
                {"_id": d._id, "fields": unwrap(Q.select([unwrap(d._source)], query.fields)[0])}
                for d in filtered
            ]}})
        else:
            return wrap({"hits": {"total": len(filtered), "hits": filtered}})
Code example #46
def normalize(bug, old_school=False):
    bug = bug.copy()
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    #ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments = CNV.JSON2object(
                CNV.object2JSON(bug.attachments).replace(
                    "attachments_", "attachments."))
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            for k, v in list(a.items()):
                if k.startswith("attachments") and (k.endswith("isobsolete")
                                                    or k.endswith("ispatch") or
                                                    k.endswith("isprivate")):
                    new_v = CNV.value2int(v)
                    new_k = k[12:]
                    a[k.replace(".", "\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace(
                "attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    #bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None: continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Convert to "2012/01/01 00:00:00.000"
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S.%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"),
                                        "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Convert "2012/01/01 00:00:00.000" to "2012-01-01"
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception, e:
            Log.error(
                "problem with converting date to milli (value={{value}})",
                {"value": bug[dateField]}, e)