Example #1
def get_private_bugs(es):
    """
    FIND THE BUGS WE DO NOT EXPECT TO BE FOUND IN PUBLIC
    """
    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"script": {"script": "true"}},
                {"and": [{"exists": {"field": "bug_group"}}]}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": [],
        "facets": {},
        "fields": ["bug_id", "blocked", "dependson", "dupe_of", "dupe_by"]
    })

    with Timer("aggregate es results on private bugs"):
        output = set([])
        for bug in data.hits.hits:
            output.add(bug.fields.bug_id)
            output |= set(nvl(CNV.value2intlist(bug.fields.blocked), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dependson), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dupe_of), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dupe_by), []))

    output.update({551988, 636964})
    return output
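A minimal usage sketch, assuming es is an ElasticSearch handle like the one built in the replication example further down; candidate_bugs is hypothetical:

#SKETCH: FILTER A HYPOTHETICAL PUBLIC FEED AGAINST THE PRIVATE SET
private_ids = get_private_bugs(es)
public_bugs = [b for b in candidate_bugs if b.bug_id not in private_ids]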
Example #2
def loadAliases(settings):
    try:
        try:
            with Timer("load alias file at {{filename}}", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)}):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            Log.warning("No alias file found (looking at {{filename}})", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)})
            alias_json = "{}"
        #aliases IS A MODULE-LEVEL dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
    except Exception, e:
        Log.error("Could not load aliases", e)
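Judging by how alias() and add_alias() read these records, the alias file is a JSON object mapping an email address to a record with a canonical field; a hypothetical example of alias_json:

{
    "old.address@example.com": {"canonical": "person@example.com"},
    "other.old@example.com": {"canonical": "person@example.com"}
}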
Example #3
def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs.delta_ts)
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE GO BACK look_back MILLISECONDS (DEFAULT 5 MINUTES) JUST IN CASE
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT; SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM TOO
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es_comments.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
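A sketch of the settings fields this main() actually reads; every name comes from the code above, the values and nested shapes are hypothetical:

#HYPOTHETICAL settings, COVERING ONLY WHAT main() TOUCHES
settings = struct.wrap({
    "bugzilla": {"host": "localhost", "port": 3306},  #DB CONNECTION (ASSUMED SHAPE)
    "param": {
        "first_run_time": "first_run_time.txt",  #ITS EXISTENCE MARKS A STARTED FULL RUN
        "last_run_time": "last_run_time.txt",    #ITS EXISTENCE MARKS A FINISHED RUN
        "look_back": 5 * 60 * 1000,              #MILLISECONDS
        "alias_file": "aliases.json",
        "allow_private_bugs": False
    },
    "es": {"alias": "bugs", "index": "bugs20130101_000000"},
    "es_comments": {"alias": "bug_comments", "index": "bug_comments20130101_000000"}
})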
Example #4
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) ARE EMAIL ADDRESSES THAT BELONG TO PEOPLE.
    SINCE THE EMAIL ADDRESS FOR A PERSON CAN CHANGE OVER TIME, THIS CODE
    WILL ASSOCIATE EACH PERSON WITH THE EMAIL ADDRESSES USED
    OVER THE LIFETIME OF THE BUGZILLA DATA.  'PERSON' IS ABSTRACT, AND SIMPLY
    ASSIGNED A CANONICAL EMAIL ADDRESS TO FACILITATE IDENTIFICATION
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    if bug_list:
        with DB(settings.bugzilla, readonly=True) as db:
            data = get_all_cc_changes(db, bug_list)
            aggregator(data)
            analysis(settings, True, please_stop)
        return

    with DB(settings.bugzilla, readonly=True) as db:
        start = nvl(settings.param.start, 0)
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)

        #Perform analysis on blocks of bugs, in case we crash partway through
        for s, e in Q.intervals(start, end, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {
                "start": s,
                "end": e
            })
            data = get_all_cc_changes(db, range(s, e))
            if please_stop:
                break
            aggregator(data)

            analysis(settings, e >= end, please_stop)
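Q.intervals is not shown in these examples; from its use above (half-open blocks fed to range(s, e), with e >= end marking the last block) it presumably behaves like this sketch:

#HYPOTHETICAL EQUIVALENT OF Q.intervals, INFERRED FROM ITS USE ABOVE
def intervals(start, end, size):
    output = []
    while start < end:
        output.append((start, min(start + size, end)))
        start += size
    return output

#intervals(0, 10, 4) == [(0, 4), (4, 8), (8, 10)]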
Example #7
def get_all_bug_versions(es, bug_id, max_time=None):
    max_time = nvl(max_time, datetime.max)

    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"term": {"bug_id": bug_id}},
                {"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    return Q.select(data.hits.hits, "_source")
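A usage sketch, assuming the same es handle; the cutoff value is hypothetical:

#SKETCH: FETCH EVERY VERSION OF BUG 551988 RECORDED BEFORE A CUTOFF
versions = get_all_bug_versions(es, 551988, max_time=CNV.milli2datetime(1400000000000))
for v in versions:
    Log.note("version modified at {{ts}}", {"ts": v.modified_ts})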
Example #8
def get(es, esfilter, fields=None, limit=None):
    query = struct.wrap({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": esfilter
        }},
        "from": 0,
        "size": nvl(limit, 200000),
        "sort": []
    })

    if fields:
        query.fields = fields
        results = es.search(query)
        return Q.select(results.hits.hits, "fields")
    else:
        results = es.search(query)
        return Q.select(results.hits.hits, "_source")
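A usage sketch with a simple term filter; the field names are taken from the queries above, the es handle is assumed:

#SKETCH: PULL ONLY THE DEPENDENCY FIELDS FOR ONE BUG
docs = get(es, {"term": {"bug_id": 551988}}, fields=["bug_id", "blocked", "dependson"])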
Example #10
def add_alias(lost, found):
    found_record = aliases.get(found, None)
    lost_record = aliases.get(lost, None)

    new_canonical = found
    old_canonical = nvl(lost_record.canonical, lost)
    lost_record.canonical = new_canonical

    delete_list = []

    #FOLD bugs ON lost=found
    for bug_id, agg in bugs.iteritems():
        v = agg.dic.get(lost, 0)
        if v != 0:
            agg.add(lost, -v)
            agg.add(found, v)

        if not agg:
            delete_list.append(bug_id)

    #FOLD bugs ON old_canonical=new_canonical
    if old_canonical != lost:
        for bug_id, agg in bugs.iteritems():
            v = agg.dic.get(old_canonical, 0)
            if v != 0:
                agg.add(old_canonical, -v)
                agg.add(new_canonical, v)

            if not agg:
                delete_list.append(bug_id)

    for d in delete_list:
        del bugs[d]

    #FOLD ALIASES
    for k, v in aliases.iteritems():
        if v.canonical == old_canonical:
            Log.note("ALIAS REMAPPED: {{alias}}->{{old}} to {{alias}}->{{new}}", {
                "alias": k,
                "old": old_canonical,
                "new": found
            })
            v.canonical = found
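aliases and bugs are module globals here; a small worked example of the alias fold, assuming it runs in the same module (the empty bugs dict sidesteps the count folding):

#SKETCH: REMAP EVERYTHING THAT POINTED AT THE LOST ADDRESS
aliases = {
    "old@x.com": struct.wrap({"canonical": "old@x.com"}),
    "older@x.com": struct.wrap({"canonical": "old@x.com"})
}
bugs = {}
add_alias("old@x.com", "new@x.com")
#NOW aliases["old@x.com"].canonical == "new@x.com"
#AND aliases["older@x.com"].canonical == "new@x.com" (FOLDED BY THE LAST LOOP)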
Example #13
def main(settings):
    #USE A FILE
    if settings.source.filename != None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))

        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNC WITH THE source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
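The replication checkpoint is just milliseconds-since-epoch in a text file; a sketch of the round trip main() performs (the path is hypothetical):

#SKETCH: CHECKPOINT ROUND TRIP
time_file = File("last_replication_time.txt")
time_file.write(unicode(CNV.datetime2milli(datetime.utcnow())))
last_updated = CNV.milli2datetime(CNV.value2int(time_file.read()))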
Example #14
def alias(email):
    output = nvl(aliases.get(email, Null).canonical, email)
    return output
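Usage is a plain lookup with fallback; unknown addresses come back unchanged because aliases.get(email, Null).canonical is empty for a missing record:

#SKETCH (SAME MODULE-LEVEL aliases AS ABOVE)
aliases["old@x.com"] = struct.wrap({"canonical": "person@x.com"})
alias("old@x.com")       #RETURNS "person@x.com"
alias("stranger@x.com")  #RETURNS "stranger@x.com" UNCHANGED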
Example #15
def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
    with Thread.run("alias_analysis", alias_analysis.main, settings=settings):
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
        start = nvl(settings.param.start, 0)
        if resume_from_last_run:
            start = nvl(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))

        #############################################################
        ## MAIN ETL LOOP
        #############################################################

        #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD
        # with Multithread([run_both_etl, run_both_etl]) as workers:
        for min, max in Q.intervals(start, end, settings.param.increment):
            if settings.args.quick and min < end - settings.param.increment and min != 0:
                #--quick ONLY DOES FIRST AND LAST BLOCKS
                continue

            try:
                #GET LIST OF CHANGED BUGS
                with Timer("time to get {{min}}..{{max}} bug list", {"min": min, "max": max}):
                    if param.allow_private_bugs:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}})
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")
                    else:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            LEFT JOIN
                                bug_group_map m ON m.bug_id=b.bug_id
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}}) AND
                                m.bug_id IS NULL
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")

                if not bug_list:
                    continue

                param.bug_list = bug_list
                run_both_etl(**{
                    "db": db,
                    "output_queue": output_queue,
                    "es_comments": es_comments,
                    "param": param.copy()
                })

            except Exception, e:
                Log.error("Problem with dispatch loop in range [{{min}}, {{max}})", {
                    "min": min,
                    "max": max
                }, e)
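The --quick guard keeps only the first and last increment; a sketch of which blocks survive, reusing the hypothetical intervals() sketch from Example #4:

#WITH start=0, end=10, increment=4 THE BLOCKS ARE (0, 4), (4, 8), (8, 10)
blocks = [(s, e) for s, e in intervals(0, 10, 4)
          if not (s < 10 - 4 and s != 0)]
#blocks == [(0, 4), (8, 10)]  ONLY THE FIRST AND LAST BLOCKS ARE PROCESSED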