Python Struct Beispiele, bzETL.util.struct.Struct Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: elasticsearch.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

 def __init__(self, settings):
     self.settings = wrap({"host":"fake", "index":"fake"})
     self.filename = settings.filename
     try:
         self.data = CNV.JSON2object(File(self.filename).read())
     except IOError:
         self.data = Struct()

Beispiel #2

0

Datei anzeigen

Datei: test_etl.py Projekt: markrcote/Bugzilla-ETL

def main(settings):

    #MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        #REAL ES
        #        if settings.candidate.alias is None:
        #            settings.candidate.alias=settings.candidate.index
        #            settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        #        candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate = Fake_ES(settings.fake_es)

        reference = ElasticSearch(settings.reference)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(
            db, settings.bugzilla.schema)
        param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(
            ["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
        param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS,
                                            "column_name")
        param.END_TIME = CNV.datetime2milli(datetime.utcnow())
        param.START_TIME = 0
        param.alias_file = settings.param.alias_file
        param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}",
                                      {"bugs": db.quote(settings.param.bugs)})

        etl(db, candidate, param)

        #COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)

Beispiel #3

0

Datei anzeigen

def remove_bug_group(db, bug_id, group_name):
    group_id = db.query("SELECT id FROM groups WHERE name={{name}}",
                        {"name": group_name})[0].id

    diff(db, "bugs", Struct(bug_id=bug_id, bug_group=group_name),
         Struct(bug_id=bug_id, bug_group=None))
    db.execute(
        "DELETE FROM bug_group_map WHERE bug_id={{bug_id}} and group_id={{group_id}}",
        {
            "bug_id": bug_id,
            "group_id": group_id
        })

Beispiel #4

0

Datei anzeigen

Datei: test_etl.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
        THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
        """
        # settings.param.allow_private_bugs = True
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.candidate)
            reference = elasticsearch.open_test_instance(
                "reference", self.settings.private_bugs_reference)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #COMPARE ALL BUGS
            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            compare_both(candidate, reference, self.settings,
                         self.settings.param.bugs)

Beispiel #5

0

Datei anzeigen

Datei: test_etl.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

    def test_whiteboard_screened(self):
        GOOD_BUG_TO_TEST = 1046

        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate",
                                                  self.settings.candidate)

            #MARK BUG AS ONE OF THE SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST,
                                   SCREENED_WHITEBOARD_BUG_GROUPS[0])
            db.flush()

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([
                GOOD_BUG_TO_TEST
            ])  # bug 1046 sees lots of whiteboard, and other field, changes
            param.allow_private_bugs = True

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

            for v in versions:
                if v.status_whiteboard not in (None, "", "[screened]"):
                    Log.error("Expecting whiteboard to be screened")

Beispiel #6

0

Datei anzeigen

Datei: test_etl.py Projekt: markrcote/Bugzilla-ETL

def main(settings):

    #MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        #REAL ES
#        if settings.candidate.alias is None:
#            settings.candidate.alias=settings.candidate.index
#            settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
#        candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate=Fake_ES(settings.fake_es)

        reference=ElasticSearch(settings.reference)

        #SETUP RUN PARAMETERS
        param=Struct()
        param.BUGS_TABLE_COLUMNS=get_bugs_table_columns(db, settings.bugzilla.schema)
        param.BUGS_TABLE_COLUMNS_SQL=SQL(",\n".join(["`"+c.column_name+"`" for c in param.BUGS_TABLE_COLUMNS]))
        param.BUGS_TABLE_COLUMNS=Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
        param.END_TIME=CNV.datetime2milli(datetime.utcnow())
        param.START_TIME=0
        param.alias_file=settings.param.alias_file
        param.BUG_IDS_PARTITION=SQL("bug_id in {{bugs}}", {"bugs":db.quote(settings.param.bugs)})

        etl(db, candidate, param)

        #COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)

Beispiel #7

0

Datei anzeigen

Datei: test_etl.py Projekt: markrcote/Bugzilla-ETL

def random_sample_of_bugs(settings):
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [
                b
                for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                if b not in private_bugs
            ]

            #SETUP RUN PARAMETERS
            param = Struct()
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(
                db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(
                ["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS,
                                                "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}",
                                          {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                #COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings,
                                            some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
                else:
                    pass
            except Exception, e:
                D.warning("Total faiure during compare of bugs {{bugs}}",
                          {"bugs": some_bugs}, e)

Beispiel #8

0

Datei anzeigen

Datei: test_etl.py Projekt: klahnakoski/Bugzilla-ETL

    def random_sample_of_bugs(self):
        """
        I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY WORKS
        WHEN I HAVE A REFERENCE TO COMPARE TO
        """
        NUM_TO_TEST = 100
        MAX_BUG_ID = 900000

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
            reference = ElasticSearch(self.settings.private_bugs_reference)

            #GO FASTER BY STORING LOCAL FILE
            local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
            if local_cache.exists:
                private_bugs = set(CNV.JSON2object(local_cache.read()))
            else:
                with Timer("get private bugs"):
                    private_bugs = compare_es.get_private_bugs(reference)
                    local_cache.write(CNV.object2JSON(private_bugs))

            while True:
                some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

                Log.note("Test with the following bug_ids: {{bugs}}", {"bugs":some_bugs})

                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                param.start_time = 0
                param.start_time_str = extract_bugzilla.milli2string(db, 0)
                param.alias_file = self.settings.param.alias_file

                try:
                    with ThreadedQueue(candidate, 100) as output:
                        etl(db, output, param, please_stop=None)

                    #COMPARE ALL BUGS
                    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                    found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                    if found_errors:
                        Log.note("Errors found")
                        break
                    else:
                        pass
                except Exception, e:
                    Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)

Beispiel #9

0

Datei anzeigen

Datei: test_etl.py Projekt: klahnakoski/Bugzilla-ETL

    def test_incremental_etl_catches_tracking_flags(self):
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
            param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([813650])
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, 813650)

            flags = ["cf_status_firefox18", "cf_status_firefox19", "cf_status_firefox_esr17", "cf_status_b2g18"]
            for v in versions:
                if v.modified_ts>param.start_time:
                    for f in flags:
                        if v[f] != "fixed":
                            Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})

Beispiel #10

0

Datei anzeigen

Datei: test_etl.py Projekt: klahnakoski/Bugzilla-ETL

    def test_ambiguous_whiteboard_screened(self):
        GOOD_BUG_TO_TEST=1046

        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

            #MARK BUG AS ONE OF THE SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
            #MARK BUG AS ONE OF THE *NOT* SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST, "not screened")
            db.flush()

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([GOOD_BUG_TO_TEST]) # bug 1046 sees lots of whiteboard, and other field, changes
            param.allow_private_bugs = True

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

            for v in versions:
                if v.status_whiteboard not in (None, "", "[screened]"):
                    Log.error("Expecting whiteboard to be screened")

Beispiel #11

0

Datei anzeigen

Datei: test_etl.py Projekt: klahnakoski/Bugzilla-ETL

    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
        THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
        """
        # settings.param.allow_private_bugs = True
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
            reference = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #COMPARE ALL BUGS
            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            compare_both(candidate, reference, self.settings, self.settings.param.bugs)

Beispiel #12

0

Datei anzeigen

def add_bug_group(db, bug_id, group_name):
    group_exists = db.query("SELECT id FROM groups WHERE name={{name}}",
                            {"name": group_name})
    if not group_exists:
        db.insert(
            "groups", {
                "name": group_name,
                "description": group_name,
                "isbuggroup": 1,
                "userregexp": 0
            })
        group_exists = db.query("SELECT id FROM groups WHERE name={{name}}",
                                {"name": group_name})
    group_id = group_exists[0].id

    diff(db, "bugs", Struct(bug_id=bug_id, bug_group=None),
         Struct(bug_id=bug_id, bug_group=group_name))
    db.insert("bug_group_map", {"bug_id": bug_id, "group_id": group_id})

Beispiel #13

0

Datei anzeigen

def flatten_bugs_record(r, output):
    for field_name, value in r.items():
        if value != "---":
            newRow = Struct()
            newRow.bug_id = r.bug_id
            newRow.modified_ts = r.modified_ts
            newRow.modified_by = r.modified_by
            newRow.field_name = field_name
            newRow.new_value = value
            newRow._merge_order = 1
            output.append(newRow)

Beispiel #14

0

Datei anzeigen

Datei: elasticsearch.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

class Fake_ES():
    def __init__(self, settings):
        self.settings = wrap({"host":"fake", "index":"fake"})
        self.filename = settings.filename
        try:
            self.data = CNV.JSON2object(File(self.filename).read())
        except IOError:
            self.data = Struct()


    def search(self, query):
        query=wrap(query)
        f = CNV.esfilter2where(query.query.filtered.filter)
        filtered=wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
        if query.fields:
            return wrap({"hits": {"total":len(filtered), "hits": [{"_id":d._id, "fields":unwrap(Q.select([unwrap(d._source)], query.fields)[0])} for d in filtered]}})
        else:
            return wrap({"hits": {"total":len(filtered), "hits": filtered}})

    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        records = {v["id"]: v["value"] for v in records}

        struct.unwrap(self.data).update(records)

        data_as_json = CNV.object2JSON(self.data, pretty=True)

        File(self.filename).write(data_as_json)
        Log.note("{{num}} items added", {"num": len(records)})

    def add(self, record):
        if isinstance(record, list):
            Log.error("no longer accepting lists, use extend()")
        return self.extend([record])

    def delete_record(self, filter):
        f = CNV.esfilter2where(filter)
        self.data = wrap({k: v for k, v in self.data.items() if not f(v)})

    def set_refresh_interval(self, seconds):
        pass

Beispiel #15

0

Datei anzeigen

Datei: extract_bugzilla.py Projekt: klahnakoski/Bugzilla-ETL

def flatten_bugs_record(r, output):
    for field_name, value in r.items():
        if value != "---":
            newRow = Struct()
            newRow.bug_id = r.bug_id
            newRow.modified_ts = r.modified_ts
            newRow.modified_by = r.modified_by
            newRow.field_name = field_name
            newRow.new_value = value
            newRow._merge_order = 1
            output.append(newRow)

Beispiel #16

0

Datei anzeigen

def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(
        settings.param.first_run_time).exists and not File(
            settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(
                settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(
                    settings.param.look_back,
                    5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(
                    db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments,
                                        output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es,
                                 es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias,
                              settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(
            unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)

Beispiel #17

0

Datei anzeigen

Datei: test_etl.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

    def random_sample_of_bugs(self):
        """
        I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY WORKS
        WHEN I HAVE A REFERENCE TO COMPARE TO
        """
        NUM_TO_TEST = 100
        MAX_BUG_ID = 900000

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.candidate)
            reference = ElasticSearch(self.settings.private_bugs_reference)

            #GO FASTER BY STORING LOCAL FILE
            local_cache = File(self.settings.param.temp_dir +
                               "/private_bugs.json")
            if local_cache.exists:
                private_bugs = set(CNV.JSON2object(local_cache.read()))
            else:
                with Timer("get private bugs"):
                    private_bugs = compare_es.get_private_bugs(reference)
                    local_cache.write(CNV.object2JSON(private_bugs))

            while True:
                some_bugs = [
                    b for b in
                    [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                    if b not in private_bugs
                ]

                Log.note("Test with the following bug_ids: {{bugs}}",
                         {"bugs": some_bugs})

                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                param.start_time = 0
                param.start_time_str = extract_bugzilla.milli2string(db, 0)
                param.alias_file = self.settings.param.alias_file

                try:
                    with ThreadedQueue(candidate, 100) as output:
                        etl(db, output, param, please_stop=None)

                    #COMPARE ALL BUGS
                    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                    found_errors = compare_both(candidate, reference,
                                                self.settings, some_bugs)
                    if found_errors:
                        Log.note("Errors found")
                        break
                    else:
                        pass
                except Exception, e:
                    Log.warning(
                        "Total failure during compare of bugs {{bugs}}",
                        {"bugs": some_bugs}, e)

Beispiel #18

0

Datei anzeigen

def flatten_attachments(data):
    output = []
    for r in data:
        for k, v in r.items():
            if k == "bug_id":
                continue
            output.append(
                Struct(
                    bug_id=r.bug_id,
                    modified_ts=r.modified_ts,
                    modified_by=r.modified_by,
                    field_name=k,
                    new_value=v,  #THESE NAMES HAVE DOTS IN THEM
                    attach_id=r.attach_id,
                    _merge_order=7))
    return output

Beispiel #19

0

Datei anzeigen

Datei: test_etl.py Projekt: markrcote/Bugzilla-ETL

def random_sample_of_bugs(settings):
    NUM_TO_TEST=100
    MAX_BUG_ID=900000


    with DB(settings.bugzilla) as db:
        candidate=Fake_ES(settings.fake_es)
        reference=ElasticSearch(settings.reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache=File(settings.param.temp_dir+"/private_bugs.json")
        if local_cache.exists:
            private_bugs=set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs= compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs=[b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

            #SETUP RUN PARAMETERS
            param=Struct()
            param.BUGS_TABLE_COLUMNS=get_bugs_table_columns(db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL=SQL(",\n".join(["`"+c.column_name+"`" for c in param.BUGS_TABLE_COLUMNS]))
            param.BUGS_TABLE_COLUMNS=Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
            param.END_TIME=CNV.datetime2milli(datetime.utcnow())
            param.START_TIME=0
            param.alias_file=settings.param.alias_file
            param.BUG_IDS_PARTITION=SQL("bug_id in {{bugs}}", {"bugs":db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                #COMPARE ALL BUGS
                found_errors=compare_both(candidate, reference, settings, some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
                else:
                    pass
            except Exception, e:
                D.warning("Total faiure during compare of bugs {{bugs}}", {"bugs":some_bugs}, e)

Beispiel #20

0

Datei anzeigen

Datei: alias_analysis.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

def aggregator(data):
    """
    FLATTEN CC LISTS OVER TIME BY BUG
    MULTISET COUNTS THE NUMBER OF EMAIL AT BUG CREATION
    NEGATIVE MEANS THERE WAS AN ADD WITHOUT A REMOVE (AND NOT IN CURRENT LIST)
    """
    for d in data:
        new_emails = Q.map2set(split_email(d.new_value), alias)
        old_emails = Q.map2set(split_email(d.old_value), alias)

        for e in new_emails | old_emails:
            details = aliases.get(e, Struct())
            aliases[e] = details

        agg = bugs.get(d.bug_id, Multiset(allow_negative=True))
        agg = agg - new_emails
        agg = agg + old_emails
        bugs[d.bug_id] = agg

Beispiel #21

0

Datei anzeigen

Datei: test_one_etl.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
        """
        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.elasticsearch)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)

Beispiel #22

0

Datei anzeigen

Datei: test_etl.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

    def test_incremental_etl_catches_tracking_flags(self):
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate",
                                                  self.settings.candidate)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
            param.start_time = CNV.datetime2milli(
                CNV.string2datetime("02/01/2013 10:09:15",
                                    "%d/%m/%Y %H:%M:%S"))
            param.start_time_str = extract_bugzilla.milli2string(
                db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([813650])
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, 813650)

            flags = [
                "cf_status_firefox18", "cf_status_firefox19",
                "cf_status_firefox_esr17", "cf_status_b2g18"
            ]
            for v in versions:
                if v.modified_ts > param.start_time:
                    for f in flags:
                        if v[f] != "fixed":
                            Log.error("813650 should have {{flag}}=='fixed'",
                                      {"flag": f})

Beispiel #23

0

Datei anzeigen

Datei: test_one_etl.py Projekt: klahnakoski/Bugzilla-ETL

    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
        """
        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance("candidate", self.settings.elasticsearch)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)

Beispiel #24

0

Datei anzeigen

def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set([k for k, v in old_record.items() if v != new_record[k]])

    if table != u"bugs":
        prefix = table + u"."
    else:
        prefix = u""

    for c in changed:
        fieldid = db.query(
            "SELECT id FROM fielddefs WHERE name={{field_name}}",
            {"field_name": prefix + c})[0].id

        if fieldid == None:
            Log.error("Expecting a valid field name")

        activity = Struct(bug_id=old_record.bug_id,
                          who=1,
                          bug_when=now,
                          fieldid=fieldid,
                          removed=old_record[c],
                          added=new_record[c],
                          attach_id=old_record.attach_id,
                          comment_id=old_record.comment_id)
        db.insert("bugs_activity", activity)

    db.execute(
        "UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
            "now": now,
            "where": esfilter2sqlwhere(db,
                                       {"term": {
                                           "bug_id": old_record.bug_id
                                       }})
        })

Beispiel #25

0

Datei anzeigen

Datei: bz_etl.py Projekt: klahnakoski/Bugzilla-ETL

def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)

Beispiel #26

0

Datei anzeigen

Datei: test_etl.py Projekt: klahnakoski/Bugzilla-ETL

    def test_incremental_has_correct_expires_on(self):
        # 813650, 726635 BOTH HAVE CHANGES IN 2013
        bugs = struct.wrap([813650, 726635])
        start_incremental=CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        with DB(self.settings.bugzilla) as db:
            #SETUP FIRST RUN PARAMETERS
            param = Struct()
            param.end_time = start_incremental
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #SETUP INCREMENTAL RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(datetime.utcnow())
            param.start_time = start_incremental
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

        for b in bugs:
            results = es.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and":[
                        {"term":{"bug_id":b}},
                        {"range":{"expires_on":{"gte":CNV.datetime2milli(datetime.utcnow())}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id"]
            })

            if results.hits.total>1:
                Log.error("Expecting only one active bug_version record")

Beispiel #27

0

Datei anzeigen

Datei: test_etl.py Projekt: Mozilla-GitHub-Standards/4f2f5414775d201b1bb4e317ed501b9d9afa2bd4f57359fded8f19d8638def96

    def test_incremental_has_correct_expires_on(self):
        # 813650, 726635 BOTH HAVE CHANGES IN 2013
        bugs = struct.wrap([813650, 726635])
        start_incremental = CNV.datetime2milli(
            CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

        es = elasticsearch.make_test_instance("candidate",
                                              self.settings.candidate)
        with DB(self.settings.bugzilla) as db:
            #SETUP FIRST RUN PARAMETERS
            param = Struct()
            param.end_time = start_incremental
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(
                db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #SETUP INCREMENTAL RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(datetime.utcnow())
            param.start_time = start_incremental
            param.start_time_str = extract_bugzilla.milli2string(
                db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

        for b in bugs:
            results = es.search({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": {
                            "and": [{
                                "term": {
                                    "bug_id": b
                                }
                            }, {
                                "range": {
                                    "expires_on": {
                                        "gte":
                                        CNV.datetime2milli(datetime.utcnow())
                                    }
                                }
                            }]
                        }
                    }
                },
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id"]
            })

            if results.hits.total > 1:
                Log.error("Expecting only one active bug_version record")