def main(settings):
    file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(file.read())

    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    data = [
        {
            "lost": n,
            "found": d.canonical
        }
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical
    ]

    sorted_data = Q.sort(data, "found")
    for s in sorted_data:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))
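For reference, struct.inverse above is used to turn the {alias: canonical} map into {canonical: [aliases]}. A plain-Python sketch of that one-to-many inversion (the dict-based stand-in below is an assumption, not the library's actual implementation):

# Plain-Python sketch: invert {alias: canonical} into {canonical: [aliases]},
# mirroring how struct.inverse(clean) is used above (hypothetical stand-in).
def inverse(mapping):
    output = {}
    for alias_name, canonical in mapping.items():
        output.setdefault(canonical, []).append(alias_name)
    return output

clean = {"jsmith": "john@example.com", "john.s": "john@example.com"}
print(inverse(clean))  # {'john@example.com': ['jsmith', 'john.s']} (order may vary)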
Example #2
def get_private_bugs(es):
    """
    FIND THE BUGS WE DO NOT EXPECT TO BE FOUND IN PUBLIC
    """
    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"script": {"script": "true"}},
                {"and": [{"exists": {"field": "bug_group"}}]}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": [],
        "facets": {},
        "fields": ["bug_id", "blocked", "dependson", "dupe_of", "dupe_by"]
    })

    with Timer("aggregate es results on private bugs"):
        output = set([])
        for bug in data.hits.hits:
            output.add(bug.fields.bug_id)
            output |= set(nvl(CNV.value2intlist(bug.fields.blocked), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dependson), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dupe_of), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dupe_by), []))

    output.update({551988, 636964})  # set.add() TAKES ONE ELEMENT; update() ADDS BOTH IDS
    return output
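The union-building loop depends on nvl() because a hit may lack any of the link fields. A self-contained sketch of the same aggregation over plain dicts, where (x or []) stands in for nvl(x, []) and the sample records are made up:

# Self-contained sketch of the id-union aggregation above, using plain sets.
hits = [
    {"bug_id": 1, "blocked": [2, 3], "dupe_of": None},
    {"bug_id": 4, "dependson": [1]},
]
output = set()
for fields in hits:
    output.add(fields["bug_id"])
    for link in ("blocked", "dependson", "dupe_of", "dupe_by"):
        output |= set(fields.get(link) or [])  # missing/None field -> empty list
print(sorted(output))  # [1, 2, 3, 4]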
Example #3
def main(settings):
    file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(file.read())

    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    data = [{
        "lost": n,
        "found": d.canonical
    } for n, d in aliases.items() if d.canonical != None and n != d.canonical]

    sorted_data = Q.sort(data, "found")
    for s in sorted_data:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))
Example #4
    def test_incremental_etl_catches_tracking_flags(self):
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
            param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([813650])
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, 813650)

            flags = ["cf_status_firefox18", "cf_status_firefox19", "cf_status_firefox_esr17", "cf_status_b2g18"]
            for v in versions:
                if v.modified_ts > param.start_time:
                    for f in flags:
                        if v[f] != "fixed":
                            Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})
Example #5
def rename_attachments(bug_version):
    if bug_version.attachments == None: return bug_version
    if not USE_ATTACHMENTS_DOT:
        bug_version.attachments = CNV.JSON2object(
            CNV.object2JSON(bug_version.attachments).replace(
                "attachments.", "attachments_"))
    return bug_version
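The rename works by serializing to JSON, string-replacing the prefix, and parsing back. The same trick with the stdlib json module (the sample payload is made up):

import json

# Rename "attachments."-prefixed keys via a JSON round trip, as above.
doc = {"attachments.mimetype": "text/plain", "attachments.isobsolete": 0}
renamed = json.loads(json.dumps(doc).replace("attachments.", "attachments_"))
print(renamed)  # {'attachments_mimetype': 'text/plain', 'attachments_isobsolete': 0}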
Example #6
    def random_sample_of_bugs(self):
        """
        I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY WORKS
        WHEN I HAVE A REFERENCE TO COMPARE TO
        """
        NUM_TO_TEST = 100
        MAX_BUG_ID = 900000

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.candidate)
            reference = ElasticSearch(self.settings.private_bugs_reference)

            #GO FASTER BY STORING LOCAL FILE
            local_cache = File(self.settings.param.temp_dir +
                               "/private_bugs.json")
            if local_cache.exists:
                private_bugs = set(CNV.JSON2object(local_cache.read()))
            else:
                with Timer("get private bugs"):
                    private_bugs = compare_es.get_private_bugs(reference)
                    local_cache.write(CNV.object2JSON(private_bugs))

            while True:
                some_bugs = [
                    b for b in
                    [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                    if b not in private_bugs
                ]

                Log.note("Test with the following bug_ids: {{bugs}}",
                         {"bugs": some_bugs})

                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                param.start_time = 0
                param.start_time_str = extract_bugzilla.milli2string(db, 0)
                param.alias_file = self.settings.param.alias_file

                try:
                    with ThreadedQueue(candidate, 100) as output:
                        etl(db, output, param, please_stop=None)

                    #COMPARE ALL BUGS
                    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                    found_errors = compare_both(candidate, reference,
                                                self.settings, some_bugs)
                    if found_errors:
                        Log.note("Errors found")
                        break
                except Exception, e:
                    Log.warning(
                        "Total failure during compare of bugs {{bugs}}",
                        {"bugs": some_bugs}, e)
Example #7
def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(
        settings.param.first_run_time).exists and not File(
            settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(
                settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE GO BACK 5 MINUTES, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(
                    settings.param.look_back,
                    5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(
                    db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments,
                                        output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es,
                                 es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias,
                              settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(
            unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
Example #8
def random_sample_of_bugs(settings):
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [
                b
                for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                if b not in private_bugs
            ]

            #SETUP RUN PARAMETERS
            param = Struct()
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(
                db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(
                ["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS,
                                                "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}",
                                          {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                #COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings,
                                            some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
            except Exception, e:
                D.warning("Total failure during compare of bugs {{bugs}}",
                          {"bugs": some_bugs}, e)
Example #9
    def test_confidential_whiteboard_is_screened(self):
        leaked_whiteboard = get(
            self.private,
            {
                "and": [
                    {
                        "terms": {
                            "bug_group": SCREENED_WHITEBOARD_BUG_GROUPS
                        }
                    },
                    {
                        "exists": {
                            "field": "status_whiteboard"
                        }
                    },
                    {
                        "not": {
                            "terms": {
                                "status_whiteboard": ["", "[screened]"]
                            }
                        }
                    },
                    {
                        "range": {
                            "expires_on": {
                                "gte": NOW
                            }
                        }
                    },  #CURRENT RECORDS
                    {
                        "range": {
                            "modified_ts": {
                                "lt": A_WHILE_AGO
                            }
                        }
                    },  #OF A MINIMUM AGE
                ]
            },
            fields=[
                "bug_id", "product", "component", "status_whiteboard",
                "bug_group", "modified_ts"
            ],
            limit=100)

        if leaked_whiteboard:
            for l in leaked_whiteboard:
                l.modified_ts = CNV.datetime2string(
                    CNV.milli2datetime(l.modified_ts))

            Log.error("Whiteboard leaking:\b{{leak}}",
                      {"leak": leaked_whiteboard})
Example #10
    def test_incremental_has_correct_expires_on(self):
        # 813650, 726635 BOTH HAVE CHANGES IN 2013
        bugs = struct.wrap([813650, 726635])
        start_incremental = CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        with DB(self.settings.bugzilla) as db:
            #SETUP FIRST RUN PARAMETERS
            param = Struct()
            param.end_time = start_incremental
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #SETUP INCREMENTAL RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(datetime.utcnow())
            param.start_time = start_incremental
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

        for b in bugs:
            results = es.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and":[
                        {"term":{"bug_id":b}},
                        {"range":{"expires_on":{"gte":CNV.datetime2milli(datetime.utcnow())}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id"]
            })

            if results.hits.total > 1:
                Log.error("Expecting only one active bug_version record")
Example #11
    def test_ambiguous_whiteboard_screened(self):
        GOOD_BUG_TO_TEST = 1046

        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

            #MARK BUG AS ONE OF THE SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
            #MARK BUG AS ONE OF THE *NOT* SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST, "not screened")
            db.flush()

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([GOOD_BUG_TO_TEST]) # bug 1046 sees lots of whiteboard, and other field, changes
            param.allow_private_bugs = True

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

            for v in versions:
                if v.status_whiteboard not in (None, "", "[screened]"):
                    Log.error("Expecting whiteboard to be screened")
Example #12
def replicate(source, destination, pending, last_updated):
    """
    COPY source RECORDS TO destination
    """
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"modified_ts":
                            {"gte": CNV.datetime2milli(last_updated)}
                        }}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            d2 = map(
                lambda(x): {"id": x.id, "value": x},
                map(
                    lambda(x): transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x._source), old_school=True),
                    data.hits.hits
                )
            )
            destination.extend(d2)
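Q.groupby(pending, max_size=BATCH_SIZE) is used here purely for fixed-size batching. A stdlib sketch of that chunking (the helper below is hypothetical, not the library's API):

# Fixed-size batching, standing in for Q.groupby(pending, max_size=...).
def batches(items, max_size):
    items = list(items)
    for start in range(0, len(items), max_size):
        yield start // max_size, items[start:start + max_size]

for g, bugs in batches([537285, 637285, 737285], max_size=2):
    print(g, bugs)  # 0 [537285, 637285], then 1 [737285]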
Example #13
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    found_errors = False  # TRACK ERRORS ACROSS ALL BUGS, NOT JUST THE LAST ITERATION
    for bug_id in some_bugs:
        versions = Q.sort(
            get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
            "modified_ts")
        # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
        if len(versions) == 0:
            max_time = datetime.utcnow()
        else:
            max_time = CNV.milli2datetime(versions[-1].modified_ts)

        ref_versions = Q.sort(
            map(compare_es.old2new,
                get_all_bug_versions(reference, bug_id, max_time)),
            "modified_ts")

        can = json.dumps(json_scrub(versions),
                         indent=4,
                         sort_keys=True,
                         separators=(',', ': '))
        ref = json.dumps(json_scrub(ref_versions),
                         indent=4,
                         sort_keys=True,
                         separators=(',', ': '))
        if can != ref:
            found_errors = True
            File(settings.param.errors + "/try/" + str(bug_id) +
                 ".txt").write(can)
            File(settings.param.errors + "/exp/" + str(bug_id) +
                 ".txt").write(ref)

    return found_errors
Example #14
    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
        THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
        """
        # settings.param.allow_private_bugs = True
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
            reference = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #COMPARE ALL BUGS
            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            compare_both(candidate, reference, self.settings, self.settings.param.bugs)
Example #15
def main(settings):

    #MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        #REAL ES
#        if settings.candidate.alias is None:
#            settings.candidate.alias=settings.candidate.index
#            settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
#        candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate = Fake_ES(settings.fake_es)

        reference = ElasticSearch(settings.reference)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
        param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
        param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
        param.END_TIME = CNV.datetime2milli(datetime.utcnow())
        param.START_TIME = 0
        param.alias_file = settings.param.alias_file
        param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(settings.param.bugs)})

        etl(db, candidate, param)

        #COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)
Example #16
    def __init__(self, settings):
        self.settings = wrap({"host": "fake", "index": "fake"})
        self.filename = settings.filename
        try:
            self.data = CNV.JSON2object(File(self.filename).read())
        except IOError:
            self.data = Struct()
Example #17
def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set([k for k, v in old_record.items() if v != new_record[k]])

    if table != u"bugs":
        prefix = table + u"."
    else:
        prefix = u""

    for c in changed:
        fieldid = db.query("SELECT id FROM fielddefs WHERE name={{field_name}}", {"field_name": prefix + c})[0].id

        if fieldid == None:
            Log.error("Expecting a valid field name")

        activity = Struct(
            bug_id=old_record.bug_id,
            who=1,
            bug_when=now,
            fieldid=fieldid,
            removed=old_record[c],
            added=new_record[c],
            attach_id=old_record.attach_id,
            comment_id=old_record.comment_id
        )
        db.insert("bugs_activity", activity)

    db.execute("UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
        "now":now,
        "where":esfilter2sqlwhere(db, {"term":{"bug_id":old_record.bug_id}})
    })
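The changed-field computation is the symmetric difference of the two key sets, plus any shared key whose value differs. A stdlib demonstration on two plain records (the sample data is made up; intersecting the key sets first avoids the KeyError a plain dict would raise where this codebase's Struct returns Null):

# Changed fields = keys in only one record, plus shared keys whose values differ.
old_record = {"bug_id": 1, "status": "NEW", "priority": "P1"}
new_record = {"bug_id": 1, "status": "FIXED", "resolution": "FIXED"}

changed = set(old_record.keys()) ^ set(new_record.keys())
changed |= {k for k in set(old_record) & set(new_record)
            if old_record[k] != new_record[k]}
print(sorted(changed))  # ['priority', 'resolution', 'status']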
Example #18
def get_pending(es, since):
    result = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {
            "range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
    })

    if len(result.facets.default.terms) >= 200000:
        Log.error("Can not handle more than 200K bugs changed")

    pending_bugs = Multiset(
        result.facets.default.terms,
        key_field="term",
        count_field="count"
    )
    Log.note("Source has {{num}} bug versions for updating", {
        "num": len(pending_bugs)
    })
    return pending_bugs
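The facet result is a list of {term, count} rows; Multiset just turns it into a bag keyed by bug_id. A collections.Counter sketch of the same idea (the sample facet rows are made up):

from collections import Counter

# Facet rows -> bag of bug_ids, standing in for Multiset(key_field, count_field).
terms = [{"term": 813650, "count": 3}, {"term": 726635, "count": 1}]
pending_bugs = Counter({row["term"]: row["count"] for row in terms})
print(sum(pending_bugs.values()))  # 4 pending bug versions across 2 bugs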
Example #19
def get_all_bug_versions(es, bug_id, max_time):

    data = es.search({
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": {
                    "and": [{
                        "term": {
                            "bug_id": bug_id
                        }
                    }, {
                        "range": {
                            "modified_ts": {
                                "lte": CNV.datetime2milli(max_time)
                            }
                        }
                    }]
                }
            }
        },
        "from": 0,
        "size": 200000,
        "sort": []
    })

    return Q.select(data.hits.hits, "_source")
Example #20
def main(settings):

    #MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        #REAL ES
        #        if settings.candidate.alias is None:
        #            settings.candidate.alias=settings.candidate.index
        #            settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        #        candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate = Fake_ES(settings.fake_es)

        reference = ElasticSearch(settings.reference)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(
            db, settings.bugzilla.schema)
        param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(
            ["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
        param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS,
                                            "column_name")
        param.END_TIME = CNV.datetime2milli(datetime.utcnow())
        param.START_TIME = 0
        param.alias_file = settings.param.alias_file
        param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}",
                                      {"bugs": db.quote(settings.param.bugs)})

        etl(db, candidate, param)

        #COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)
Example #21
    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
        THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
        """
        # settings.param.allow_private_bugs = True
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.candidate)
            reference = elasticsearch.open_test_instance(
                "reference", self.settings.private_bugs_reference)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)

            #COMPARE ALL BUGS
            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            compare_both(candidate, reference, self.settings,
                         self.settings.param.bugs)
Example #22
    def test_whiteboard_screened(self):
        GOOD_BUG_TO_TEST = 1046

        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate",
                                                  self.settings.candidate)

            #MARK BUG AS ONE OF THE SCREENED GROUPS
            database.add_bug_group(db, GOOD_BUG_TO_TEST,
                                   SCREENED_WHITEBOARD_BUG_GROUPS[0])
            db.flush()

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([
                GOOD_BUG_TO_TEST
            ])  # bug 1046 sees lots of whiteboard, and other field, changes
            param.allow_private_bugs = True

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

            for v in versions:
                if v.status_whiteboard not in (None, "", "[screened]"):
                    Log.error("Expecting whiteboard to be screened")
Example #23
    def random_sample_of_bugs(self):
        """
        I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY WORKS
        WHEN I HAVE A REFERENCE TO COMPARE TO
        """
        NUM_TO_TEST = 100
        MAX_BUG_ID = 900000

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
            reference = ElasticSearch(self.settings.private_bugs_reference)

            #GO FASTER BY STORING LOCAL FILE
            local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
            if local_cache.exists:
                private_bugs = set(CNV.JSON2object(local_cache.read()))
            else:
                with Timer("get private bugs"):
                    private_bugs = compare_es.get_private_bugs(reference)
                    local_cache.write(CNV.object2JSON(private_bugs))

            while True:
                some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

                Log.note("Test with the following bug_ids: {{bugs}}", {"bugs":some_bugs})

                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                param.start_time = 0
                param.start_time_str = extract_bugzilla.milli2string(db, 0)
                param.alias_file = self.settings.param.alias_file

                try:
                    with ThreadedQueue(candidate, 100) as output:
                        etl(db, output, param, please_stop=None)

                    #COMPARE ALL BUGS
                    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                    found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                    if found_errors:
                        Log.note("Errors found")
                        break
                except Exception, e:
                    Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
Example #24
    def search(self, query):
        query = wrap(query)
        f = CNV.esfilter2where(query.query.filtered.filter)
        filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
        if query.fields:
            hits = [
                {"_id": d._id, "fields": unwrap(Q.select([unwrap(d._source)], query.fields)[0])}
                for d in filtered
            ]
            return wrap({"hits": {"total": len(filtered), "hits": hits}})
        else:
            return wrap({"hits": {"total": len(filtered), "hits": filtered}})
Example #25
def get_current_time(db):
    """
    RETURN GMT TIME
    """
    output = db.query(u"""
        SELECT
            UNIX_TIMESTAMP(now()) `value`
        """)[0].value
    return CNV.unix2datetime(output)
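UNIX_TIMESTAMP(now()) returns seconds since the epoch, so CNV.unix2datetime is expected to behave like the stdlib conversion below (a hypothetical stand-in):

from datetime import datetime

# Seconds-since-epoch -> naive GMT datetime, as CNV.unix2datetime is used above.
def unix2datetime(seconds):
    return datetime.utcfromtimestamp(seconds)

print(unix2datetime(1356998400))  # 2013-01-01 00:00:00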
Example #27
def old2new(bug):
    #THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION
    bug.id = bug._id.replace(".", "_")[:-3]
    bug._id = None

    if bug.everconfirmed is not None: bug.everconfirmed = int(bug.everconfirmed)
    if bug.votes is not None: bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    if bug.votes == 0: del bug["votes"]
    if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0: del bug["remaining_time"]
    if bug.cf_due_date is not None: bug.cf_due_date = CNV.datetime2milli(CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d"))
    if bug.everconfirmed == 0: del bug["everconfirmed"]

    try:
        bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass
Example #28
def random_sample_of_bugs(settings):
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

            #SETUP RUN PARAMETERS
            param = Struct()
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                #COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings, some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
            except Exception, e:
                D.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
Example #29
def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE GO BACK 5 MINUTES, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
Example #30
    def test_confidential_whiteboard_is_screened(self):
        leaked_whiteboard = get(
            self.private,
            {"and": [
                {"terms": {"bug_group": SCREENED_WHITEBOARD_BUG_GROUPS}},
                {"exists": {"field": "status_whiteboard"}},
                {"not": {"terms": {"status_whiteboard": ["", "[screened]"]}}},
                {"range": {"expires_on": {"gte": NOW}}}, #CURRENT RECORDS
                {"range": {"modified_ts": {"lt": A_WHILE_AGO}}}, #OF A MINIMUM AGE
            ]},
            fields=["bug_id", "product", "component", "status_whiteboard", "bug_group", "modified_ts"],
            limit=100
        )

        if leaked_whiteboard:
            for l in leaked_whiteboard:
                l.modified_ts = CNV.datetime2string(CNV.milli2datetime(l.modified_ts))

            Log.error("Whiteboard leaking:\n{{leak}}", {"leak": leaked_whiteboard})
Example #31
    def stop(cls):
        if cls.profiler:
            from bzETL.util.cnv import CNV
            from bzETL.util.env.files import File

            p = pstats.Stats(cls.profiler)
            stats = [{
                "num_calls":d[1],
                "self_time":d[2],
                "total_time":d[3],
                "file":(f[0] if f[0] != "~" else "").replace("\\", "/"),
                "line":f[1],
                "method":f[2].lstrip("<").rstrip(">")
            }
                for f, d, in p.stats.iteritems()
            ]
            CNV.list2tab(stats)
            File("profile.tab").write(CNV.list2tab(stats))

        cls.main_log.stop()
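pstats.Stats.stats maps (file, line, method) to (primitive_calls, call_count, self_time, cumulative_time, callers), which is where the d[1], d[2], d[3] indexes above come from. A minimal stdlib demonstration:

import cProfile
import pstats

# p.stats maps (file, line, method) -> (cc, nc, tt, ct, callers);
# nc = call count, tt = self time, ct = cumulative time, matching d[1..3] above.
profiler = cProfile.Profile()
profiler.enable()
sum(i * i for i in range(10000))
profiler.disable()

p = pstats.Stats(profiler)
for (f, line, method), d in p.stats.items():
    print("%s calls:%d self:%f total:%f" % (method, d[1], d[2], d[3]))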
Example #32
def test_replication():
    try:
        settings = startup.read_settings(filename="replication_settings.json")
        Log.start(settings.debug)

        source = ElasticSearch(settings.source)
        destination = replicate.get_or_create_index(settings["destination"], source)

        replicate.replicate(source, destination, [537285], CNV.string2datetime("19900101", "%Y%m%d"))
    finally:
        Log.stop()
Example #33
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(
                    reference, bug_id, max_time)
                ref_versions = \
                    Q.sort(
                        #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                        [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                        "modified_ts"
                    )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}",
                            {"bug_id": bug_id}, e)

        if found_errors:
            Log.error("DIFFERENCES FOUND (Differences shown in {{path}})",
                      {"path": [try_dir, ref_dir]})
Example #34
    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        records = {v["id"]: v["value"] for v in records}

        struct.unwrap(self.data).update(records)

        data_as_json = CNV.object2JSON(self.data, pretty=True)

        File(self.filename).write(data_as_json)
        Log.note("{{num}} items added", {"num": len(records)})
Example #35
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                ref_versions = \
                    Q.sort(
                        #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                        [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                        "modified_ts"
                    )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

        if found_errors:
            Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {
                "path": [try_dir, ref_dir]}
            )
Example #36
def old2new(bug):
    #THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION
    bug.id = bug._id.replace(".", "_")[:-3]
    bug._id = None

    if bug.everconfirmed is not None:
        bug.everconfirmed = int(bug.everconfirmed)
    if bug.votes is not None: bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    if bug.votes == 0: del bug["votes"]
    if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
        del bug["remaining_time"]
    if bug.cf_due_date is not None:
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d"))
    if bug.everconfirmed == 0: del bug["everconfirmed"]

    try:
        bug.cf_last_resolved = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass
Example #37
def old2new(bug, max_date):
    """
    CONVERT THE OLD ES FORMAT TO THE NEW
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION
    """
    if bug.everconfirmed != None:
        if bug.everconfirmed == "":
            bug.everconfirmed = None
        else:
            bug.everconfirmed = int(bug.everconfirmed)

    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

    if bug.expires_on > max_date:
        bug.expires_on = parse_bug_history.MAX_TIME
    if bug.votes != None:
        bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    if bug.votes == 0:
        del bug["votes"]
    # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
    #     bug.remaining_time = 0
    if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        )
    bug.changes = CNV.JSON2object(
        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
            .replace("\"field_value_removed\":", "\"old_value\":") \
            .replace("\"field_value\":", "\"new_value\":")
    )

    if bug.everconfirmed == 0:
        del bug["everconfirmed"]
    if bug.id == "692436_1336314345":
        bug.votes = 3

    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass
Example #38
def test_replication():
    try:
        settings = startup.read_settings(filename="replication_settings.json")
        Log.start(settings.debug)

        source = ElasticSearch(settings.source)
        destination = replicate.get_or_create_index(settings["destination"],
                                                    source)

        replicate.replicate(source, destination, [537285],
                            CNV.string2datetime("19900101", "%Y%m%d"))
    finally:
        Log.stop()
Example #39
def analysis(settings, last_run, please_stop):
    DIFF = 7
    if last_run:
        DIFF = 4      #ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    try_again = True

    while try_again and not please_stop:
        #FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for bug_id, agg in bugs.iteritems():
            #ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        problems = Q.sort([
            {"email": e, "count": c}
            for e, c in problem_agg.dic.iteritems()
            if not aliases.get(e, Null).ignore and (c <= -(DIFF / 2) or last_run)
        ], ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            #FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for bug_id, agg in bugs.iteritems():
                if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                    solution_agg += agg
            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])

            if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                #exact match
                pass
            elif len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                #not distinctive enough
                continue

            best_solution = solutions[0]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
            })
            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
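The matching rule in analysis(): aggregate candidate scores over every bug exhibiting the problem email, then accept the top candidate only when it beats the runner-up by DIFF. A compact Counter-based sketch of that acceptance rule (the sample data is made up):

from collections import Counter

# Score candidates across bugs showing the problem email; require the winner
# to beat the runner-up by DIFF -- the same acceptance rule as above.
DIFF = 7
bugs = {
    1: Counter({"alice@mozilla.com": 2, "alice@gmail.com": -2}),
    2: Counter({"alice@mozilla.com": 9, "alice@gmail.com": -1}),
}
problem = "alice@gmail.com"

solution_agg = Counter()
for agg in bugs.values():
    if agg.get(problem, 0) < 0:  # only bugs experiencing this problem
        solution_agg.update(agg)
solutions = solution_agg.most_common()
if len(solutions) > 1 and solutions[0][1] >= solutions[1][1] + DIFF:
    print("%s -> %s" % (problem, solutions[0][0]))  # distinctive enough to alias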
Example #40
def milli2datetime(r):
    """
    CONVERT ANY longs INTO TIME STRINGS
    """
    try:
        if r == None:
            return None
        elif isinstance(r, basestring):
            return r
        elif Math.is_number(r):
            if CNV.value2number(r) > 800000000000:
                return CNV.datetime2string(CNV.milli2datetime(r),
                                           "%Y-%m-%d %H:%M:%S")
            else:
                return r
        elif isinstance(r, dict):
            output = {}
            for k, v in r.items():
                v = milli2datetime(v)
                if v != None:
                    output[k.lower()] = v
            return output
        elif hasattr(r, '__iter__'):
            output = []
            for v in r:
                v = milli2datetime(v)
                if v != None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                return output
        else:
            return r
    except Exception, e:
        Log.warning("Can not scrub: {{json}}", {"json": r}, e)
Example #41
def get_last_updated(es):
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {
                    "range": {
                    "modified_ts": {"gte": CNV.datetime2milli(far_back)}}}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "modified_ts"}}}
        })

        if results.facets["0"].count == 0:
            return datetime.min
        return CNV.milli2datetime(results.facets["0"].max)
    except Exception, e:
        Log.error("Can not get_last_updated from {{host}}/{{index}}",{
            "host": es.settings.host,
            "index": es.settings.index
        }, e)
Example #42
    def test_incremental_etl_catches_tracking_flags(self):
        database.make_test_instance(self.settings.bugzilla)

        with DB(self.settings.bugzilla) as db:
            es = elasticsearch.make_test_instance("candidate",
                                                  self.settings.candidate)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
            param.start_time = CNV.datetime2milli(
                CNV.string2datetime("02/01/2013 10:09:15",
                                    "%d/%m/%Y %H:%M:%S"))
            param.start_time_str = extract_bugzilla.milli2string(
                db, param.start_time)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = struct.wrap([813650])
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

            Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
            versions = get_all_bug_versions(es, 813650)

            flags = [
                "cf_status_firefox18", "cf_status_firefox19",
                "cf_status_firefox_esr17", "cf_status_b2g18"
            ]
            for v in versions:
                if v.modified_ts > param.start_time:
                    for f in flags:
                        if v[f] != "fixed":
                            Log.error("813650 should have {{flag}}=='fixed'",
                                      {"flag": f})
Example #43
def main(settings):
    #USE A FILE
    if settings.source.filename != None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))

        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
Example #44
def milli2datetime(r):
    """
    CONVERT ANY longs INTO TIME STRINGS
    """
    try:
        if r == None:
            return None
        elif isinstance(r, basestring):
            return r
        elif Math.is_number(r):
            if CNV.value2number(r) > 800000000000:
                return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
            else:
                return r
        elif isinstance(r, dict):
            output = {}
            for k, v in r.items():
                v = milli2datetime(v)
                if v != None:
                    output[k.lower()] = v
            return output
        elif hasattr(r, '__iter__'):
            output = []
            for v in r:
                v = milli2datetime(v)
                if v != None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                return output
        else:
            return r
    except Exception, e:
        Log.warning("Can not scrub: {{json}}", {"json": r}, e)
Example #45
def get_all_bug_versions(es, bug_id, max_time):

    data=es.search({
        "query":{"filtered":{
            "query":{"match_all":{}},
            "filter":{"and":[
                {"term":{"bug_id":bug_id}},
                {"range":{"modified_ts":{"lte":CNV.datetime2milli(max_time)}}}
            ]}
        }},
        "from":0,
        "size":200000,
        "sort":[]
    })

    return Q.select(data.hits.hits, "_source")
Example #46
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })

        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
Example #47
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda (x): {"id": x.id, "value": x},
                    map(
                        lambda(x): transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{host}})", {
                    "block": g,
                    "filename": filename
                }, e)
Example #48
    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
        """
        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance("candidate", self.settings.elasticsearch)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)
Example #49
    def test_specific_bugs(self):
        """
        USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
        THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
        """
        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.elasticsearch)

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)

            param.alias_file = self.settings.param.alias_file
            param.bug_list = self.settings.param.bugs
            param.allow_private_bugs = self.settings.param.allow_private_bugs

            with ThreadedQueue(candidate, size=1000) as output:
                etl(db, output, param, please_stop=None)
Example #50
def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set([k for k, v in old_record.items() if v != new_record[k]])

    if table != u"bugs":
        prefix = table + u"."
    else:
        prefix = u""

    for c in changed:
        fieldid = db.query(
            "SELECT id FROM fielddefs WHERE name={{field_name}}",
            {"field_name": prefix + c})[0].id

        if fieldid == None:
            Log.error("Expecting a valid field name")

        activity = Struct(bug_id=old_record.bug_id,
                          who=1,
                          bug_when=now,
                          fieldid=fieldid,
                          removed=old_record[c],
                          added=new_record[c],
                          attach_id=old_record.attach_id,
                          comment_id=old_record.comment_id)
        db.insert("bugs_activity", activity)

    db.execute(
        "UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
            "now": now,
            "where": esfilter2sqlwhere(db,
                                       {"term": {
                                           "bug_id": old_record.bug_id
                                       }})
        })
Example #51
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    found_errors = False  # TRACK ERRORS ACROSS ALL BUGS, NOT JUST THE LAST ITERATION
    for bug_id in some_bugs:
        versions = Q.sort(
            get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
            "modified_ts")
        # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
        if len(versions) == 0:
            max_time = datetime.utcnow()
        else:
            max_time = CNV.milli2datetime(versions[-1].modified_ts)

        ref_versions = Q.sort(map(compare_es.old2new, get_all_bug_versions(reference, bug_id, max_time)), "modified_ts")

        can = json.dumps(json_scrub(versions), indent=4, sort_keys=True, separators=(',', ': '))
        ref = json.dumps(json_scrub(ref_versions), indent=4, sort_keys=True, separators=(',', ': '))
        if can != ref:
            found_errors = True
            File(settings.param.errors + "/try/" + str(bug_id) + ".txt").write(can)
            File(settings.param.errors + "/exp/" + str(bug_id) + ".txt").write(ref)

    return found_errors
Example #52
    def delete_record(self, filter):
        f = CNV.esfilter2where(filter)
        self.data = wrap({k: v for k, v in self.data.items() if not f(v)})
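The Fake_ES methods lean on CNV.esfilter2where to compile an ES filter into a Python predicate. A minimal sketch covering just the term/and/not cases used in this section (a hypothetical reimplementation, not the library's actual code):

# Minimal ES-filter -> predicate compiler covering term/and/not.
def esfilter2where(esfilter):
    if "term" in esfilter:
        [(field, value)] = list(esfilter["term"].items())
        return lambda doc: doc.get(field) == value
    if "and" in esfilter:
        subs = [esfilter2where(f) for f in esfilter["and"]]
        return lambda doc: all(f(doc) for f in subs)
    if "not" in esfilter:
        sub = esfilter2where(esfilter["not"])
        return lambda doc: not sub(doc)
    raise ValueError("unknown filter: %r" % esfilter)

f = esfilter2where({"and": [{"term": {"bug_id": 1}}, {"not": {"term": {"status": "CLOSED"}}}]})
print(f({"bug_id": 1, "status": "NEW"}))  # True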