def __init__(self, settings):
     """
     RECORD PLACEHOLDER host/index SETTINGS AND LOAD DATA FROM settings.filename

     settings.filename IS THE PATH OF THE JSON FILE TO READ.  WHEN AN IOError
     IS RAISED WHILE LOADING, self.data STARTS AS AN EMPTY Struct
     """
     self.settings = wrap({"host":"fake", "index":"fake"})
     self.filename = settings.filename
     try:
         content = File(self.filename).read()
         loaded = CNV.JSON2object(content)
     except IOError:
         # NOTHING READABLE AT THAT PATH: START EMPTY
         loaded = Struct()
     self.data = loaded
예제 #2
0
def main(settings):
    """
    REPORT ON THE ALIAS FILE NAMED BY settings.param.alias_file

    LOGS EVERY NAME WHOSE canonical DIFFERS FROM THE NAME ITSELF, THEN LOGS
    THE RESULT OF struct.inverse ON THE (NAME -> canonical) MAPPING, AND
    FINALLY LOGS EACH ENTRY OF THAT RESULT HAVING MORE THAN 3 ITEMS
    """
    alias_file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(alias_file.read())

    for details in aliases.values():
        details.candidates = CNV.dict2Multiset(details.candidates)

    # NOTE(review): `!= None` IS KEPT (NOT `is not None`); THE WRAPPED OBJECTS
    # PRESUMABLY RELY ON == SEMANTICS FOR MISSING VALUES - CONFIRM
    mismatches = []
    for name, details in aliases.items():
        if details.canonical != None and name != details.canonical:
            mismatches.append({
                "lost": name,
                "found": details.canonical
            })

    for pair in Q.sort(mismatches, "found"):
        Log.note("{{found}} == {{lost}}", pair)

    # SAME FILTER AS ABOVE, BUT THE EMPTY NAME IS ALSO EXCLUDED
    clean = {}
    for name, details in aliases.items():
        if details.canonical != None and name != details.canonical and name != "":
            clean[name] = details.canonical

    by_canonical = struct.inverse(clean)
    Log.note(CNV.object2JSON(by_canonical, pretty=True))

    for canonical, names in by_canonical.items():
        if len(names) <= 3:
            continue
        Log.note(CNV.object2JSON({canonical: names}, pretty=True))
예제 #3
0
def rename_attachments(bug_version):
    """
    WHEN USE_ATTACHMENTS_DOT IS FALSE, REPLACE EVERY "attachments." WITH
    "attachments_" IN THE JSON FORM OF bug_version.attachments

    bug_version IS UPDATED IN PLACE AND ALSO RETURNED.  NOTHING IS CHANGED
    WHEN THERE ARE NO ATTACHMENTS
    """
    # attachments IS CHECKED FIRST, SO THE FLAG IS ONLY CONSULTED WHEN NEEDED
    if bug_version.attachments == None or USE_ATTACHMENTS_DOT:
        return bug_version

    as_json = CNV.object2JSON(bug_version.attachments)
    renamed = as_json.replace("attachments.", "attachments_")
    bug_version.attachments = CNV.JSON2object(renamed)
    return bug_version
    def random_sample_of_bugs(self):
        """
        REPEATEDLY PICK RANDOM BUG IDS, RUN etl INTO THE candidate INDEX, AND
        COMPARE candidate AGAINST reference FOR THOSE IDS

        THE LOOP ENDS ONLY WHEN compare_both REPORTS ERRORS.  AN EXCEPTION
        DURING A ROUND IS LOGGED AS A WARNING AND THE NEXT ROUND STARTS.
        USEFUL ONLY WHEN A REFERENCE INDEX IS AVAILABLE TO COMPARE TO
        """
        sample_size = 100
        id_ceiling = 900000

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.candidate)
            reference = ElasticSearch(self.settings.private_bugs_reference)

            # THE PRIVATE BUG LIST IS KEPT IN A LOCAL FILE SO LATER RUNS START FASTER
            cache_file = File(
                self.settings.param.temp_dir + "/private_bugs.json")
            if not cache_file.exists:
                with Timer("get private bugs"):
                    private_bugs = compare_es.get_private_bugs(reference)
                    cache_file.write(CNV.object2JSON(private_bugs))
            else:
                private_bugs = set(CNV.JSON2object(cache_file.read()))

            while True:
                # DRAW sample_size IDS, THEN DROP THE PRIVATE ONES
                picks = [Random.int(id_ceiling) for _ in range(sample_size)]
                some_bugs = [b for b in picks if b not in private_bugs]

                Log.note("Test with the following bug_ids: {{bugs}}",
                         {"bugs": some_bugs})

                # RUN PARAMETERS FOR THIS ROUND
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                param.start_time = 0
                param.start_time_str = extract_bugzilla.milli2string(db, 0)
                param.alias_file = self.settings.param.alias_file

                try:
                    with ThreadedQueue(candidate, 100) as output:
                        etl(db, output, param, please_stop=None)

                    # ES NEEDS TIME TO INDEX BEFORE THE COMPARISON
                    Thread.sleep(2)
                    if compare_both(candidate, reference,
                                    self.settings, some_bugs):
                        Log.note("Errors found")
                        break
                except Exception as e:
                    Log.warning(
                        "Total failure during compare of bugs {{bugs}}",
                        {"bugs": some_bugs}, e)
예제 #5
0
def old2new(bug, max_date):
    """
    CONVERT THE OLD ES FORMAT TO THE NEW
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION

    bug      - THE BUG VERSION RECORD TO CONVERT (FIELDS ARE READ AND SET AS ATTRIBUTES)
    max_date - expires_on VALUES GREATER THAN THIS ARE REPLACED WITH
               parse_bug_history.MAX_TIME

    NOTE(review): NO return STATEMENT IS VISIBLE IN THIS BLOCK, YET bug IS
    REBOUND BELOW; THE END OF THE FUNCTION MAY BE MISSING FROM THIS VIEW
    """
    # everconfirmed: EMPTY STRING BECOMES None, ANY OTHER VALUE BECOMES AN int
    if bug.everconfirmed != None:
        if bug.everconfirmed == "":
            bug.everconfirmed = None
        else:
            bug.everconfirmed = int(bug.everconfirmed)

    # ROUND-TRIP THROUGH JSON TO DROP THE TRAILING SPACE FROM THIS TEXT WHEREVER
    # IT APPEARS.  bug IS REBOUND TO THE RESULT OF JSON2object FROM HERE ON
    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

    # CLAMP expires_on VALUES BEYOND max_date
    if bug.expires_on > max_date:
        bug.expires_on = parse_bug_history.MAX_TIME
    if bug.votes != None:
        bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    # ZERO VOTES ARE REMOVED RATHER THAN STORED
    if bug.votes == 0:
        del bug["votes"]
    # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
    #     bug.remaining_time = 0
    # NON-NUMERIC cf_due_date IS PARSED AS "%Y-%m-%d" AND CONVERTED WITH datetime2milli
    if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        )
    # SORT changes BY field_name, THEN RENAME THE KEYS IN THE JSON TEXT:
    #   field_value_removed -> old_value
    #   field_value         -> new_value
    bug.changes = CNV.JSON2object(
        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
            .replace("\"field_value_removed\":", "\"old_value\":") \
            .replace("\"field_value\":", "\"new_value\":")
    )

    # everconfirmed OF ZERO IS REMOVED RATHER THAN STORED
    if bug.everconfirmed == 0:
        del bug["everconfirmed"]
    # HARD-CODED OVERRIDE FOR ONE SPECIFIC BUG VERSION
    if bug.id == "692436_1336314345":
        bug.votes = 3

    # cf_last_resolved: NUMBERS BECOME long, ANYTHING ELSE IS PARSED AS
    # "%Y-%m-%d %H:%M:%S".  ANY FAILURE IS SILENTLY IGNORED, LEAVING THE VALUE AS IT WAS
    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass
예제 #6
0
def random_sample_of_bugs(settings):
    """
    REPEATEDLY PICK RANDOM NON-PRIVATE BUG IDS, RUN etl FOR THEM INTO A
    Fake_ES candidate, AND COMPARE candidate AGAINST THE reference INDEX

    settings - READS bugzilla, fake_es, reference, param.temp_dir AND
               param.alias_file

    THE LOOP ENDS ONLY WHEN compare_both REPORTS ERRORS.  AN EXCEPTION DURING
    A ROUND IS LOGGED AS A WARNING AND THE NEXT ROUND STARTS
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [
                b
                for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                if b not in private_bugs
            ]

            #SETUP RUN PARAMETERS
            param = Struct()
            # BUGS_TABLE_COLUMNS FIRST HOLDS THE COLUMN RECORDS (READ BACK FROM
            # param TO BUILD THE SQL LIST), THEN IS REDUCED TO JUST THE NAMES
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(
                db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(
                ["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS,
                                                "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            # RESTRICT THE RUN TO THE SAMPLED BUG IDS
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}",
                                          {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                #COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings,
                                            some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
            except Exception as e:
                # MESSAGE SPELLING FIXED ("faiure" -> "failure")
                D.warning("Total failure during compare of bugs {{bugs}}",
                          {"bugs": some_bugs}, e)
def main(settings):
    """
    FILL THE DESTINATION INDEX

    WHEN settings.source.filename IS SET: CREATE A NEW INDEX FROM THE SCHEMA
    FILE, LOAD IT WITH extract_from_file, THEN ADD THE ALIAS TO IT.
    OTHERWISE: REPLICATE FROM THE SOURCE ES INDEX EVERYTHING PENDING SINCE THE
    LAST UPDATE, AND RECORD THE TIME IN settings.param.last_replication_time
    """
    if settings.source.filename != None:
        # THE CONFIGURED INDEX NAME BECOMES THE ALIAS; THE REAL INDEX GETS A
        # NAME FROM ElasticSearch.proto_name
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)

        schema_json = File(settings.source.schema_filename).read()
        schema = CNV.JSON2object(schema_json)
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            dotted_json = CNV.object2JSON(schema).replace("attachments_", "attachments.")
            schema = CNV.JSON2object(dotted_json)

        index = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        # REFRESH INTERVAL IS -1 DURING THE LOAD AND 1 AFTERWARD
        index.set_refresh_interval(-1)
        extract_from_file(settings.source, index)
        index.set_refresh_interval(1)

        index.delete_all_but(settings.destination.alias, settings.destination.index)
        index.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source_es = ElasticSearch(settings.source)
    dest_es = get_or_create_index(settings["destination"], source_es)

    # LAST UPDATED IS THE MIN OF THE TIME ON FILE AND THE TIME FOUND IN ES,
    # FALLING BACK TO milli2datetime(0)
    marker = File(settings.param.last_replication_time)
    if marker.exists:
        from_file = CNV.milli2datetime(CNV.value2int(marker.read()))
    else:
        from_file = None
    from_es = get_last_updated(dest_es)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))

    # CAPTURED BEFORE REPLICATION STARTS; THIS IS THE TIME THAT GETS RECORDED
    started = datetime.utcnow()

    pending = get_pending(source_es, last_updated)
    with ThreadedQueue(dest_es, size=1000) as data_sink:
        replicate(source_es, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    marker.write(unicode(CNV.datetime2milli(started)))
def open_test_instance(name, settings):
    """
    RETURN AN INSTANCE TO TEST AGAINST

    name     - LABEL USED ONLY IN THE LOG MESSAGE
    settings - WHEN settings.filename IS SET A Fake_ES IS RETURNED; OTHERWISE
               THE INDEX IS DELETED AND RE-CREATED FROM settings.schema_file
    """
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)

    Log.note("Using ES cluster at {{host}} as {{type}}", {
        "host": settings.host,
        "type": name
    })

    # START FROM A CLEAN INDEX: DELETE FIRST, THEN CREATE FROM THE SCHEMA FILE
    ElasticSearch.delete_index(settings)

    schema_text = File(settings.schema_file).read()
    schema = CNV.JSON2object(schema_text, flexible=True, paths=True)
    return ElasticSearch.create_index(settings, schema, limit_replicas=True)
def extract_from_file(source_settings, destination):
    """
    READ source_settings.filename IN GROUPS OF BATCH_SIZE AND ADD EACH GROUP
    TO destination

    EVERY ITEM OF A GROUP IS PASSED THROUGH fix_json, CNV.JSON2object AND
    transform_bugzilla.normalize, THEN ADDED AS {"id": ..., "value": ...}.
    WHEN A GROUP FAILS, ITS CONTENT IS WRITTEN TO "Error_<group>.txt", A
    WARNING IS LOGGED, AND PROCESSING CONTINUES WITH THE NEXT GROUP
    """
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                # CONVERT THE WHOLE GROUP BEFORE ADDING ANY OF IT
                bugs = [
                    transform_bugzilla.normalize(CNV.JSON2object(fix_json(line)))
                    for line in d
                ]
                d2 = [{"id": b.id, "value": b} for b in bugs]
                destination.add(d2)
            except Exception as e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                # TEMPLATE NOW NAMES THE PARAMETER THAT IS ACTUALLY SUPPLIED
                # (IT USED {{host}}, WHICH IS NOT AMONG THE PARAMETERS)
                Log.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)
def loadAliases(settings):
    """
    LOAD THE ALIAS FILE NAMED BY settings.param.alias_file INTO aliases

    WHEN READING THE FILE RAISES, A WARNING IS LOGGED AND AN EMPTY JSON OBJECT
    IS USED INSTEAD.  EACH VALUE IS PASSED THROUGH struct.wrap BEFORE BEING
    STORED UNDER ITS KEY

    NOTE(review): aliases IS NOT DEFINED IN THIS BLOCK - PRESUMABLY A
    MODULE-LEVEL dict; CONFIRM
    """
    # NOTE(review): THE HANDLER FOR THIS OUTER try IS NOT VISIBLE IN THIS VIEW;
    # THE BLOCK APPEARS TO BE CUT SHORT
    try:
        try:
            # THE TIMER LABEL USES alias_file.path WHEN PRESENT, ELSE alias_file ITSELF
            with Timer(
                    "load alias file at {{filename}}", {
                        "filename":
                        nvl(settings.param.alias_file.path,
                            settings.param.alias_file)
                    }):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            # NOTE(review): THE MESSAGE OPENS "(" WITHOUT CLOSING IT, AND e IS
            # NOT PASSED TO THE WARNING
            Log.warning(
                "No alias file found (looking at {{filename}}", {
                    "filename":
                    nvl(settings.param.alias_file.path,
                        settings.param.alias_file)
                })
            # FALL BACK TO AN EMPTY JSON OBJECT
            alias_json = "{}"
        # aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
예제 #11
0
 def __init__(self, settings):
     """
     REMEMBER settings.filename AND LOAD ITS JSON CONTENT INTO self.data

     WHEN AN IOError IS RAISED WHILE LOADING, self.data STARTS AS AN EMPTY dict
     """
     self.filename = settings.filename
     try:
         content = File(self.filename).read()
         loaded = CNV.JSON2object(content)
     except IOError:
         # NOTHING READABLE AT THAT PATH: START EMPTY
         loaded = {}
     self.data = loaded
예제 #12
0
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
    else:
        # START ETL FROM BEGINNING, MAKE NEW INDEX
        last_run_time = 0
        if not es:
            # BUG VERSIONS
            schema = File(settings.es.schema_file).read()
            if transform_bugzilla.USE_ATTACHMENTS_DOT:
                schema = schema.replace("attachments_", "attachments\\.")
            schema = CNV.JSON2object(schema, paths=True)
            schema.settings = jsons.expand_dot(schema.settings)
            if not settings.es.alias:
                settings.es.alias = settings.es.index
                settings.es.index = ElasticSearch.proto_name(settings.es.alias)
            es = ElasticSearch.create_index(settings.es,
                                            schema,
                                            limit_replicas=True)

            # BUG COMMENTS
            comment_schema = File(settings.es_comments.schema_file).read()
            comment_schema = CNV.JSON2object(comment_schema, paths=True)
            comment_schema.settings = jsons.expand_dot(comment_schema.settings)
            if not settings.es_comments.alias:
                settings.es_comments.alias = settings.es_comments.index
                settings.es_comments.index = ElasticSearch.proto_name(
예제 #13
0
def normalize(bug, old_school=False):
    """
    NORMALIZE A COPY OF bug SO REPEATED RUNS PRODUCE COMPARABLE DOCUMENTS

    bug        - THE BUG VERSION RECORD; IT IS COPIED FIRST, THE ARGUMENT IS NOT EDITED
    old_school - WHEN TRUE, THE SHORT ATTACHMENT FLAG KEYS (isobsolete,
                 ispatch, isprivate) ARE NOT ADDED

    STEPS: BUILD id FROM bug_id AND modified_ts, SORT flags / attachments /
    changes, SCRUB WITH ElasticSearch.scrub, CONVERT NUMERIC_FIELDS, AND
    CONVERT THE DATE FIELDS WITH CNV.datetime2milli

    NOTE(review): NO return STATEMENT IS VISIBLE IN THIS VIEW; THE END OF THE
    FUNCTION MAY BE MISSING
    """
    bug = bug.copy()
    # id IS "<bug_id>_<modified_ts WITHOUT ITS LAST 3 CHARACTERS>"
    # NOTE(review): PRESUMABLY modified_ts IS IN MILLISECONDS, SO THIS KEEPS SECONDS - CONFIRM
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    #ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            # RENAME "attachments_" TO "attachments." THROUGHOUT, VIA THE JSON TEXT
            bug.attachments = CNV.JSON2object(
                CNV.object2JSON(bug.attachments).replace(
                    "attachments_", "attachments."))
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            # ITERATE A SNAPSHOT OF THE ITEMS BECAUSE KEYS ARE ADDED INSIDE THE LOOP
            for k, v in list(a.items()):
                if k.startswith("attachments") and (k.endswith("isobsolete")
                                                    or k.endswith("ispatch") or
                                                    k.endswith("isprivate")):
                    new_v = CNV.value2int(v)
                    # DROP THE FIRST 12 CHARACTERS ("attachments" PLUS ONE SEPARATOR)
                    new_k = k[12:]
                    # STORE THE int UNDER THE KEY WITH EVERY "." PRECEDED BY A BACKSLASH
                    a[k.replace(".", "\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace(
                "attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    #bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST

    # NUMERIC FIELDS: MULTI_FIELDS BECOME int LISTS, ZERO VALUES ARE REMOVED,
    # EVERYTHING ELSE BECOMES A NUMBER
    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None: continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                # A long WITH 12 OR 13 DIGITS IS KEPT AS IT IS
                # NOTE(review): PRESUMABLY ALREADY EPOCH MILLISECONDS - CONFIRM
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Convert to "2012/01/01 00:00:00.000"
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                # NOTE(review): "%M%:S%f" LOOKS LIKE A TYPO FOR "%M:%S%f" ("%:" IS
                # NOT A STANDARD strptime DIRECTIVE), AND THERE IS NO "." BEFORE %f;
                # CONFIRM AGAINST DATE_PATTERN_STRICT AND CNV.string2datetime
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M%:S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"),
                                        "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Convert "2012/01/01 00:00:00.000" to "2012-01-01"
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d"))
            # A STRING MATCHING NONE OF THE PATTERNS IS LEFT UNCHANGED
        except Exception, e:
            Log.error(
                "problem with converting date to milli (value={{value}})",
                {"value": bug[dateField]}, e)
        # NOTE(review): THE NEXT LINE LOOKS MISPLACED - ITS MESSAGE IS ABOUT
        # ALIASES, NOT DATES, AND AT THIS INDENT IT RUNS ON EVERY PASS OF THE
        # LOOP THAT REACHES IT; e IS ONLY ASSIGNED BY THE except ABOVE.  CONFIRM
        # WHERE IT BELONGS
        Log.error("Can not init aliases", e)


def saveAliases(settings):
    compressed = {
        email: details
        for email, details in aliases.iteritems() if details.canonical
    }

    #COMPARE WITH PREVIOUS ALIAS VERSION
    try:
        old_alias_json = File(settings.param.alias_file).read()
    except Exception, e:
        old_alias_json = "{}"
    old_aliases = {}
    for k, v in CNV.JSON2object(old_alias_json).iteritems():
        old_aliases[k] = struct.wrap(v)

    added = set(compressed.keys()) - set(old_aliases.keys())
    removed = set(old_aliases.keys()) - set(compressed.keys())
    common = set(compressed.keys()) & set(old_aliases.keys())

    changed = set()
    for c in common:
        if CNV.object2JSON(compressed[c], pretty=True) != CNV.object2JSON(
                old_aliases[c], pretty=True):
            changed.add(c)

    if added or removed or changed:
        alias_json = CNV.object2JSON(compressed, pretty=True)
        file = File(settings.param.alias_file)