def _scrub(r):
    """
    Recursively convert `r` into a normalized ("scrubbed") value.

    Rules, applied in this order:
    * None and "" become None
    * anything Math.is_number() accepts is passed through CNV.value2number()
    * strings are lower-cased
    * dicts (a Struct is replaced by its .dict first) get lower-cased keys and
      scrubbed values; entries whose value scrubs to None are dropped, and a
      dict with nothing left becomes None
    * any other object with __iter__ (a StructList is replaced by its .list
      first) becomes a list of its scrubbed members with None results
      dropped; an empty result becomes None, otherwise the list is returned
      through Q.sort() when that succeeds and unsorted when it does not
    * everything else is returned unchanged

    If anything raises, a warning is logged (the exception is handed to
    D.warning() as its third argument, as extract_from_file() does) and None
    is returned.
    """
    try:
        if r is None or r == "":
            return None
        elif Math.is_number(r):
            return CNV.value2number(r)
        elif isinstance(r, basestring):
            return r.lower()
        elif isinstance(r, dict):
            if isinstance(r, Struct):
                r = r.dict
            output = {}
            for k, v in r.items():
                v = _scrub(v)
                if v is not None:
                    output[k.lower()] = v
            if not output:
                return None
            return output
        elif hasattr(r, '__iter__'):
            if isinstance(r, StructList):
                r = r.list
            output = []
            for v in r:
                v = _scrub(v)
                if v is not None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                # Q.sort() FAILED; FALL BACK TO THE UNSORTED LIST
                return output
        else:
            return r
    except Exception as e:
        # BEST EFFORT: REPORT THE CAUSE, THEN TREAT THE VALUE AS MISSING
        D.warning("Can not scrub: {{json}}", {"json": r}, e)
        return None
Esempio n. 2
0
def _scrub(r):
    """
    Recursively convert `r` into a normalized ("scrubbed") value.

    Rules, applied in this order:
    * None and "" become None
    * anything Math.is_number() accepts is passed through CNV.value2number()
    * strings are lower-cased
    * dicts (a Struct is replaced by its .dict first) get lower-cased keys and
      scrubbed values; entries whose value scrubs to None are dropped, and a
      dict with nothing left becomes None
    * any other object with __iter__ (a StructList is replaced by its .list
      first) becomes a list of its scrubbed members with None results
      dropped; an empty result becomes None, otherwise the list is returned
      through Q.sort() when that succeeds and unsorted when it does not
    * everything else is returned unchanged

    If anything raises, a warning is logged (the exception is handed to
    D.warning() as its third argument, as extract_from_file() does) and None
    is returned.
    """
    try:
        if r is None or r == "":
            return None
        elif Math.is_number(r):
            return CNV.value2number(r)
        elif isinstance(r, basestring):
            return r.lower()
        elif isinstance(r, dict):
            if isinstance(r, Struct):
                r = r.dict
            output = {}
            for k, v in r.items():
                v = _scrub(v)
                if v is not None:
                    output[k.lower()] = v
            if not output:
                return None
            return output
        elif hasattr(r, '__iter__'):
            if isinstance(r, StructList):
                r = r.list
            output = []
            for v in r:
                v = _scrub(v)
                if v is not None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                # Q.sort() FAILED; FALL BACK TO THE UNSORTED LIST
                return output
        else:
            return r
    except Exception as e:
        # BEST EFFORT: REPORT THE CAUSE, THEN TREAT THE VALUE AS MISSING
        D.warning("Can not scrub: {{json}}", {"json": r}, e)
        return None
Esempio n. 3
0
def get_last_updated(es):
    """
    Return the maximum modified_ts found in `es`, as a datetime.

    A "statistical" facet over modified_ts is requested for all documents
    whose modified_ts is at or after the module-level `far_back`.  When the
    facet reports a count of 0, datetime.min is returned; otherwise the
    facet's max is converted with CNV.milli2datetime().

    Any exception is reported through D.error() together with the host and
    index taken from es.settings.
    """
    try:
        since_far_back = {
            "range": {"modified_ts": {"gte": CNV.datetime2milli(far_back)}}
        }
        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": since_far_back
            }},
            "from": 0,
            "size": 0,  # NO HITS REQUESTED, ONLY THE FACET
            "sort": [],
            "facets": {"0": {"statistical": {"field": "modified_ts"}}}
        }

        stats = es.search(query).facets["0"]
        if stats.count == 0:
            return datetime.min
        return CNV.milli2datetime(stats.max)
    except Exception as e:
        where = {"host": es.settings.host, "index": es.settings.index}
        D.error("Can not get_last_updated from {{host}}/{{index}}", where, e)
Esempio n. 4
0
def main(settings):
    """
    Run the ETL described by `settings`.

    File mode (settings.source.filename is not None): the configured
    destination index name becomes the alias, a new index named
    alias + UTC timestamp is created from the schema file, filled by
    extract_from_file(), every other index of that alias is deleted and the
    alias is added to the new index.

    Sync mode (otherwise): bugs modified in the source index since
    (last update of the destination - 7 days) are fetched in batches of at
    most BATCH_SIZE bug ids, transformed, and added to the destination.
    """
    if settings.source.filename is not None:
        # USE A FILE
        stamp = CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + stamp
        schema_text = File(settings.source.schema_filename).read()
        schema = CNV.JSON2object(schema_text)

        dest = ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)

    # pending IS IN {"bug_id":b, "count":c} FORM
    # MAIN ETL LOOP
    for _, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        batch_filter = {"and": [
            {"terms": {"bug_id": bugs}},
            {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
        ]}
        data = source.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": batch_filter
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })

        # TRANSFORM EVERY HIT FIRST, THEN WRAP EACH RESULT FOR destination.add()
        cleaned = [
            transform_bugzilla.normalize(transform_bugzilla.rename_attachments(hit))
            for hit in data.hits.hits
        ]
        destination.add([{"id": doc.id, "value": doc} for doc in cleaned])
Esempio n. 5
0
def normalize(bug):
    """
    Bring a bug snapshot into canonical form and return the cleaned copy.

    Steps:
    * bug.id is set to str(bug_id) + "_" + str(modified_ts) without its last
      three characters, and bug._id is cleared
    * flags, attachments (plus each attachment's flags) and changes are
      sorted so that diffs between runs stay as similar as possible
    * the bug is replaced by scrub(bug)
    * for every field in NUMERIC_FIELDS that has a value: MULTI_FIELDS are
      converted with CNV.value2intlist(), other fields equal to 0 are removed
    * deadline, cf_due_date and cf_last_resolved are converted to
      milliseconds with CNV.datetime2milli()

    The `bug` passed in is modified by the first two steps; everything after
    scrub() is applied to the scrubbed copy, so callers must use the returned
    value.  Date conversion failures are reported through D.error().
    """
    bug.id = str(bug.bug_id) + "_" + str(bug.modified_ts)[:-3]
    bug._id = None

    #ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments is not None:
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            a.flags = Q.sort(a.flags, "value")

    bug.changes = Q.sort(bug.changes, "field_name")

    #bug IS CONVERTED TO A 'CLEAN' COPY
    bug = scrub(bug)

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v is None:
            continue

        if f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif v == 0:
            del bug[f]

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v is None:
            continue
        try:
            if isinstance(v, datetime):
                bug[dateField] = CNV.datetime2milli(v)
            elif DATE_PATTERN_STRICT.match(v):
                # Example: bug 856732 (cf_last_resolved)
                # Presumably v looks like "2012/01/01 00:00:00.000" (TODO confirm
                # against DATE_PATTERN_STRICT); appending "000" pads the fraction
                # to the six digits of %f.
                # The format used to read "%H:%M%:S%f": "%:" is not a valid
                # directive and the "." before the fraction was missing.
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S.%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Presumably v looks like "2012/01/01 00:00:00" or
                # "2012-01-01 00:00:00"; "-" is replaced by "/" before parsing.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Only the first ten characters are parsed, as "2012-01-01"
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception as e:
            D.error("problem with converting date to milli (value={{value}})",
                    {"value": bug[dateField]}, e)

    # THE SCRUBBED COPY IS A NEW OBJECT; WITHOUT THIS ALL WORK ABOVE IS LOST
    return bug
 def octoberfest():
     """
     Generate a slow stream of "bottles of beer" verses.

     For every count from 99 down to 3: yield a padding line, sleep
     1.0 / RATE via Thread.sleep(), then yield the expanded verse passed
     through CNV.unicode2utf8().  The verses for 2 and 1 follow as fixed
     text, each preceded by a padding line and without a sleep.
     """
     padding = ("0" * 65535) + "\n"  # ENOUGH TO FILL THE INCOMING BUFFER
     verse = "{{num}} bottles of beer on the wall! {{num}} bottles of beer!  Take one down, pass it around! {{less}} bottles of beer on he wall!\n"
     closing_verses = (
         u"2 bottles of beer on the wall! 2 bottles of beer!  Take one down, pass it around! 1 bottle of beer on he wall!\n",
         u"1 bottle of beer on the wall! 1 bottle of beer!  Take one down, pass it around! 0 bottles of beer on he wall.\n",
     )

     for remaining in reversed(range(3, 100)):
         yield padding
         Thread.sleep(1.0 / RATE)
         text = expand_template(verse, {"num": remaining, "less": remaining - 1})
         yield CNV.unicode2utf8(text)

     for text in closing_verses:
         yield padding
         yield CNV.unicode2utf8(text)
Esempio n. 7
0
 def octoberfest():
     """
     Generate a slow stream of "bottles of beer" verses.

     For every count from 99 down to 3: yield a padding line, sleep
     1.0 / RATE via Thread.sleep(), then yield the expanded verse passed
     through CNV.unicode2utf8().  The verses for 2 and 1 follow as fixed
     text, each preceded by a padding line and without a sleep.
     """
     padding = ("0" * 65535) + "\n"  # ENOUGH TO FILL THE INCOMING BUFFER
     verse = "{{num}} bottles of beer on the wall! {{num}} bottles of beer!  Take one down, pass it around! {{less}} bottles of beer on he wall!\n"
     closing_verses = (
         u"2 bottles of beer on the wall! 2 bottles of beer!  Take one down, pass it around! 1 bottle of beer on he wall!\n",
         u"1 bottle of beer on the wall! 1 bottle of beer!  Take one down, pass it around! 0 bottles of beer on he wall.\n",
     )

     for remaining in reversed(range(3, 100)):
         yield padding
         Thread.sleep(1.0 / RATE)
         text = expand_template(verse, {"num": remaining, "less": remaining - 1})
         yield CNV.unicode2utf8(text)

     for text in closing_verses:
         yield padding
         yield CNV.unicode2utf8(text)
def normalize(bug):
    """
    Bring a bug snapshot into canonical form and return the cleaned copy.

    Steps:
    * bug.id is set to str(bug_id) + "_" + str(modified_ts) without its last
      three characters, and bug._id is cleared
    * flags, attachments (plus each attachment's flags) and changes are
      sorted so that diffs between runs stay as similar as possible
    * the bug is replaced by scrub(bug)
    * for every field in NUMERIC_FIELDS that has a value: MULTI_FIELDS are
      converted with CNV.value2intlist(), other fields equal to 0 are removed
    * deadline, cf_due_date and cf_last_resolved are converted to
      milliseconds with CNV.datetime2milli()

    The `bug` passed in is modified by the first two steps; everything after
    scrub() is applied to the scrubbed copy, so callers must use the returned
    value.  Date conversion failures are reported through D.error().
    """
    bug.id = str(bug.bug_id) + "_" + str(bug.modified_ts)[:-3]
    bug._id = None

    #ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments is not None:
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            a.flags = Q.sort(a.flags, "value")

    bug.changes = Q.sort(bug.changes, "field_name")

    #bug IS CONVERTED TO A 'CLEAN' COPY
    bug = scrub(bug)

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v is None:
            continue

        if f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif v == 0:
            del bug[f]

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v is None:
            continue
        try:
            if isinstance(v, datetime):
                bug[dateField] = CNV.datetime2milli(v)
            elif DATE_PATTERN_STRICT.match(v):
                # Example: bug 856732 (cf_last_resolved)
                # Presumably v looks like "2012/01/01 00:00:00.000" (TODO confirm
                # against DATE_PATTERN_STRICT); appending "000" pads the fraction
                # to the six digits of %f.
                # The format used to read "%H:%M%:S%f": "%:" is not a valid
                # directive and the "." before the fraction was missing.
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S.%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Presumably v looks like "2012/01/01 00:00:00" or
                # "2012-01-01 00:00:00"; "-" is replaced by "/" before parsing.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Only the first ten characters are parsed, as "2012-01-01"
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception as e:
            D.error("problem with converting date to milli (value={{value}})",
                    {"value": bug[dateField]}, e)

    # THE SCRUBBED COPY IS A NEW OBJECT; WITHOUT THIS ALL WORK ABOVE IS LOST
    return bug
Esempio n. 9
0
def get_last_updated(es):
    """
    Return the maximum modified_ts found in `es`, as a datetime.

    A "statistical" facet over modified_ts is requested for all documents
    whose modified_ts is at or after the module-level `far_back`.  When the
    facet reports a count of 0, datetime.min is returned; otherwise the
    facet's max is converted with CNV.milli2datetime().

    Any exception is reported through D.error() together with the host and
    index taken from es.settings.
    """
    try:
        since_far_back = {
            "range": {"modified_ts": {"gte": CNV.datetime2milli(far_back)}}
        }
        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": since_far_back
            }},
            "from": 0,
            "size": 0,  # NO HITS REQUESTED, ONLY THE FACET
            "sort": [],
            "facets": {"0": {"statistical": {"field": "modified_ts"}}}
        }

        stats = es.search(query).facets["0"]
        if stats.count == 0:
            return datetime.min
        return CNV.milli2datetime(stats.max)
    except Exception as e:
        where = {"host": es.settings.host, "index": es.settings.index}
        D.error("Can not get_last_updated from {{host}}/{{index}}", where, e)
Esempio n. 10
0
def get_pending(es, since):
    """
    Return a multiset of the bug_ids in `es` modified at or after `since`.

    A "terms" facet on bug_id (up to 200000 terms) is requested for documents
    whose modified_ts is >= CNV.datetime2milli(since).  If the facet comes
    back with 200000 terms or more, D.error() is called.  The facet terms are
    handed to multiset() keyed on "term" and counted by "count".
    """
    max_terms = 200000
    modified_since = {
        "range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}
    }
    bug_id_facet = {"terms": {"field": "bug_id", "size": max_terms}}

    result = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": modified_since
        }},
        "from": 0,
        "size": 0,  # NO HITS REQUESTED, ONLY THE FACET
        "sort": [],
        "facets": {"default": bug_id_facet}
    })

    terms = result.facets.default.terms
    if len(terms) >= max_terms:
        D.error("Can not handle more than 200K bugs changed")

    return multiset(terms, key_field="term", count_field="count")
Esempio n. 11
0
def extract_from_file(source_settings, destination):
    """
    Load the bugs stored in source_settings.filename into `destination`.

    The file is iterated in groups of BATCH_SIZE (via Q.groupby).  Each item
    of a group is passed through fix_json(), CNV.JSON2object() and
    transform_bugzilla.normalize(), then wrapped as {"id": ..., "value": ...}
    and handed to destination.add().

    A group that fails is written to a file named "Error_<random hex>.txt"
    and a warning naming the group and that file is logged; processing then
    continues with the next group.
    """
    with File(source_settings.filename).iter() as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                # TRANSFORM EVERY ITEM FIRST, THEN WRAP EACH RESULT
                bugs = [
                    transform_bugzilla.normalize(CNV.JSON2object(fix_json(line)))
                    for line in d
                ]
                destination.add([{"id": b.id, "value": b} for b in bugs])
            except Exception as e:
                filename = "Error_" + Random.hex(20) + ".txt"
                File(filename).write(d)
                # THE TEMPLATE USED TO REFERENCE {{host}}, WHICH IS NOT SUPPLIED
                D.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)
Esempio n. 12
0
def get_pending(es, since):
    """
    Return a multiset of the bug_ids in `es` modified at or after `since`.

    A "terms" facet on bug_id (up to 200000 terms) is requested for documents
    whose modified_ts is >= CNV.datetime2milli(since).  If the facet comes
    back with 200000 terms or more, D.error() is called.  The facet terms are
    handed to multiset() keyed on "term" and counted by "count".
    """
    max_terms = 200000
    modified_since = {
        "range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}
    }
    bug_id_facet = {"terms": {"field": "bug_id", "size": max_terms}}

    result = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": modified_since
        }},
        "from": 0,
        "size": 0,  # NO HITS REQUESTED, ONLY THE FACET
        "sort": [],
        "facets": {"default": bug_id_facet}
    })

    terms = result.facets.default.terms
    if len(terms) >= max_terms:
        D.error("Can not handle more than 200K bugs changed")

    return multiset(terms, key_field="term", count_field="count")
Esempio n. 13
0
def main(settings):
    """
    Run the ETL described by `settings`.

    File mode (settings.source.filename is not None): the configured
    destination index name becomes the alias, a new index named
    alias + UTC timestamp is created from the schema file, filled by
    extract_from_file(), every other index of that alias is deleted and the
    alias is added to the new index.

    Sync mode (otherwise): bugs modified in the source index since
    (last update of the destination - 7 days) are fetched in batches of at
    most BATCH_SIZE bug ids, transformed, and added to the destination.
    """
    if settings.source.filename is not None:
        # USE A FILE
        stamp = CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + stamp
        schema_text = File(settings.source.schema_filename).read()
        schema = CNV.JSON2object(schema_text)

        dest = ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)

    # pending IS IN {"bug_id":b, "count":c} FORM
    # MAIN ETL LOOP
    for _, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        batch_filter = {"and": [
            {"terms": {"bug_id": bugs}},
            {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
        ]}
        data = source.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": batch_filter
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })

        # TRANSFORM EVERY HIT FIRST, THEN WRAP EACH RESULT FOR destination.add()
        cleaned = [
            transform_bugzilla.normalize(transform_bugzilla.rename_attachments(hit))
            for hit in data.hits.hits
        ]
        destination.add([{"id": doc.id, "value": doc} for doc in cleaned])