Beispiel #1
0
    def test_datetime(self):

        result = convert.datetime2milli(datetime.datetime(2012, 7, 24))
        expected = 1343088000000
        assert result == expected

        result = convert.datetime2milli(datetime.date(2012, 7, 24))
        expected = 1343088000000
        assert result == expected

        result = convert.datetime2milli(datetime.datetime(2014, 1, 7, 10, 21, 0))
        expected = 1389090060000
        assert result == expected

        result = text(convert.datetime2milli(datetime.datetime(2014, 1, 7, 10, 21, 0)))
        expected = u"1389090060000"
        assert result == expected
Beispiel #2
0
    def test_datetime(self):

        result = convert.datetime2milli(datetime.datetime(2012, 07, 24))
        expected = 1343088000000
        assert result == expected

        result = convert.datetime2milli(datetime.date(2012, 07, 24))
        expected = 1343088000000
        assert result == expected

        result = convert.datetime2milli(datetime.datetime(2014, 01, 07, 10, 21, 00))
        expected = 1389090060000
        assert result == expected

        result = unicode(convert.datetime2milli(datetime.datetime(2014, 01, 07, 10, 21, 00)))
        expected = u"1389090060000"
        assert result == expected
Beispiel #3
0
def value2value(value):
    """
    CONVERT FROM PYTHON VALUE TO ES EQUIVALENT
    """
    if isinstance(value, datetime):
        return convert.datetime2milli(value)
    if isinstance(value, Duration):
        return value.milli  # DURATION
    return value
Beispiel #4
0
def value2query(value):
    if isinstance(value, datetime):
        return convert.datetime2milli(value)
    if isinstance(value, Duration):
        return value.milli

    if Math.is_number(value):
        return value
    return convert.string2quote(value)
Beispiel #5
0
def value2value(value):
    """
    CONVERT FROM PYTHON VALUE TO ES EQUIVALENT
    """
    if isinstance(value, datetime):
        return convert.datetime2milli(value)
    if isinstance(value, Duration):
        return value.milli    # DURATION
    return value
Beispiel #6
0
def value2query(value):
    if isinstance(value, datetime):
        return convert.datetime2milli(value)
    if isinstance(value, Duration):
        return value.milli

    if Math.is_number(value):
        return value
    return convert.string2quote(value)
Beispiel #7
0
def full_etl(settings):
    schema = convert.json2value(convert.value2json(SCHEMA), leaves=True)
    Cluster(settings.destination).get_or_create_index(settings=settings.destination, schema=schema, limit_replicas=True)
    destq = FromES(settings.destination)
    if settings.incremental:
        min_bug_id = destq.query({
            "from": coalesce(settings.destination.alias, settings.destination.index),
            "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
        })

        min_bug_id = int(MAX(min_bug_id-1000, 0))
    else:
        min_bug_id = 0

    sourceq = FromES(settings.source)
    max_bug_id = sourceq.query({
        "from": coalesce(settings.source.alias, settings.source.index),
        "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
    }) + 1
    max_bug_id = int(coalesce(max_bug_id, 0))

    # FIRST, GET ALL MISSING BUGS
    for s, e in qb.reverse(list(qb.intervals(min_bug_id, max_bug_id, 10000))):
        with Timer("pull {{start}}..{{end}} from ES", {"start": s, "end": e}):
            children = sourceq.query({
                "from": settings.source.alias,
                "select": ["bug_id", "dependson", "blocked", "modified_ts", "expires_on"],
                "where": {"and": [
                    {"range": {"bug_id": {"gte": s, "lt": e}}},
                    {"or": [
                        {"exists": "dependson"},
                        {"exists": "blocked"}
                    ]}
                ]},
                "limit": 10000
            })

        with Timer("fixpoint work"):
            to_fix_point(settings, destq, children.data)

    # PROCESS RECENT CHANGES
    with Timer("pull recent dependancies from ES"):
        children = sourceq.query({
            "from": settings.source.alias,
            "select": ["bug_id", "dependson", "blocked"],
            "where": {"and": [
                {"range": {"modified_ts": {"gte": convert.datetime2milli(datetime.utcnow() - timedelta(days=7))}}},
                {"or": [
                    {"exists": "dependson"},
                    {"exists": "blocked"}
                ]}
            ]},
            "limit": 100000
        })

    to_fix_point(settings, destq, children.data)
Beispiel #8
0
def value2MVEL(value):
    """
    FROM PYTHON VALUE TO MVEL EQUIVALENT
    """
    if isinstance(value, datetime):
        return str(convert.datetime2milli(value)) + " /*" + value.format("yyNNNdd HHmmss") + "*/"        # TIME
    if isinstance(value, Duration):
        return str(convert.timedelta2milli(value)) + " /*" + str(value) + "*/"    # DURATION

    if Math.is_number(value):
        return str(value)
    return convert.string2quote(value)
def value2MVEL(value):
    """
    FROM PYTHON VALUE TO MVEL EQUIVALENT
    """
    if isinstance(value, datetime):
        return str(convert.datetime2milli(value)) + " /*" + value.format("yyNNNdd HHmmss") + "*/"        # TIME
    if isinstance(value, Duration):
        return str(convert.timedelta2milli(value)) + " /*" + str(value) + "*/"    # DURATION

    if Math.is_number(value):
        return str(value)
    return quote(value)
def get_pending(es, since):
    result = es.search({
        "query": {"match_all": {}},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"statistical": {"field": "bug_id"}}}
    })

    max_bug = int(result.facets.default.max)
    pending_bugs = None

    for s, e in qb.intervals(0, max_bug + 1, 100000):
        Log.note("Collect history for bugs from {{start}}..{{end}}", {"start": s, "end": e})
        result = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"range": {"modified_ts": {"gte": convert.datetime2milli(since)}}},
                    {"range": {"bug_id": {"gte": s, "lte": e}}}
                ]}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
        })

        temp = Multiset(
            result.facets.default.terms,
            key_field="term",
            count_field="count"
        )

        if pending_bugs is None:
            pending_bugs = temp
        else:
            pending_bugs = pending_bugs + temp

    Log.note("Source has {{num}} bug versions for updating", {
        "num": len(pending_bugs)
    })
    return pending_bugs
def get_last_updated(es):
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {
                    "range": {
                        "modified_ts": {"gte": convert.datetime2milli(far_back)}
                    }}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"modified_ts": {"statistical": {"field": "modified_ts"}}}
        })

        if results.facets.modified_ts.count == 0:
            return convert.milli2datetime(0)
        return convert.milli2datetime(results.facets.modified_ts.max)
    except Exception, e:
        return convert.milli2datetime(0)
def main(settings):
    current_time = datetime.utcnow()
    time_file = File(settings.param.last_replication_time)

    # SYNCH WITH source ES INDEX
    source = Index(settings.source)
    destination = Cluster(settings.destination).get_or_create_index(settings.destination)

    # GET LAST UPDATED
    from_file = None
    if time_file.exists:
        from_file = convert.milli2datetime(convert.value2int(time_file.read()))
    from_es = get_last_updated(destination) - timedelta(hours=1)
    last_updated = MIN(coalesce(from_file, convert.milli2datetime(0)), from_es)
    Log.note("updating records with modified_ts>={{last_updated}}", {"last_updated": last_updated})

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, batch_size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(convert.datetime2milli(current_time)))
Beispiel #13
0
def full_etl(settings, sink, bugs):
    with Timer("process block {{start}}", {"start": min(bugs)}):
        es = elasticsearch.Index(settings.source)
        with FromES(es) as esq:
            versions = esq.query({
                "from": "bugs",
                "select": "*",
                "where": {"terms": {"bug_id": bugs}}
            })

        starts = qb.run({
            "select": [
                "bug_id",
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "request_time", "value": "modified_ts"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "created_by", "value": "attachments.created_by"},
                "product",
                "component"
            ],
            "from":
                versions,
            "where":
                {"and": [
                    {"terms": {"attachments.flags.request_status": ["?"]}},
                    {"terms": {"attachments.flags.request_type": TYPES}},
                    {"equal": ["attachments.flags.modified_ts", "modified_ts"]},
                    {"term": {"attachments.isobsolete": 0}}
                ]},
            "sort": ["bug_id", "attach_id", "created_by"]
        })

        ends = qb.run({
            "select": [
                {"name": "bug_id", "value": "bug_id"},
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "modified_ts", "value": lambda r: Math.max(r.modified_ts, r.attachments.modified_ts, r.attachments.flags.modified_ts)},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "modified_by", "value": "attachments.flags.modified_by"},
                {"name": "product", "value": "product"},
                {"name": "component", "value": "component"},
                {"name": "review_end_reason", "value": lambda r: 'done' if r.attachments.flags.request_status != '?' else ('obsolete' if r.attachments.isobsolete == 1 else 'closed')},
                {"name": "review_result", "value": lambda r: '+' if r.attachments.flags.request_status == '+' else ('-' if r.attachments.flags.request_status == '-' else '?')}
            ],
            "from":
                versions,
            "where":
                {"and": [
                    {"terms": {"attachments.flags.request_type": TYPES}},
                    {"or": [
                        {"and": [# IF THE REQUESTEE SWITCHED THE ? FLAG, THEN IT IS DONE
                            {"term": {"attachments.flags.previous_status": "?"}},
                            {"not": {"term": {"attachments.flags.request_status": "?"}}},
                            {"equal": ["attachments.flags.modified_ts", "modified_ts"]}
                        ]},
                        {"and": [# IF OBSOLETED THE ATTACHMENT, IT IS DONE
                            {"term": {"attachments.isobsolete": 1}},
                            {"term": {"previous_values.isobsolete_value": 0}}
                        ]},
                        {"and": [# SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
                            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
                            {"not": {"terms": {"previous_values.bug_status_value": ["resolved", "verified", "closed"]}}}
                        ]}
                    ]}
                ]}
        })

        # SOME ATTACHMENTS GO MISSING, CLOSE THEM TOO
        closed_bugs = {b.bug_id: b for b in qb.filter(versions, {"and": [# SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
            {"range": {"expires_on": {"gte": Date.now().milli}}}
        ]})}

        for s in starts:
            if s.bug_id in closed_bugs:
                e = closed_bugs[s.bug_id]
                ends.append({
                    "bug_id": e.bug_id,
                    "bug_status": e.bug_status,
                    "attach_id": s.attach_id,
                    "modified_ts": e.modified_ts,
                    "reviewer": s.reviewer,
                    "request_type": s.request_type,
                    "modified_by": e.modified_by,
                    "product": e.product,
                    "component": e.component,
                    "review_end_reason": 'closed',
                    "review_result": '?'
                })

        # REVIEWS END WHEN REASSIGNED TO SOMEONE ELSE
        changes = qb.run({
            "select": [
                "bug_id",
                {"name": "attach_id", "value": "changes.attach_id"},
                "modified_ts",
                {"name": "reviewer", "value": lambda r: r.changes.old_value.split("?")[1]},
                {"name": "request_type", "value": lambda r: r.changes.old_value.split("?")[0]},
                {"name": "modified_by", "value": "null"},
                "product",
                "component",
                {"name": "review_end_reason", "value": "'reassigned'"}
            ],
            "from":
                versions,
            "where":
                {"and": [# ONLY LOOK FOR NAME CHANGES IN THE "review?" FIELD
                    {"term": {"changes.field_name": "flags"}},
                    {"or": [{"prefix": {"changes.old_value": t + "?"}} for t in TYPES]}
                ]}
        })

        ends.extend(changes)

    # PYTHON VERSION NOT CAPABLE OF THIS JOIN, YET
    # reviews = qb.run({
    #     "from":
    #         starts,
    #     "select": [
    #         {"name": "bug_status", "value": "bug_status", "aggregate": "one"},
    #         {"name": "review_time", "value": "doneReview.modified_ts", "aggregate": "minimum"},
    #         {"name": "review_result", "value": "doneReview.review_result", "aggregate": "minimum"},
    #         {"name": "product", "value": "coalesce(doneReview.product, product)", "aggregate": "minimum"},
    #         {"name": "component", "value": "coalesce(doneReview.component, component)", "aggregate": "minimum"},
    #         # {"name": "keywords", "value": "(coalesce(keywords, '')+' '+ETL.parseWhiteBoard(whiteboard)).trim()+' '+flags", "aggregate": "one"},
    #         {"name": "requester_review_num", "value": "-1", "aggregate": "one"}
    #     ],
    #     "analytic": [
    #         {"name": "is_first", "value": "rownum==0 ? 1 : 0", "sort": "request_time", "edges": ["bug_id"]}
    #     ],
    #     "edges": [
    #         "bug_id",
    #         "attach_id",
    #         {"name": "reviewer", "value": "requestee"},
    #         {"name": "requester", "value": "created_by"},
    #         {"name": "request_time", "value": "modified_ts"},
    #         {
    #             "name": "doneReview",
    #             "test":
    #                 "bug_id==doneReview.bug_id && " +
    #                 "attach_id==doneReview.attach_id && " +
    #                 "requestee==doneReview.requestee && " +
    #                 "!(bug_status=='closed' && doneReview.review_end_reason=='closed') && " +
    #                 "modified_ts<=doneReview.modified_ts",
    #             "allowNulls": True,
    #             "domain": {"type": "set", "key":["bug_id", "attach_id", "requestee", "modified_ts"], "partitions": ends}
    #         }
    #     ]
    # })

    with Timer("match starts and ends for block {{start}}", {"start":min(*bugs)}):
        reviews = []
        ends = Index(data=ends, keys=["bug_id", "attach_id", "request_type", "reviewer"])

        for g, s in qb.groupby(starts, ["bug_id", "attach_id", "request_type", "reviewer"]):
            start_candidates = qb.sort(s, {"value": "request_time", "sort": 1})
            end_candidates = qb.sort(ends[g], {"value": "modified_ts", "sort": 1})

            #ZIP, BUT WITH ADDED CONSTRAINT s.modified_ts<=e.modified_ts
            if len(start_candidates) > 1:
                Log.note("many reviews on one attachment")
            ei = 0
            for i, s in enumerate(start_candidates):
                while ei < len(end_candidates) and end_candidates[ei].modified_ts < coalesce(s.request_time, convert.datetime2milli(Date.MAX)):
                    ei += 1
                e = end_candidates[ei]

                s.review_time = e.modified_ts
                s.review_duration = e.modified_ts - s.request_time
                s.review_result = e.review_result
                s.review_end_reason = e.review_end_reason
                s.product = coalesce(e.product, s.product)
                s.component = coalesce(e.component, s.component)
                s.requester_review_num = -1
                ei += 1

                if s.bug_status == 'closed' and e.review_end_reason == 'closed':
                    #reviews on closed bugs are ignored
                    continue
                reviews.append(s)

        qb.run({
            "from": reviews,
            "window": [{
                "name": "is_first",
                "value": "rownum == 0",
                "edges": ["bug_id"],
                "sort": ["request_time"],
                "aggregate": "none"
            }]
        })

    with Timer("add {{num}} reviews to ES for block {{start}}", {"start": min(*bugs), "num": len(reviews)}):
        sink.extend({"json": convert.value2json(r)} for r in reviews)
def replicate(source, destination, pending, last_updated):
    """
    COPY THE DEPENDENCY RECORDS TO THE destination
    NOTE THAT THE PUBLIC CLUSTER HAS HOLES, SO WE USE blocked TO FILL THEM
    """
    for g, bugs in qb.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"expires_on": {"gte": convert.datetime2milli(last_updated)}}},
                        {"or": [
                            {"exists": {"field": "dependson"}},
                            {"exists": {"field": "blocked"}}
                        ]}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id", "modified_ts", "expires_on", "dependson", "blocked"]
            })

            with Timer("Push to destination"):
                d2 = [
                    {
                        "id": str(x.bug_id) + "_" + str(x.modified_ts)[:-3],
                        "value": {
                            "bug_id": x.bug_id,
                            "modified_ts": x.modified_ts,
                            "expires_on": x.expires_on,
                            "dependson": x.dependson
                        }
                    }
                    for x in data.hits.hits.fields
                    if x.dependson
                ]
                destination.extend(d2)

            with Timer("filter"):
                d4 = qb.run({
                    "from": data.hits.hits.fields,
                    "where": {"exists": {"field": "blocked"}}
                })

            with Timer("select"):
                d3 = qb.run({
                    "from": d4,
                    "select": [
                        {"name": "bug_id", "value": "blocked."},  # SINCE blocked IS A LIST, ARE SELECTING THE LIST VALUES, AND EFFECTIVELY PERFORMING A JOIN
                        "modified_ts",
                        "expires_on",
                        {"name": "dependson", "value": "bug_id"}
                    ]
                })

            with Timer("Push to destination"):
                destination.extend([
                    {
                        "id": str(x.bug_id) + "_" + str(x.dependson) + "_" + str(x.modified_ts)[:-3],
                        "value": x
                    }
                    for x in d3
                    if x.dependson
                ])