Example #1
def update_repo(repo, settings):
    with MySQL(settings.database) as db:
        try:
            pull_repo(repo)

            # GET LATEST DATE
            existing_range = db.query("""
                        SELECT
                            max(`date`) `max`,
                            min(`date`) `min`,
                            min(revision) min_rev,
                            max(revision) max_rev
                        FROM
                            changesets
                        WHERE
                            repos={{repos}}
                    """, {"repos": repo.name})[0]

            ranges = wrap([
                {"min": coalesce(existing_range.max, convert.milli2datetime(0)) + timedelta(days=1)},
                {"max": existing_range.min}
            ])

            for r in ranges:
                for g, docs in qb.groupby(get_changesets(date_range=r, repo=repo), size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]

                    db.insert_list("changesets", docs)
                    db.flush()

            missing_revisions = find_holes(db, "changesets", "revision", {"min": 0, "max": existing_range.max_rev + 1}, {"term": {"repos": repo.name}})
            for _range in missing_revisions:
                for g, docs in qb.groupby(get_changesets(revision_range=_range, repo=repo), size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]

                    db.insert_list("changesets", docs)
                    db.flush()

        except Exception as e:
            Log.warning("Failure to pull from {{repos.name}}", {"repos": repo}, e)
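
A note on the batching idiom: most of these examples lean on qb.groupby(iterable, size=n) to slice a stream into fixed-size batches. A rough sketch of that contract in plain Python (assuming, as the call sites suggest, that it yields (batch_index, batch) pairs):

def groupby_size(iterable, size):
    # Yield (batch_index, batch) pairs, each batch holding up to `size` items.
    batch, index = [], 0
    for item in iterable:
        batch.append(item)
        if len(batch) == size:
            yield index, batch
            index += 1
            batch = []
    if batch:
        yield index, batch

# The insert loop above is then roughly:
#   for g, docs in groupby_size(get_changesets(date_range=r, repo=repo), 100):
#       db.insert_list("changesets", docs)
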
Example #2
def log_loop(settings, synch, queue, bucket, please_stop):
    with aws.Queue(settings.work_queue) as work_queue:
        for i, g in qb.groupby(queue, size=settings.param.size):
            Log.note(
                "Preparing {{num}} pulse messages to bucket={{bucket}}",
                num=len(g),
                bucket=bucket.name
            )

            full_key = unicode(synch.next_key) + ":" + unicode(MIN(g.select("_meta.count")))
            try:
                output = [
                    set_default(
                        d,
                        {"etl": {
                            "name": "Pulse block",
                            "bucket": settings.destination.bucket,
                            "timestamp": Date.now().unix,
                            "id": synch.next_key,
                            "source": {
                                "name": "pulse.mozilla.org",
                                "id": d._meta.count,
                                "count": d._meta.count,
                                "message_id": d._meta.message_id,
                                "sent": Date(d._meta.sent),
                            },
                            "type": "aggregation"
                        }}
                    )
                    for i, d in enumerate(g)
                    if d != None  # HAPPENS WHEN PERSISTENT QUEUE FAILS TO LOG start
                ]
                bucket.write(full_key, "\n".join(convert.value2json(d) for d in output))
                synch.advance()
                synch.source_key = MAX(g.select("_meta.count")) + 1

                now = Date.now()
                work_queue.add({
                    "bucket": bucket.name,
                    "key": full_key,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

                synch.ping()
                queue.commit()
                Log.note("Wrote {{num}} pulse messages to bucket={{bucket}}, key={{key}}",
                    num=len(g),
                    bucket=bucket.name,
                    key=full_key
                )
            except Exception as e:
                queue.rollback()
                if not queue.closed:
                    Log.warning("Problem writing {{key}} to S3", key=full_key, cause=e)

            if please_stop:
                break
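
The control flow in log_loop is a standard at-least-once pipeline: write the batch to S3, enqueue a pointer to it, and only then commit the source queue; any failure rolls back so the same batch is redelivered. A minimal sketch of the pattern (the queue class is a hypothetical stand-in for the persistent pulse queue):

class ProvisionalQueue(object):
    # Hypothetical: reads are provisional until commit(); rollback() leaves
    # the cursor alone so the same batch is re-read.
    def __init__(self, batches):
        self.batches = list(batches)
        self.cursor = 0
    def __iter__(self):
        while self.cursor < len(self.batches):
            yield self.batches[self.cursor]
    def commit(self):
        self.cursor += 1
    def rollback(self):
        pass

def drain(queue, write_batch):
    # At-least-once delivery: commit only after a successful write.
    for batch in queue:
        try:
            write_batch(batch)   # e.g. bucket.write(key, json) then work_queue.add(...)
            queue.commit()
        except Exception:
            queue.rollback()
            break                # give up for now; the batch survives for a retry
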
Example #3
def pull_from_es(settings, destq, all_parents, all_children, all_descendants, work_queue):
    # LOAD PARENTS FROM ES

    for g, r in qb.groupby(qb.sort(work_queue), size=100):
        result = destq.query({
            "from": settings.destination.index,
            "select": "*",
            "where": {"terms": {"bug_id": r}}
        })
        for rec in result.data:
            all_parents.extend(rec.bug_id, listwrap(rec.parents))
            all_children.extend(rec.bug_id, listwrap(rec.children))
            all_descendants.extend(rec.bug_id, listwrap(rec.descendants))
Example #4
    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                Thread.sleep(seconds=1)
                messages = wrap(self.queue.pop_all())
                if messages:
                    for m in messages:
                        m.value.params = leafer(m.value.params)
                        m.value.error = leafer(m.value.error)
                    for g, mm in qb.groupby(messages, size=self.batch_size):
                        self.es.extend(mm)
                    bad_count = 0
            except Exception as e:
                Log.warning("Problem inserting logs into ES", cause=e)
                bad_count += 1
                if bad_count > 5:
                    break
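
The error handling in _insert_loop is worth calling out: bad_count resets on every successful pass, so the loop only gives up after more than five consecutive failures, not five failures total. Distilled into a reusable sketch (names are illustrative):

import time

def run_tolerantly(step, max_consecutive_failures=5, pause=1):
    # Retry while progress is being made; a success resets the failure
    # budget, so only an unbroken run of failures stops the loop.
    bad_count = 0
    while True:
        try:
            time.sleep(pause)
            step()
            bad_count = 0
        except Exception:
            bad_count += 1
            if bad_count > max_consecutive_failures:
                break
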
Example #5
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA   {"salt":s, "length":l, "data":d}
    """
    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")

    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Dict()
    output.type = "AES256"
    output.salt = convert.bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in qb.groupby(data, size=16):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = convert.bytes2base64(encrypted)
    json = convert.value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
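
A round trip, mirroring what the DEBUG branch already verifies (the 32-byte key is an assumption implied by KeyExpander(256); Random.bytes is the same helper used for the salt):

key = Random.bytes(32)                  # AES-256 implies a 256-bit key
blob = encrypt(u"attack at dawn", key)  # JSON: {"type", "salt", "length", "data"}
assert decrypt(blob, key) == u"attack at dawn"
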
Example #6
def decrypt(data, _key):
    """
    ACCEPT JSON OF ENCRYPTED DATA  {"salt":s, "length":l, "data":d}
    """
    # Key and iv have not been generated or provided, bail out
    if _key is None:
        Log.error("Expecting a key")

    _input = convert.json2value(data)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(convert.base642bytearray(_input.salt))

    raw = convert.base642bytearray(_input.data)
    out_data = bytearray()
    for _, e in qb.groupby(raw, size=16):
        out_data.extend(aes_cbc_256.decrypt_block(e))

    return str(out_data[:_input.length]).decode("utf8")
Example #7
    def _load_all_in_push(self, revision, locale=None):
        # http://hg.mozilla.org/mozilla-central/json-pushes?full=1&changeset=57c461500a0c
        found_revision = copy(revision)
        if isinstance(found_revision.branch, basestring):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
            if not b:
                Log.error("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)
        if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
            self.branches = _hg_branches.get_branches(use_cache=True, settings=self.settings)

        url = found_revision.branch.url.rstrip("/") + "/json-pushes?full=1&changeset=" + found_revision.changeset.id
        Log.note(
            "Reading pushlog for revision ({{branch}}, {{locale}}, {{changeset}}): {{url}}",
            branch=found_revision.branch.name,
            locale=locale,
            changeset=found_revision.changeset.id,
            url=url
        )

        try:
            data = self._get_and_retry(url, found_revision.branch)

            revs = []
            output = None
            for index, _push in data.items():
                push = Push(id=int(index), date=_push.date, user=_push.user)

                for _, ids in qb.groupby(_push.changesets.node, size=200):
                    url_param = "&".join("node=" + c[0:12] for c in ids)

                    url = found_revision.branch.url.rstrip("/") + "/json-info?" + url_param
                    Log.note("Reading details from {{url}}", {"url": url})

                    raw_revs = self._get_and_retry(url, found_revision.branch)
                    for r in raw_revs.values():
                        rev = Revision(
                            branch=found_revision.branch,
                            index=r.rev,
                            changeset=Changeset(
                                id=r.node,
                                id12=r.node[0:12],
                                author=r.user,
                                description=r.description,
                                date=Date(r.date),
                                files=r.files
                            ),
                            parents=unwraplist(r.parents),
                            children=unwraplist(r.children),
                            push=push,
                            etl={"timestamp": Date.now().unix}
                        )
                        if r.node == found_revision.changeset.id:
                            output = rev
                        if r.node[0:12] == found_revision.changeset.id[0:12]:
                            output = rev
                        _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
                        revs.append({"id": _id, "value": rev})
            self.es.extend(revs)
            return output
        except Exception as e:
            Log.error("Problem pulling pushlog from {{url}}", url=url, cause=e)
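
The inner groupby over _push.changesets.node exists to cap URL length: json-info takes repeated node= parameters, so changesets are fetched 200 per request, each id truncated to its 12-character short form. Roughly (with illustrative ids):

nodes = ["57c461500a0c" + "0" * 28, "9f8e7d6c5b4a" + "1" * 28]
for start in range(0, len(nodes), 200):
    ids = nodes[start:start + 200]
    url_param = "&".join("node=" + c[0:12] for c in ids)
    url = "http://hg.mozilla.org/mozilla-central/json-info?" + url_param
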
Example #8
def full_etl(settings, sink, bugs):
    with Timer("process block {{start}}", {"start": min(bugs)}):
        es = elasticsearch.Index(settings.source)
        with FromES(es) as esq:
            versions = esq.query({
                "from": "bugs",
                "select": "*",
                "where": {"terms": {"bug_id": bugs}}
            })

        starts = qb.run({
            "select": [
                "bug_id",
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "request_time", "value": "modified_ts"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "created_by", "value": "attachments.created_by"},
                "product",
                "component"
            ],
            "from":
                versions,
            "where":
                {"and": [
                    {"terms": {"attachments.flags.request_status": ["?"]}},
                    {"terms": {"attachments.flags.request_type": TYPES}},
                    {"equal": ["attachments.flags.modified_ts", "modified_ts"]},
                    {"term": {"attachments.isobsolete": 0}}
                ]},
            "sort": ["bug_id", "attach_id", "created_by"]
        })

        ends = qb.run({
            "select": [
                {"name": "bug_id", "value": "bug_id"},
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "modified_ts", "value": lambda r: Math.max(r.modified_ts, r.attachments.modified_ts, r.attachments.flags.modified_ts)},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "modified_by", "value": "attachments.flags.modified_by"},
                {"name": "product", "value": "product"},
                {"name": "component", "value": "component"},
                {"name": "review_end_reason", "value": lambda r: 'done' if r.attachments.flags.request_status != '?' else ('obsolete' if r.attachments.isobsolete == 1 else 'closed')},
                {"name": "review_result", "value": lambda r: '+' if r.attachments.flags.request_status == '+' else ('-' if r.attachments.flags.request_status == '-' else '?')}
            ],
            "from":
                versions,
            "where":
                {"and": [
                    {"terms": {"attachments.flags.request_type": TYPES}},
                    {"or": [
                        {"and": [# IF THE REQUESTEE SWITCHED THE ? FLAG, THEN IT IS DONE
                            {"term": {"attachments.flags.previous_status": "?"}},
                            {"not": {"term": {"attachments.flags.request_status": "?"}}},
                            {"equal": ["attachments.flags.modified_ts", "modified_ts"]}
                        ]},
                        {"and": [# IF OBSOLETED THE ATTACHMENT, IT IS DONE
                            {"term": {"attachments.isobsolete": 1}},
                            {"term": {"previous_values.isobsolete_value": 0}}
                        ]},
                        {"and": [# SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
                            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
                            {"not": {"terms": {"previous_values.bug_status_value": ["resolved", "verified", "closed"]}}}
                        ]}
                    ]}
                ]}
        })

        # SOME ATTACHMENTS GO MISSING, CLOSE THEM TOO
        closed_bugs = {b.bug_id: b for b in qb.filter(versions, {"and": [# SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
            {"range": {"expires_on": {"gte": Date.now().milli}}}
        ]})}

        for s in starts:
            if s.bug_id in closed_bugs:
                e = closed_bugs[s.bug_id]
                ends.append({
                    "bug_id": e.bug_id,
                    "bug_status": e.bug_status,
                    "attach_id": s.attach_id,
                    "modified_ts": e.modified_ts,
                    "reviewer": s.reviewer,
                    "request_type": s.request_type,
                    "modified_by": e.modified_by,
                    "product": e.product,
                    "component": e.component,
                    "review_end_reason": 'closed',
                    "review_result": '?'
                })

        # REVIEWS END WHEN REASSIGNED TO SOMEONE ELSE
        changes = qb.run({
            "select": [
                "bug_id",
                {"name": "attach_id", "value": "changes.attach_id"},
                "modified_ts",
                {"name": "reviewer", "value": lambda r: r.changes.old_value.split("?")[1]},
                {"name": "request_type", "value": lambda r: r.changes.old_value.split("?")[0]},
                {"name": "modified_by", "value": "null"},
                "product",
                "component",
                {"name": "review_end_reason", "value": "'reassigned'"}
            ],
            "from":
                versions,
            "where":
                {"and": [# ONLY LOOK FOR NAME CHANGES IN THE "review?" FIELD
                    {"term": {"changes.field_name": "flags"}},
                    {"or": [{"prefix": {"changes.old_value": t + "?"}} for t in TYPES]}
                ]}
        })

        ends.extend(changes)

    # PYTHON VERSION NOT CAPABLE OF THIS JOIN, YET
    # reviews = qb.run({
    #     "from":
    #         starts,
    #     "select": [
    #         {"name": "bug_status", "value": "bug_status", "aggregate": "one"},
    #         {"name": "review_time", "value": "doneReview.modified_ts", "aggregate": "minimum"},
    #         {"name": "review_result", "value": "doneReview.review_result", "aggregate": "minimum"},
    #         {"name": "product", "value": "coalesce(doneReview.product, product)", "aggregate": "minimum"},
    #         {"name": "component", "value": "coalesce(doneReview.component, component)", "aggregate": "minimum"},
    #         # {"name": "keywords", "value": "(coalesce(keywords, '')+' '+ETL.parseWhiteBoard(whiteboard)).trim()+' '+flags", "aggregate": "one"},
    #         {"name": "requester_review_num", "value": "-1", "aggregate": "one"}
    #     ],
    #     "analytic": [
    #         {"name": "is_first", "value": "rownum==0 ? 1 : 0", "sort": "request_time", "edges": ["bug_id"]}
    #     ],
    #     "edges": [
    #         "bug_id",
    #         "attach_id",
    #         {"name": "reviewer", "value": "requestee"},
    #         {"name": "requester", "value": "created_by"},
    #         {"name": "request_time", "value": "modified_ts"},
    #         {
    #             "name": "doneReview",
    #             "test":
    #                 "bug_id==doneReview.bug_id && " +
    #                 "attach_id==doneReview.attach_id && " +
    #                 "requestee==doneReview.requestee && " +
    #                 "!(bug_status=='closed' && doneReview.review_end_reason=='closed') && " +
    #                 "modified_ts<=doneReview.modified_ts",
    #             "allowNulls": True,
    #             "domain": {"type": "set", "key":["bug_id", "attach_id", "requestee", "modified_ts"], "partitions": ends}
    #         }
    #     ]
    # })

    with Timer("match starts and ends for block {{start}}", {"start": min(bugs)}):
        reviews = []
        ends = Index(data=ends, keys=["bug_id", "attach_id", "request_type", "reviewer"])

        for g, s in qb.groupby(starts, ["bug_id", "attach_id", "request_type", "reviewer"]):
            start_candidates = qb.sort(s, {"value": "request_time", "sort": 1})
            end_candidates = qb.sort(ends[g], {"value": "modified_ts", "sort": 1})

            #ZIP, BUT WITH ADDED CONSTRAINT s.modified_ts<=e.modified_ts
            if len(start_candidates) > 1:
                Log.note("many reviews on one attachment")
            ei = 0
            for i, s in enumerate(start_candidates):
                while ei < len(end_candidates) and end_candidates[ei].modified_ts < coalesce(s.request_time, convert.datetime2milli(Date.MAX)):
                    ei += 1
                e = end_candidates[ei]

                s.review_time = e.modified_ts
                s.review_duration = e.modified_ts - s.request_time
                s.review_result = e.review_result
                s.review_end_reason = e.review_end_reason
                s.product = coalesce(e.product, s.product)
                s.component = coalesce(e.component, s.component)
                s.requester_review_num = -1
                ei += 1

                if s.bug_status == 'closed' and e.review_end_reason == 'closed':
                    #reviews on closed bugs are ignored
                    continue
                reviews.append(s)

        qb.run({
            "from": reviews,
            "window": [{
                "name": "is_first",
                "value": "rownum == 0",
                "edges": ["bug_id"],
                "sort": ["request_time"],
                "aggregate": "none"
            }]
        })

    with Timer("add {{num}} reviews to ES for block {{start}}", {"start": min(bugs), "num": len(reviews)}):
        sink.extend({"json": convert.value2json(r)} for r in reviews)
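
The heart of the matcher is a zip over two sorted lists with the added constraint that an end may not precede its start; each start consumes the earliest remaining end at or after it. Stripped of the review fields (and with an explicit bounds check the original loop omits):

def match_starts_to_ends(starts, ends):
    # Both inputs sorted ascending. Pair each start with the earliest
    # unconsumed end that does not precede it.
    pairs = []
    ei = 0
    for s in starts:
        while ei < len(ends) and ends[ei] < s:
            ei += 1
        if ei == len(ends):
            break                # no end remains for this (or any later) start
        pairs.append((s, ends[ei]))
        ei += 1
    return pairs

# match_starts_to_ends([1, 5, 7], [2, 3, 8]) == [(1, 2), (5, 8)]
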
Example #9
        if self.db.__module__.startswith("pymysql"):
            # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
            # https://github.com/PyMySQL/PyMySQL/issues/157
            for b in backlog:
                sql = self.preamble + b
                try:
                    if self.debug:
                        Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                    self.cursor.execute(b)
                except Exception as e:
                    Log.error("Can not execute sql:\n{{sql}}",
                              sql=sql,
                              cause=e)

            self.cursor.close()
            self.cursor = self.db.cursor()
        else:
            for i, g in qb.groupby(backlog, size=MAX_BATCH_SIZE):
                sql = self.preamble + ";\n".join(g)
                try:
                    if self.debug:
                        Log.note("Execute block of SQL:\n{{sql|indent}}",
                                 sql=sql)
                    self.cursor.execute(sql)
                    self.cursor.close()
                    self.cursor = self.db.cursor()
                except Exception as e:
                    Log.error("Problem executing SQL:\n{{sql|indent}}",
                              sql=sql,
                              cause=e,
                              stack_depth=1)

Example #10
    def transform(self, uid, talos_test_result):
        try:
            r = talos_test_result

            def mainthread_transform(r):
                if r == None:
                    return None

                output = Dict()

                for i in r.mainthread_readbytes:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].readbytes = i[0]
                r.mainthread_readbytes = None

                for i in r.mainthread_writebytes:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].writebytes = i[0]
                r.mainthread_writebytes = None

                for i in r.mainthread_readcount:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].readcount = i[0]
                r.mainthread_readcount = None

                for i in r.mainthread_writecount:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].writecount = i[0]
                r.mainthread_writecount = None

                r.mainthread = output.values()

            mainthread_transform(r.results_aux)
            mainthread_transform(r.results_xperf)

            branch = r.build.branch
            if branch.lower().endswith("-non-pgo"):
                branch = branch[0:-8]
                r.build.branch = branch
                r.build.pgo = False
            else:
                r.build.pgo = True

            if r.machine.osversion.endswith(".e"):
                r.machine.osversion = r.machine.osversion[:-2]
                r.machine.e10s = True

            #ADD PUSH LOG INFO
            try:
                with Profiler("get from pushlog"):
                    revision = Revision(
                        **{
                            "branch": {
                                "name": branch
                            },
                            "changeset": {
                                "id": r.build.revision
                            }
                        })
                    with self.locker:
                        revision = self.repo.get_revision(revision)

                    with self.locker:
                        push = self.repo.get_push(revision)

                    r.build.push_date = push.date
            except Exception as e:
                Log.warning(
                    "{{build.branch}} @ {{build.revision}} (perf_id=={{treeherder.perf_id}}) has no pushlog",
                    r, e)
                # TRY AGAIN LATER
                return []

            new_records = []

            # RECORD THE UNKNOWN PART OF THE TEST RESULTS
            remainder = r.copy()
            remainder.results = None
            if not r.results or len(remainder.keys()) > 4:
                new_records.append(remainder)

            #RECORD TEST RESULTS
            total = DictList()
            if r.run.suite in ["dromaeo_css", "dromaeo_dom"]:
                #dromaeo IS SPECIAL, REPLICATES ARE IN SETS OF FIVE
                #RECORD ALL RESULTS
                for i, (test_name, replicates) in enumerate(r.results.items()):
                    for g, sub_results in qb.groupby(replicates, size=5):
                        new_record = Dict(
                            machine=r.machine,
                            treeherder=r.treeherder,
                            run=r.run,
                            build=r.build,
                            result={
                                "test_name": unicode(test_name) + "." + unicode(g),
                                "ordering": i,
                                "samples": sub_results
                            }
                        )
                        try:
                            s = stats(sub_results)
                            new_record.result.stats = s
                            total.append(s)
                        except Exception as e:
                            Log.warning("can not reduce series to moments", e)
                        new_records.append(new_record)
Example #11
        if self.db.__module__.startswith("pymysql"):
            # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
            # https://github.com/PyMySQL/PyMySQL/issues/157
            for b in backlog:
                sql = self.preamble + b
                try:
                    if self.debug:
                        Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                    self.cursor.execute(b)
                except Exception as e:
                    Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)

            self.cursor.close()
            self.cursor = self.db.cursor()
        else:
            for i, g in qb.groupby(backlog, size=MAX_BATCH_SIZE):
                sql = self.preamble + ";\n".join(g)
                try:
                    if self.debug:
                        Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                    self.cursor.execute(sql)
                    self.cursor.close()
                    self.cursor = self.db.cursor()
                except Exception as e:
                    Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)


    ## Insert dictionary of values into table
    def insert(self, table_name, record):
        keys = record.keys()
Example #12
def update_repo(repo, settings):
    with MySQL(settings.database) as db:
        try:
            pull_repo(repo)

            # GET LATEST DATE
            existing_range = db.query(
                """
                        SELECT
                            max(`date`) `max`,
                            min(`date`) `min`,
                            min(revision) min_rev,
                            max(revision) max_rev
                        FROM
                            changesets
                        WHERE
                            repos={{repos}}
                    """, {"repos": repo.name})[0]

            ranges = wrap([{
                "min":
                coalesce(existing_range.max, convert.milli2datetime(0)) +
                timedelta(days=1)
            }, {
                "max": existing_range.min
            }])

            for r in ranges:
                for g, docs in qb.groupby(get_changesets(date_range=r,
                                                         repo=repo),
                                          size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]

                    db.insert_list("changesets", docs)
                    db.flush()

            missing_revisions = find_holes(db, "changesets", "revision", {
                "min": 0,
                "max": existing_range.max_rev + 1
            }, {"term": {
                "repos": repo.name
            }})
            for _range in missing_revisions:
                for g, docs in qb.groupby(get_changesets(revision_range=_range,
                                                         repo=repo),
                                          size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]

                    db.insert_list("changesets", docs)
                    db.flush()

        except Exception as e:
            Log.warning("Failure to pull from {{repos.name}}", {"repos": repo}, e)
Example #13
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in ["object", "nested"]:
            Log.error("not supported")
        try:
            if c.table == "meta.columns":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                    })
                return
            if c.table == "meta.tables":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            es_index = c.table.split(".")[0]
            result = self.default_es.post("/"+es_index+"/_search", data={
                "aggs": {c.name: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value)
            if cardinality == None:
                Log.error("logic error")

            query = Dict(size=0)
            if c.type in ["object", "nested"]:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and float(cardinality) / count > 0.99):
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {"path": listwrap(c.nested_path)[0]},
                    "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}}

            result = self.default_es.post("/"+es_index+"/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = qb.sort(aggs._nested.buckets.key)
            else:
                parts = qb.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.table.startswith("testing"):
                Log.alert("{{col.table}} does not exist", col=c)
            else:
                self.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
                Log.warning("Could not get {{col.table}}.{{col.abs_name}} info", col=c, cause=e)
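
The elif chain in _update_cardinality boils down to one question: is the partition list small enough to be worth storing? Unique, near-unique, and wide numeric columns keep counts only. The cutoffs, pulled out into a single predicate (the function name is illustrative):

def should_store_partitions(count, cardinality, is_numeric=False):
    # Mirrors the thresholds above: very high cardinality, (near-)unique
    # columns, and wide numeric ranges get counts but no partition list.
    if cardinality > 1000:
        return False
    if count >= 30 and cardinality == count:
        return False
    if count >= 1000 and float(cardinality) / count > 0.99:
        return False
    if is_numeric and cardinality > 30:
        return False
    return True
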
Example #14
    def transform(self, uid, talos_test_result):
        try:
            r = talos_test_result

            def mainthread_transform(r):
                if r == None:
                    return None

                output = Dict()

                for i in r.mainthread_readbytes:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].readbytes = i[0]
                r.mainthread_readbytes = None

                for i in r.mainthread_writebytes:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].writebytes = i[0]
                r.mainthread_writebytes = None

                for i in r.mainthread_readcount:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].readcount = i[0]
                r.mainthread_readcount = None

                for i in r.mainthread_writecount:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].writecount = i[0]
                r.mainthread_writecount = None

                r.mainthread = output.values()

            mainthread_transform(r.results_aux)
            mainthread_transform(r.results_xperf)

            branch = r.build.branch
            if branch.lower().endswith("-non-pgo"):
                branch = branch[0:-8]
                r.build.branch = branch
                r.build.pgo = False
            else:
                r.build.pgo = True

            if r.machine.osversion.endswith(".e"):
                r.machine.osversion = r.machine.osversion[:-2]
                r.machine.e10s = True

            #ADD PUSH LOG INFO
            try:
                with Profiler("get from pushlog"):
                    revision = Revision(**{"branch": {"name": branch}, "changeset": {"id": r.build.revision}})
                    with self.locker:
                        revision = self.repo.get_revision(revision)

                    with self.locker:
                        push = self.repo.get_push(revision)

                    r.build.push_date = push.date
            except Exception as e:
                Log.warning("{{build.branch}} @ {{build.revision}} (perf_id=={{treeherder.perf_id}}) has no pushlog", r, e)
                # TRY AGAIN LATER
                return []

            new_records = []

            # RECORD THE UNKNOWN PART OF THE TEST RESULTS
            remainder = r.copy()
            remainder.results = None
            if not r.results or len(remainder.keys()) > 4:
                new_records.append(remainder)

            #RECORD TEST RESULTS
            total = DictList()
            if r.run.suite in ["dromaeo_css", "dromaeo_dom"]:
                #dromaeo IS SPECIAL, REPLICATES ARE IN SETS OF FIVE
                #RECORD ALL RESULTS
                for i, (test_name, replicates) in enumerate(r.results.items()):
                    for g, sub_results in qb.groupby(replicates, size=5):
                        new_record = Dict(
                            machine=r.machine,
                            treeherder=r.treeherder,
                            run=r.run,
                            build=r.build,
                            result={
                                "test_name": unicode(test_name) + "." + unicode(g),
                                "ordering": i,
                                "samples": sub_results
                            }
                        )
                        try:
                            s = stats(sub_results)
                            new_record.result.stats = s
                            total.append(s)
                        except Exception as e:
                            Log.warning("can not reduce series to moments", e)
                        new_records.append(new_record)
Example #15
def replicate(source, destination, pending, last_updated):
    """
    COPY THE DEPENDENCY RECORDS TO THE destination
    NOTE THAT THE PUBLIC CLUSTER HAS HOLES, SO WE USE blocked TO FILL THEM
    """
    for g, bugs in qb.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"expires_on": {"gte": convert.datetime2milli(last_updated)}}},
                        {"or": [
                            {"exists": {"field": "dependson"}},
                            {"exists": {"field": "blocked"}}
                        ]}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id", "modified_ts", "expires_on", "dependson", "blocked"]
            })

            with Timer("Push to destination"):
                d2 = [
                    {
                        "id": str(x.bug_id) + "_" + str(x.modified_ts)[:-3],
                        "value": {
                            "bug_id": x.bug_id,
                            "modified_ts": x.modified_ts,
                            "expires_on": x.expires_on,
                            "dependson": x.dependson
                        }
                    }
                    for x in data.hits.hits.fields
                    if x.dependson
                ]
                destination.extend(d2)

            with Timer("filter"):
                d4 = qb.run({
                    "from": data.hits.hits.fields,
                    "where": {"exists": {"field": "blocked"}}
                })

            with Timer("select"):
                d3 = qb.run({
                    "from": d4,
                    "select": [
                        {"name": "bug_id", "value": "blocked."},  # SINCE blocked IS A LIST, WE ARE SELECTING THE LIST VALUES, AND EFFECTIVELY PERFORMING A JOIN
                        "modified_ts",
                        "expires_on",
                        {"name": "dependson", "value": "bug_id"}
                    ]
                })

            with Timer("Push to destination"):
                destination.extend([
                    {
                        "id": str(x.bug_id) + "_" + str(x.dependson) + "_" + str(x.modified_ts)[:-3],
                        "value": x
                    }
                    for x in d3
                    if x.dependson
                ])
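
The last two steps invert the dependency edges: selecting from the list-valued blocked field emits one row per element, with dependson set to the original bug_id, which is the join the inline comment refers to. In plain Python the select amounts to:

def invert_blocked(records):
    # Bug B blocking bugs [X, Y] becomes rows (X, dependson=B), (Y, dependson=B).
    out = []
    for r in records:
        for blocked_id in r.get("blocked") or []:
            out.append({
                "bug_id": blocked_id,
                "modified_ts": r["modified_ts"],
                "expires_on": r["expires_on"],
                "dependson": r["bug_id"],
            })
    return out
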