Code example #1
File: redshift.py Project: mozilla/ActiveData-ETL
    def insert_list(self, table_name, records):
        if not records:
            return

        columns = set()
        for r in records:
            columns |= set(r.keys())
        columns = qb.sort(columns)

        try:
            self.execute(
                "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
                {"ids": self.quote_column([r["_id"] for r in records])}
            )

            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in columns]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r.get(k, None)) for k in columns]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception, e:
            Log.error("problem with insert", e)
Code example #2
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    settings = startup.read_settings(defs=[
        {
            "name": ["--bucket"],
            "help": "bucket to scan",
            "type": str,
            "dest": "bucket",
            "required": True
        }
    ])
    Log.start(settings.debug)

    source = Connection(settings.aws).get_bucket(settings.args.bucket)

    for k in qb.sort(source.keys()):
        try:
            data = source.read_bytes(k)
            if convert.ascii2unicode(data).find("2e2834fa7ecd8d3bb1ad49ec981fdb89eb4df95e18") >= 0:
                Log.note("Found at {{key}}", key=k)
        except Exception, e:
            Log.warning("Problem with {{key}}", key=k, cause=e)
        finally:
Code example #3
    def query(self, _query):
        if not self.columns:
            self.columns = []
            alias_done = set()
            metadata = self._es.get_metadata()
            for index, meta in qb.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                for _, properties in meta.mappings.items():
                    columns = _parse_properties(index, properties.properties)
                    for c in columns:
                        c.cube = index
                        c.property = c.name
                        c.name = None
                        c.useSource = None

                    self.columns.extend(columns)
                    for a in meta.aliases:
                        # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                        if a in alias_done:
                            continue
                        alias_done.add(a)
                        for c in columns:
                            self.columns.append(set_default(
                                {"cube": a}, c))  # ENSURE WE COPY

        return qb.run(
            set_default({
                "from": self.columns,
                "sort": ["cube", "property"]
            }, _query.as_dict()))
Code example #4
    def get_or_create_index(
        self,
        index,
        alias=None,
        schema=None,
        limit_replicas=None,
        settings=None
    ):
        from pyLibrary.queries import qb

        settings = deepcopy(settings)
        aliases = self.get_aliases()

        indexes = qb.sort([
            a
            for a in aliases
            if (a.alias == settings.index and settings.alias == None) or
               (re.match(re.escape(settings.index) + "\\d{8}_\\d{6}", a.index) and settings.alias == None) or
            (a.index == settings.index and (a.alias == None or a.alias == settings.alias ))
        ], "index")
        if not indexes:
            output = self.create_index(settings=settings, schema=schema, limit_replicas=limit_replicas)
            return output
        elif indexes.last().alias != None:
            settings.alias = indexes.last().alias
            settings.index = indexes.last().index
        elif settings.alias == None:
            settings.alias = settings.index
            settings.index = indexes.last().index
        return Index(settings)
Code example #5
    def get_schema(self, retry=True):
        if self.settings.explore_metadata:
            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias==self.settings.index:
                #PARTIALLY DEFINED settings
                candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
                index = qb.sort(candidates, 0).last()[1]
            else:
                #FULLY DEFINED settings
                index = indices[self.settings.index]

            if index == None and retry:
                #TRY AGAIN, JUST IN CASE
                self.cluster.cluster_state = None
                return self.get_schema(retry=False)

            properties = index.mappings[self.settings.type]


            #TODO: REMOVE THIS BUG CORRECTION
            if not properties and self.settings.type == "test_result":
                properties = index.mappings["test_results"]
            # DONE BUG CORRECTION

            if not properties:
                Log.error("ElasticSearch index ({{index}}) does not have type ({{type}})",
                    index= self.settings.index,
                    type= self.settings.type)
            return properties
        else:
            mapping = self.cluster.get(self.path + "/_mapping")
            if not mapping[self.settings.type]:
                Log.error("{{index}} does not have type {{type}}", self.settings)
            return wrap({"mappings": mapping[self.settings.type]})
Code example #6
File: meta.py Project: klahnakoski/MoDevETL
    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        alias_done = set()
        index = split_field(table)[0]
        query_path = split_field(table)[1:]
        metadata = self.default_es.get_metadata(index=index)
        for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
            for _, properties in meta.mappings.items():
                columns = _elasticsearch.parse_properties(index, None, properties.properties)
                columns = columns.filter(lambda r: not r.abs_name.startswith("other.") and not r.abs_name.startswith("previous_values.cf_"))  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
                with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG):
                    with self.columns.locker:
                        for c in columns:
                            # ABSOLUTE
                            c.table = join_field([index]+query_path)
                            self.upsert_column(c)

                            for alias in meta.aliases:
                                # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                                if alias in alias_done:
                                    continue
                                alias_done.add(alias)
                                c = copy(c)
                                c.table = join_field([alias]+query_path)
                                self.upsert_column(c)
Code example #7
    def get_or_create_index(self,
                            index,
                            alias=None,
                            schema=None,
                            limit_replicas=None,
                            settings=None):
        from pyLibrary.queries import qb

        settings = deepcopy(settings)
        aliases = self.get_aliases()

        indexes = qb.sort([
            a for a in aliases
            if (a.alias == settings.index and settings.alias == None) or (
                re.match(re.escape(settings.index) + "\\d{8}_\\d{6}", a.index)
                and settings.alias == None) or (a.index == settings.index and (
                    a.alias == None or a.alias == settings.alias))
        ], "index")
        if not indexes:
            output = self.create_index(settings=settings,
                                       schema=schema,
                                       limit_replicas=limit_replicas)
            return output
        elif indexes.last().alias != None:
            settings.alias = indexes.last().alias
            settings.index = indexes.last().index
        elif settings.alias == None:
            settings.alias = settings.index
            settings.index = indexes.last().index
        return Index(settings)
Code example #8
File: meta.py Project: klahnakoski/MoDevETL
    def get_columns(self, table):
        """
        RETURN METADATA COLUMNS
        """
        with self.columns.locker:
            columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
            if columns:
                return columns

        self._get_columns(table=table)
        with self.columns.locker:
            columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
            if columns:
                return columns

        # self._get_columns(table=table)
        Log.error("no columns for {{table}}", table=table)
Code example #9
 def done_count(self):
     self.edge.domain = SimpleSetDomain(
         key="value",
         partitions=[{
             "value": v,
             "dataIndex": i
         } for i, v in enumerate(
             qb.sort(self.edge.domain.partitions,
                     [k for k, v in self.fields]))])
Code example #10
File: elasticsearch.py Project: klahnakoski/MoDevETL
 def get_metadata(self, index=None, force=False):
     with self.metadata_locker:
         if self.settings.explore_metadata:
             if not self._metadata or (force and index is None):
                 response = self.get("/_cluster/state")
                 self._metadata = wrap(response.metadata)
                 self.cluster_state = wrap(self.get("/"))
                 self.version = self.cluster_state.version.number
             elif index:  # UPDATE THE MAPPING FOR ONE INDEX ONLY
                 response = self.get("/"+index+"/_mapping")
                 if self.version.startswith("0.90."):
                     best = qb.sort(response.items(), 0).last()
                     self._metadata.indices[index].mappings = best[1]
                 else:
                     self._metadata.indices[index].mappings = qb.sort(response.items(), 0).last()[1].mappings
                 return Dict(indices={index: self._metadata.indices[index]})
         else:
             Log.error("Metadata exploration has been disabled")
         return self._metadata
Code example #11
File: aggs.py Project: mozilla/ChangeDetector
    def done_count(self):
        columns = map(unicode, range(len(self.fields)))
        parts = wrap([{unicode(i): p for i, p in enumerate(part)} for part in set(self.parts)])
        self.parts = None
        sorted_parts = qb.sort(parts, columns)

        self.edge.domain = self.domain = SimpleSetDomain(
            key="value",
            partitions=[{"value": tuple(v[k] for k in columns), "dataIndex": i} for i, v in enumerate(sorted_parts)],
        )
Code example #12
File: elasticsearch.py Project: klahnakoski/MoDevETL
 def _get_best(self, settings):
     from pyLibrary.queries import qb
     aliases = self.get_aliases()
     indexes = qb.sort([
         a
         for a in aliases
         if (a.alias == settings.index and settings.alias == None) or
         (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
         (a.index == settings.index and (a.alias == None or a.alias == settings.alias))
     ], "index")
     return indexes.last()
Code example #13
File: terms.py Project: klahnakoski/Activedata-ETL
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value
    """

    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({"and": [
                    query.where,
                    {"term": {query.edges[0].value: v}}
                ]})
            }

    data = es09.util.post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = qb.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = DictList([{"name": v, "value": v} for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])
        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
Code example #14
File: hierarchy.py Project: klahnakoski/MoDevETL
def push_to_es(settings, data, dirty):
    global destination

    if not destination:
        index = Index(settings.destination)
        destination = index.threaded_queue(batch_size=100)

    # PREP RECORDS FOR ES
    for bug_id in dirty:
        d = data.descendants[bug_id]
        if not d:
            continue

        p = data.parents[bug_id]
        c = data.children[bug_id]
        destination.add({"id": bug_id, "value": {
            "bug_id": bug_id,
            "parents": qb.sort(p),
            "children": qb.sort(c),
            "descendants": qb.sort(d),
            "etl": {"timestamp": Date.now().unix}
        }})
Code example #15
File: hierarchy.py Project: klahnakoski/MoDevETL
def pull_from_es(settings, destq, all_parents, all_children, all_descendants, work_queue):
    # LOAD PARENTS FROM ES

    for g, r in qb.groupby(qb.sort(work_queue), size=100):
        result = destq.query({
            "from": settings.destination.index,
            "select": "*",
            "where": {"terms": {"bug_id": r}}
        })
        for r in result.data:
            all_parents.extend(r.bug_id, listwrap(r.parents))
            all_children.extend(r.bug_id, listwrap(r.children))
            all_descendants.extend(r.bug_id, listwrap(r.descendants))
Code example #16
File: convert.py Project: mozilla/ChangeDetector
def json_schema_to_markdown(schema):
    from pyLibrary.queries import qb

    def _md_code(code):
        return "`"+code+"`"

    def _md_italic(value):
        return "*"+value+"*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k,v in schema.items():
            full_name = join_field(split_field(parent_name)+[k])
            details = indent+"* "+_md_code(full_name)
            if v.type:
                details += " - "+_md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent+"  "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#"+schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in qb.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##"+_md_code(full_name)+" Property")
            if v.description:
                lines.append(v.description)
            lines.append("")

            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, "  "))
        else:
            lines.append("##"+_md_code(full_name)+" ("+v.type+")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)
Code example #17
    def __init__(
        self,
        alias,  # NAME OF THE ALIAS
        type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
        explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        debug=False,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        settings=None):
        self.debug = debug
        if self.debug:
            Log.alert("Elasticsearch debugging on {{index|quote}} is on",
                      index=settings.index)

        self.settings = settings
        self.cluster = Cluster(settings)

        if type == None:
            if not explore_metadata:
                Log.error(
                    "Alias() was given no `type` (aka schema) and not allowed to explore metadata.  Do not know what to do now."
                )

            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias == self.settings.index:
                candidates = [(name, i) for name, i in indices.items()
                              if self.settings.index in i.aliases]
                index = qb.sort(candidates, 0).last()[1]
            else:
                index = indices[self.settings.index]

            # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
            max_prop = -1
            for _type, mapping in index.mappings.items():
                num_prop = len(mapping.properties.keys())
                if max_prop < num_prop:
                    max_prop = num_prop
                    self.settings.type = _type
                    type = _type

            if type == None:
                Log.error("Can not find schema type for index {{index}}",
                          index=coalesce(self.settings.alias,
                                         self.settings.index))

        self.path = "/" + alias + "/" + type
Code example #18
File: elasticsearch.py Project: klahnakoski/MoDevETL
    def __init__(
        self,
        alias,  # NAME OF THE ALIAS
        type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
        explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        debug=False,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        settings=None
    ):
        self.debug = debug
        if self.debug:
            Log.alert("Elasticsearch debugging on {{index|quote}} is on",  index= settings.index)

        self.settings = settings
        self.cluster = Cluster(settings)

        if type == None:
            if not explore_metadata:
                Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata.  Do not know what to do now.")

            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias==self.settings.index:
                alias_list = self.cluster.get("/_alias/"+self.settings.index)
                candidates = [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()]
                full_name = qb.sort(candidates, 0).last()[0]
                index = self.cluster.get("/" + full_name + "/_mapping")[full_name]
            else:
                index = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index]

            # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
            max_prop = -1
            for _type, mapping in index.mappings.items():
                if _type == "_default_":
                    continue
                num_prop = len(mapping.properties.keys())
                if max_prop < num_prop:
                    max_prop = num_prop
                    self.settings.type = _type
                    type = _type

            if type == None:
                Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

        self.path = "/" + alias + "/" + type
Code example #19
File: mysql.py Project: mozilla/ChangeDetector
    def insert_list(self, table_name, records):
        if not records:
            return

        keys = set()
        for r in records:
            keys |= set(r.keys())
        keys = qb.sort(keys)

        try:
            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in keys]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception, e:
            Log.error("problem with record: {{record}}",  record= records, cause=e)
Code example #20
    def get_schema(self, retry=True):
        if self.settings.explore_metadata:
            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias == self.settings.index:
                #PARTIALLY DEFINED settings
                candidates = [(name, i) for name, i in indices.items()
                              if self.settings.index in i.aliases]
                # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE
                index = "dummy value"
                schema = wrap({"properties": {}})
                for _, ind in qb.sort(candidates, {"value": 0, "sort": -1}):
                    schema.properties = _merge_mapping(
                        schema.properties,
                        ind.mappings[self.settings.type].properties)
            else:
                #FULLY DEFINED settings
                index = indices[self.settings.index]
                schema = index.mappings[self.settings.type]

            if index == None and retry:
                #TRY AGAIN, JUST IN CASE
                self.cluster.cluster_state = None
                return self.get_schema(retry=False)

            #TODO: REMOVE THIS BUG CORRECTION
            if not schema and self.settings.type == "test_result":
                schema = index.mappings["test_results"]
            # DONE BUG CORRECTION

            if not schema:
                Log.error(
                    "ElasticSearch index ({{index}}) does not have type ({{type}})",
                    index=self.settings.index,
                    type=self.settings.type)
            return schema
        else:
            mapping = self.cluster.get(self.path + "/_mapping")
            if not mapping[self.settings.type]:
                Log.error("{{index}} does not have type {{type}}",
                          self.settings)
            return wrap({"mappings": mapping[self.settings.type]})
Code example #21
File: elasticsearch.py Project: klahnakoski/MoDevETL
    def get_schema(self, retry=True):
        if self.settings.explore_metadata:
            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias==self.settings.index:
                #PARTIALLY DEFINED settings
                candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
                # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE

                index = "dummy value"
                schema = wrap({"_routing": {}, "properties": {}})
                for _, ind in qb.sort(candidates, {"value": 0, "sort": -1}):
                    mapping = ind.mappings[self.settings.type]
                    set_default(schema._routing, mapping._routing)
                    schema.properties = _merge_mapping(schema.properties, mapping.properties)
            else:
                #FULLY DEFINED settings
                index = indices[self.settings.index]
                schema = index.mappings[self.settings.type]

            if index == None and retry:
                #TRY AGAIN, JUST IN CASE
                self.cluster.cluster_state = None
                return self.get_schema(retry=False)

            #TODO: REMOVE THIS BUG CORRECTION
            if not schema and self.settings.type == "test_result":
                schema = index.mappings["test_results"]
            # DONE BUG CORRECTION

            if not schema:
                Log.error(
                    "ElasticSearch index ({{index}}) does not have type ({{type}})",
                    index=self.settings.index,
                    type=self.settings.type
                )
            return schema
        else:
            mapping = self.cluster.get(self.path + "/_mapping")
            if not mapping[self.settings.type]:
                Log.error("{{index}} does not have type {{type}}", self.settings)
            return wrap({"mappings": mapping[self.settings.type]})
Code example #22
File: mysql.py Project: mozilla/ActiveData-ETL
    def insert_list(self, table_name, records):
        if not records:
            return

        keys = set()
        for r in records:
            keys |= set(r.keys())
        keys = qb.sort(keys)

        try:
            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in keys]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception, e:
            Log.error("problem with record: {{record}}",
                      record=records,
                      cause=e)
Code example #23
File: reset.py Project: mozilla/ActiveData-ETL
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--bucket"],
            "help": "bucket to reprocess",
            "type": str,
            "dest": "bucket",
            "required": True
        }, {
            "name": ["--begin", "--start"],
            "help": "lowest key (or prefix) to reprocess",
            "type": str,
            "dest": "start",
            "default": "1",
            "required": False
        }, {
            "name": ["--end", "--stop"],
            "help": "highest key (or prefix) to reprocess",
            "type": str,
            "dest": "end",
            "default": None,
            "required": False
        }, {
            "name": ["--file"],
            "help": "path to file with CR-delimited prefix list",
            "type": str,
            "dest": "file",
            "default": None,
            "required": False
        }])
        Log.start(settings.debug)

        with aws.Queue(settings.work_queue) as work_queue:
            source = Connection(settings.aws).get_bucket(settings.args.bucket)

            if settings.args.file:
                now = Date.now()
                for prefix in File(settings.args.file):
                    all_keys = source.keys(prefix=key_prefix(prefix))
                    for k in all_keys:
                        Log.note("Adding {{key}}", key=k)
                        work_queue.add({
                            "bucket": settings.args.bucket,
                            "key": k,
                            "timestamp": now.unix,
                            "date/time": now.format()
                        })
                return

            if settings.args.end and settings.args.start:
                up_to = str(int(settings.args.end) - 1)
                prefix = strings.common_prefix(settings.args.start, up_to)
            else:
                prefix = None
            start = Version(settings.args.start)
            end = Version(settings.args.end)

            all_keys = source.keys(prefix=prefix)
            with Timer("filtering {{num}} keys", {"num": len(all_keys)}):
                all_keys = [(k, Version(k)) for k in all_keys
                            if k.find("None") == -1]
                all_keys = [(k, p) for k, p in all_keys if start <= p < end]
            with Timer("sorting {{num}} keys", {"num": len(all_keys)}):
                all_keys = qb.sort(all_keys, 1)
            for k, p in all_keys:
                Log.note("Adding {{key}}", key=k)
                now = Date.now()
                work_queue.add({
                    "bucket": settings.args.bucket,
                    "key": k,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

    except Exception, e:
        Log.error("Problem with etl", e)
Code example #24
        settings=settings.destination).get_or_create_index(
            settings=settings.destination)

    keep_trying = True
    while keep_trying:
        try:
            all_keys = source.keys()
            keep_trying = False
        except Exception, e:
            Log.warning("problem", e)

    # all_keys = set()
    # for i in range(20, 97, 1):
    #     all_keys |= source.keys(prefix=unicode(i))

    for k in qb.sort(all_keys):
        try:
            pulse_block_to_es.process(k, source.get_key(k), destination)
        except Exception, e:
            Log.warning("Problem with {{key}}", key=k, cause=e)


def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        backfill(settings)
    except Exception, e:
        Log.error("Problem with backfill", e)
    finally:
        Log.stop()
Code example #25
                        "ordering": -1,
                        "stats": geo_mean(total)
                    }
                )
                new_records.append(new_record)

                # ADD RECORD FOR GRAPH SERVER SUMMARY
                new_record = Dict(
                    machine=r.machine,
                    treeherder=r.treeherder,
                    run=r.run,
                    build=r.build,
                    result={
                        "test_name": "summary_old",
                        "ordering": -1,
                        "stats": Stats(samples=qb.sort(total.mean)[:len(total)-1:])
                    }
                )
                new_records.append(new_record)

            return new_records
        except Exception, e:
            Log.error("Transformation failure on id={{uid}}", {"uid": uid}, e)


def stats(values):
    """
    RETURN LOTS OF AGGREGATES
    """
    if values == None:
        return None
Code example #26
File: index_diff.py Project: mozilla/ActiveData-ETL
    prefixes = [
        p.name.rstrip(":") for p in bucket.list(prefix="", delimiter=":")
    ]
    in_s3 = []
    for i, p in enumerate(prefixes):
        if i % 1000 == 0:
            Log.note("Scrubbed {{p|percent(decimal=1)}}", p=i / len(prefixes))
        try:
            if int(p) not in in_es:
                in_s3.append(int(p))
            else:
                pass
        except Exception, _:
            Log.note("delete key {{key}}", key=p)
            bucket.delete_key(strip_extension(p))
    in_s3 = qb.reverse(qb.sort(in_s3))
    return in_s3


def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id (prefix, really) to process",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)
Code example #27
    # EVERYTHING FROM S3
    bucket = s3.Bucket(settings.source)
    prefixes = [p.name.rstrip(":") for p in bucket.list(prefix="", delimiter=":")]
    in_s3 = []
    for i, p in enumerate(prefixes):
        if i % 1000 == 0:
            Log.note("Scrubbed {{p|percent(decimal=1)}}",  p= i / len(prefixes))
        try:
            if int(p) not in in_es:
                in_s3.append(int(p))
            else:
                pass
        except Exception, _:
            Log.note("delete key {{key}}",  key= p)
            bucket.delete_key(strip_extension(p))
    in_s3 = qb.reverse(qb.sort(in_s3))
    return in_s3


def main():
    try:
        settings = startup.read_settings(defs=[
            {
                "name": ["--id"],
                "help": "id (prefix, really) to process",
                "type": str,
                "dest": "id",
                "required": False
            }
        ])
        constants.set(settings.constants)
Code example #28
File: reset.py Project: klahnakoski/Activedata-ETL
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    try:
        settings = startup.read_settings(defs=[
            {
                "name": ["--bucket"],
                "help": "bucket to reprocess",
                "type": str,
                "dest": "bucket",
                "required": True
            },
            {
                "name": ["--begin", "--start"],
                "help": "lowest key (or prefix) to reprocess",
                "type": str,
                "dest": "start",
                "default": "1",
                "required": False
            },
            {
                "name": ["--end", "--stop"],
                "help": "highest key (or prefix) to reprocess",
                "type": str,
                "dest": "end",
                "default": None,
                "required": False
            },
            {
                "name": ["--file"],
                "help": "path to file with CR-delimited prefix list",
                "type": str,
                "dest": "file",
                "default": None,
                "required": False
            }
        ])
        Log.start(settings.debug)

        with aws.Queue(settings.work_queue) as work_queue:
            source = Connection(settings.aws).get_bucket(settings.args.bucket)

            if settings.args.file:
                now = Date.now()
                for prefix in File(settings.args.file):
                    all_keys = source.keys(prefix=key_prefix(prefix))
                    for k in all_keys:
                        Log.note("Adding {{key}}", key=k)
                        work_queue.add({
                            "bucket": settings.args.bucket,
                            "key": k,
                            "timestamp": now.unix,
                            "date/time": now.format()
                        })
                return

            if settings.args.end and settings.args.start:
                up_to = str(int(settings.args.end) - 1)
                prefix = strings.common_prefix(settings.args.start, up_to)
            else:
                prefix = None
            start = Version(settings.args.start)
            end = Version(settings.args.end)

            all_keys = source.keys(prefix=prefix)
            with Timer("filtering {{num}} keys", {"num": len(all_keys)}):
                all_keys = [(k, Version(k)) for k in all_keys if k.find("None") == -1]
                all_keys = [(k, p) for k, p in all_keys if start <= p < end]
            with Timer("sorting {{num}} keys", {"num": len(all_keys)}):
                all_keys = qb.sort(all_keys, 1)
            for k, p in all_keys:
                Log.note("Adding {{key}}",  key= k)
                now = Date.now()
                work_queue.add({
                    "bucket": settings.args.bucket,
                    "key": k,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

    except Exception, e:
        Log.error("Problem with etl", e)
Code example #29
    source = aws.s3.Bucket(settings=settings.source)
    destination = elasticsearch.Cluster(settings=settings.destination).get_or_create_index(settings=settings.destination)

    keep_trying = True
    while keep_trying:
        try:
            all_keys = source.keys()
            keep_trying=False
        except Exception, e:
            Log.warning("problem", e)

    # all_keys = set()
    # for i in range(20, 97, 1):
    #     all_keys |= source.keys(prefix=unicode(i))

    for k in qb.sort(all_keys):
        try:
            pulse_block_to_es.process(k, source.get_key(k), destination)
        except Exception, e:
            Log.warning("Problem with {{key}}", key=k, cause=e)


def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        backfill(settings)
    except Exception, e:
        Log.error("Problem with backfill", e)
    finally:
        Log.stop()
Code example #30
File: aggs.py Project: mozilla/ChangeDetector
 def done_count(self):
     self.edge.domain = self.domain = SimpleSetDomain(partitions=qb.sort(set(self.parts)))
     self.parts = None
Code example #31
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value
    """

    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter":
                simplify_esfilter({
                    "and": [query.where, {
                        "term": {
                            query.edges[0].value: v
                        }
                    }]
                })
            }

    data = es09.util.post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = qb.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = DictList([{
        "name": v,
        "value": v
    } for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])
        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
Code example #32
File: aggs.py Project: klahnakoski/intermittents
 def done_count(self):
     self.edge.domain = SimpleSetDomain(
         key="value",
         partitions=[{"value": v, "dataIndex": i} for i, v in enumerate(qb.sort(self.edge.domain.partitions, [k for k, v in self.fields]))]
     )
Code example #33
File: aggs.py Project: klahnakoski/intermittents
 def done_count(self):
     self.edge.domain = SimpleSetDomain(
         partitions=qb.sort(self.edge.domain.partitions)
     )
Code example #34
File: mysql.py Project: mozilla/ActiveData-ETL
def int_list_packer(term, values):
    """
    return singletons, ranges and exclusions
    """
    DENSITY = 10  # a range can have holes, this is inverse of the hole density
    MIN_RANGE = 20  # min members before a range is allowed to be used

    singletons = set()
    ranges = []
    exclude = set()

    sorted = qb.sort(values)

    last = sorted[0]
    curr_start = last
    curr_excl = set()

    for v in sorted[1::]:
        if v <= last + 1:
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or (
                (last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles

                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or (
        (last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: qb.sort(exclude)}}}]}
        if singletons:
            return {"or": [{"terms": {term: qb.sort(singletons)}}, r]}
        else:
            return r
    else:
        raise Except("no packing possible")
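A hedged usage sketch for int_list_packer above. The input values are made up; the expected shape of the result (an ES-style filter built from "terms" and "range" clauses) follows from the function body, and the commented output is indicative only.

# A dense run 1..29 plus one outlier; illustrative values only.
values = list(range(1, 30)) + [1000]
es_filter = int_list_packer("bug_id", values)
# Roughly:
# {"or": [{"terms": {"bug_id": [1000]}},
#         {"or": [{"range": {"bug_id": {"gte": 1, "lte": 29}}}]}]}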
Code example #35
File: meta.py Project: klahnakoski/MoDevETL
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in ["object", "nested"]:
            Log.error("not supported")
        try:
            if c.table == "meta.columns":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                    })
                return
            if c.table == "meta.tables":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            es_index = c.table.split(".")[0]
            result = self.default_es.post("/"+es_index+"/_search", data={
                "aggs": {c.name: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value)
            if cardinality == None:
                Log.error("logic error")

            query = Dict(size=0)
            if c.type in ["object", "nested"]:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {"path": listwrap(c.nested_path)[0]},
                    "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}}

            result = self.default_es.post("/"+es_index+"/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = qb.sort(aggs._nested.buckets.key)
            else:
                parts = qb.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
        except Exception, e:
            if "IndexMissingException" in e and c.table.startswith("testing"):
                Log.alert("{{col.table}} does not exist", col=c)
            else:
                self.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
                Log.warning("Could not get {{col.table}}.{{col.abs_name}} info", col=c, cause=e)
Code example #36
File: mysql.py Project: mozilla/ChangeDetector
def int_list_packer(term, values):
    """
    return singletons, ranges and exclusions
    """
    DENSITY = 10  # a range can have holes, this is inverse of the hole density
    MIN_RANGE = 20  # min members before a range is allowed to be used

    singletons = set()
    ranges = []
    exclude = set()

    sorted = qb.sort(values)

    last = sorted[0]
    curr_start = last
    curr_excl = set()

    for v in sorted[1::]:
        if v <= last + 1:
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles

                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: qb.sort(exclude)}}}]}
        if singletons:
            return {"or": [
                {"terms": {term: qb.sort(singletons)}},
                r
            ]}
        else:
            return r
    else:
        raise Except("no packing possible")
Code example #37
File: etl.py Project: klahnakoski/Activedata-ETL
    def _dispatch_work(self, source_block):
        """
        source_block POINTS TO THE bucket AND key TO PROCESS
        :return: False IF THERE IS NOTHING LEFT TO DO
        """
        source_keys = listwrap(coalesce(source_block.key, source_block.keys))

        if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
            source_block.bucket = source_block.bucket.bucket
        bucket = source_block.bucket
        work_actions = [w for w in self.settings.workers if w.source.bucket == bucket]

        if not work_actions:
            Log.note("No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
                bucket= source_block.bucket,
                message= source_block,
                action= "skipping" if self.settings.keep_unknown_on_queue else "deleting")
            return not self.settings.keep_unknown_on_queue

        for action in work_actions:
            try:
                source_key = unicode(source_keys[0])
                if len(source_keys) > 1:
                    multi_source = action._source
                    source = ConcatSources([multi_source.get_key(k) for k in source_keys])
                    source_key = MIN(source_key)
                else:
                    source = action._source.get_key(source_key)
                    source_key = source.key

                Log.note("Execute {{action}} on bucket={{source}} key={{key}}",
                    action= action.name,
                    source= source_block.bucket,
                    key= source_key)

                if action.transform_type == "bulk":
                    old_keys = set()
                else:
                    old_keys = action._destination.keys(prefix=source_block.key)

                new_keys = set(action._transformer(source_key, source, action._destination, resources=self.resources, please_stop=self.please_stop))

                #VERIFY KEYS
                if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                    pass  # ok
                else:
                    etls = map(key2etl, new_keys)
                    etls = qb.sort(etls, "id")
                    for i, e in enumerate(etls):
                        if i != e.id:
                            Log.error("expecting keys to have dense order: {{ids}}", ids=etls.id)
                    #VERIFY KEYS EXIST
                    if hasattr(action._destination, "get_key"):
                        for k in new_keys:
                            action._destination.get_key(k)

                for n in action._notify:
                    for k in new_keys:
                        n.add(k)

                if action.transform_type == "bulk":
                    continue

                # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
                # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
                # for n in new_keys:
                #     if not n.startswith(source_key):
                #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})",  new_key= n,  source_key= source_key)

                if not new_keys and old_keys:
                    Log.alert("Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}",
                        old_keys= old_keys,
                        source_key= source_key)
                    continue
                elif not new_keys:
                    Log.alert("Expecting some new keys after processing {{source_key}}",
                        old_keys= old_keys,
                        source_key= source_key)
                    continue

                for k in new_keys:
                    if len(k.split(".")) == 3 and action.destination.type!="test_result":
                        Log.error("two dots have not been needed yet, this is a consistency check")

                delete_me = old_keys - new_keys
                if delete_me:
                    if action.destination.bucket == "ekyle-test-result":
                        for k in delete_me:
                            action._destination.delete_key(k)
                    else:
                        Log.note("delete keys?\n{{list}}",  list= sorted(delete_me))
                        # for k in delete_me:
                # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
                # AND NOT GOING TO AN S3 BUCKET
                if not action._notify and isinstance(action._destination, (aws.s3.Bucket, S3Bucket)):
                    for k in old_keys | new_keys:
                        self.work_queue.add(Dict(
                            bucket=action.destination.bucket,
                            key=k
                        ))
            except Exception, e:
                if "Key {{key}} does not exist" in e:
                    err = Log.warning
                elif "multiple keys in {{bucket}}" in e:
                    err = Log.warning
                    if source_block.bucket=="ekyle-test-result":
                        for k in action._source.list(prefix=key_prefix(source_key)):
                            action._source.delete_key(strip_extension(k.key))
                elif "expecting keys to have dense order" in e:
                    err = Log.warning
                    if source_block.bucket=="ekyle-test-result":
                        # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                        self.work_queue.add({
                            "key": unicode(key_prefix(source_key)),
                            "bucket": "ekyle-pulse-logger"
                        })
                elif "Expecting a pure key" in e:
                    err = Log.warning
                else:
                    err = Log.error

                err("Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}", {
                    "action": action.name,
                    "source": source_block.bucket,
                    "key": source_key,
                    "destination": coalesce(action.destination.name, action.destination.index)
                }, e)
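The dense-order check above (etls = qb.sort(etls, "id") followed by the i != e.id comparison) verifies that the ETL ids embedded in the new keys form an unbroken run 0..n-1. A minimal sketch of the same check in plain Python, assuming a simplified key format of "<source>.<id>" and a stand-in key2etl (both are assumptions for illustration, not the project's real parser):

def key2etl(key):
    # Hypothetical stand-in: the real key2etl parses a richer key structure.
    return {"id": int(key.split(".")[-1])}

def verify_dense(new_keys):
    etls = sorted((key2etl(k) for k in new_keys), key=lambda e: e["id"])
    for i, e in enumerate(etls):
        if i != e["id"]:
            raise ValueError("expecting keys to have dense order: %s" % [x["id"] for x in etls])

verify_dense(["123.0", "123.1", "123.2"])   # passes
# verify_dense(["123.0", "123.2"])          # would raise: id 1 is missing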
Code Example #38
                                  })
                new_records.append(new_record)

                # ADD RECORD FOR GRAPH SERVER SUMMARY
                new_record = Dict(
                    machine=r.machine,
                    treeherder=r.treeherder,
                    run=r.run,
                    build=r.build,
                    result={
                        "test_name": "summary_old",
                        "ordering": -1,
                        "stats": Stats(samples=qb.sort(total.mean)[:len(total) - 1:])
                    })
                new_records.append(new_record)

            return new_records
        except Exception, e:
            Log.error("Transformation failure on id={{uid}}", {"uid": uid}, e)


def stats(values):
    """
    RETURN LOTS OF AGGREGATES
    """
    if values == None:
        return None
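For context on the summary_old record above: qb.sort(total.mean)[:len(total) - 1:] sorts the per-test means ascending and drops the single largest one before building the Stats sample. A rough plain-Python equivalent of that selection (the shape of total and the Stats type are assumptions here, not part of this example):

def summary_samples(means):
    # Sort ascending and drop the single largest mean, mirroring
    # qb.sort(total.mean)[:len(total) - 1:] above (sketch only; no Stats object).
    return sorted(means)[:len(means) - 1]

print(summary_samples([3.2, 1.1, 9.7, 2.4]))   # [1.1, 2.4, 3.2] -- the 9.7 outlier is dropped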
Code Example #39
File: lists.py Project: klahnakoski/MoDevETL
    def sort(self, sort):
        return ListContainer("from " + self.name, qb.sort(self.data, sort), self.schema)
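ListContainer.sort above simply delegates to qb.sort. Across these examples qb.sort accepts a single field name ("id"), a list of fields (["cube", "property"]), or a descriptor such as {"value": "request_time", "sort": 1}. The following is a hedged approximation of those three forms using only the standard library, for readers without pyLibrary installed; it mimics the observed behaviour and is not the library's implementation:

rows = [
    {"bug_id": 2, "request_time": 50},
    {"bug_id": 1, "request_time": 90},
    {"bug_id": 1, "request_time": 10},
]

# qb.sort(rows, "bug_id") -- single field, ascending
by_field = sorted(rows, key=lambda r: r["bug_id"])

# qb.sort(rows, ["bug_id", "request_time"]) -- multiple fields
by_fields = sorted(rows, key=lambda r: (r["bug_id"], r["request_time"]))

# qb.sort(rows, {"value": "request_time", "sort": -1}) -- explicit direction, -1 for descending
descending = sorted(rows, key=lambda r: r["request_time"], reverse=True)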
Code Example #40
    def done_count(self):
        self.edge.domain = SimpleSetDomain(
            partitions=qb.sort(self.edge.domain.partitions))
Code Example #41
File: reviews.py Project: klahnakoski/MoDevETL
def full_etl(settings, sink, bugs):
    with Timer("process block {{start}}", {"start": min(bugs)}):
        es = elasticsearch.Index(settings.source)
        with FromES(es) as esq:
            versions = esq.query({
                "from": "bugs",
                "select": "*",
                "where": {"terms": {"bug_id": bugs}}
            })

        starts = qb.run({
            "select": [
                "bug_id",
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "request_time", "value": "modified_ts"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "created_by", "value": "attachments.created_by"},
                "product",
                "component"
            ],
            "from":
                versions,
            "where":
                {"and": [
                    {"terms": {"attachments.flags.request_status": ["?"]}},
                    {"terms": {"attachments.flags.request_type": TYPES}},
                    {"equal": ["attachments.flags.modified_ts", "modified_ts"]},
                    {"term": {"attachments.isobsolete": 0}}
                ]},
            "sort": ["bug_id", "attach_id", "created_by"]
        })

        ends = qb.run({
            "select": [
                {"name": "bug_id", "value": "bug_id"},
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "modified_ts", "value": lambda r: Math.max(r.modified_ts, r.attachments.modified_ts, r.attachments.flags.modified_ts)},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "modified_by", "value": "attachments.flags.modified_by"},
                {"name": "product", "value": "product"},
                {"name": "component", "value": "component"},
                {"name": "review_end_reason", "value": lambda r: 'done' if r.attachments.flags.request_status != '?' else ('obsolete' if r.attachments.isobsolete == 1 else 'closed')},
                {"name": "review_result", "value": lambda r: '+' if r.attachments.flags.request_status == '+' else ('-' if r.attachments.flags.request_status == '-' else '?')}
            ],
            "from":
                versions,
            "where":
                {"and": [
                    {"terms": {"attachments.flags.request_type": TYPES}},
                    {"or": [
                        {"and": [# IF THE REQUESTEE SWITCHED THE ? FLAG, THEN IT IS DONE
                            {"term": {"attachments.flags.previous_status": "?"}},
                            {"not": {"term": {"attachments.flags.request_status": "?"}}},
                            {"equal": ["attachments.flags.modified_ts", "modified_ts"]}
                        ]},
                        {"and": [# IF OBSOLETED THE ATTACHMENT, IT IS DONE
                            {"term": {"attachments.isobsolete": 1}},
                            {"term": {"previous_values.isobsolete_value": 0}}
                        ]},
                        {"and": [# SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
                            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
                            {"not": {"terms": {"previous_values.bug_status_value": ["resolved", "verified", "closed"]}}}
                        ]}
                    ]}
                ]}
        })

        # SOME ATTACHMENTS GO MISSING, CLOSE THEM TOO
        closed_bugs = {b.bug_id: b for b in qb.filter(versions, {"and": [# SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
            {"range": {"expires_on": {"gte": Date.now().milli}}}
        ]})}

        for s in starts:
            if s.bug_id in closed_bugs:
                e = closed_bugs[s.bug_id]
                ends.append({
                    "bug_id": e.bug_id,
                    "bug_status": e.bug_status,
                    "attach_id": s.attach_id,
                    "modified_ts": e.modified_ts,
                    "reviewer": s.reviewer,
                    "request_type": s.request_type,
                    "modified_by": e.modified_by,
                    "product": e.product,
                    "component": e.component,
                    "review_end_reason": 'closed',
                    "review_result": '?'
                })

        # REVIEWS END WHEN REASSIGNED TO SOMEONE ELSE
        changes = qb.run({
            "select": [
                "bug_id",
                {"name": "attach_id", "value": "changes.attach_id"},
                "modified_ts",
                {"name": "reviewer", "value": lambda r: r.changes.old_value.split("?")[1]},
                {"name": "request_type", "value": lambda r: r.changes.old_value.split("?")[0]},
                {"name": "modified_by", "value": "null"},
                "product",
                "component",
                {"name": "review_end_reason", "value": "'reassigned'"}
            ],
            "from":
                versions,
            "where":
                {"and": [# ONLY LOOK FOR NAME CHANGES IN THE "review?" FIELD
                    {"term": {"changes.field_name": "flags"}},
                    {"or": [{"prefix": {"changes.old_value": t + "?"}} for t in TYPES]}
                ]}
        })

        ends.extend(changes)

    # PYTHON VERSION NOT CAPABLE OF THIS JOIN, YET
    # reviews = qb.run({
    #     "from":
    #         starts,
    #     "select": [
    #         {"name": "bug_status", "value": "bug_status", "aggregate": "one"},
    #         {"name": "review_time", "value": "doneReview.modified_ts", "aggregate": "minimum"},
    #         {"name": "review_result", "value": "doneReview.review_result", "aggregate": "minimum"},
    #         {"name": "product", "value": "coalesce(doneReview.product, product)", "aggregate": "minimum"},
    #         {"name": "component", "value": "coalesce(doneReview.component, component)", "aggregate": "minimum"},
    #         # {"name": "keywords", "value": "(coalesce(keywords, '')+' '+ETL.parseWhiteBoard(whiteboard)).trim()+' '+flags", "aggregate": "one"},
    #         {"name": "requester_review_num", "value": "-1", "aggregate": "one"}
    #     ],
    #     "analytic": [
    #         {"name": "is_first", "value": "rownum==0 ? 1 : 0", "sort": "request_time", "edges": ["bug_id"]}
    #     ],
    #     "edges": [
    #         "bug_id",
    #         "attach_id",
    #         {"name": "reviewer", "value": "requestee"},
    #         {"name": "requester", "value": "created_by"},
    #         {"name": "request_time", "value": "modified_ts"},
    #         {
    #             "name": "doneReview",
    #             "test":
    #                 "bug_id==doneReview.bug_id && " +
    #                 "attach_id==doneReview.attach_id && " +
    #                 "requestee==doneReview.requestee && " +
    #                 "!(bug_status=='closed' && doneReview.review_end_reason=='closed') && " +
    #                 "modified_ts<=doneReview.modified_ts",
    #             "allowNulls": True,
    #             "domain": {"type": "set", "key":["bug_id", "attach_id", "requestee", "modified_ts"], "partitions": ends}
    #         }
    #     ]
    # })

    with Timer("match starts and ends for block {{start}}", {"start":min(*bugs)}):
        reviews = []
        ends = Index(data=ends, keys=["bug_id", "attach_id", "request_type", "reviewer"])

        for g, s in qb.groupby(starts, ["bug_id", "attach_id", "request_type", "reviewer"]):
            start_candidates = qb.sort(s, {"value": "request_time", "sort": 1})
            end_candidates = qb.sort(ends[g], {"value": "modified_ts", "sort": 1})

            #ZIP, BUT WITH ADDED CONSTRAINT s.modified_ts<=e.modified_ts
            if len(start_candidates) > 1:
                Log.note("many reviews on one attachment")
            ei = 0
            for i, s in enumerate(start_candidates):
                while ei < len(end_candidates) and end_candidates[ei].modified_ts < coalesce(s.request_time, convert.datetime2milli(Date.MAX)):
                    ei += 1
                e = end_candidates[ei]

                s.review_time = e.modified_ts
                s.review_duration = e.modified_ts - s.request_time
                s.review_result = e.review_result
                s.review_end_reason = e.review_end_reason
                s.product = coalesce(e.product, s.product)
                s.component = coalesce(e.component, s.component)
                s.requester_review_num = -1
                ei += 1

                if s.bug_status == 'closed' and e.review_end_reason == 'closed':
                    #reviews on closed bugs are ignored
                    continue
                reviews.append(s)

        qb.run({
            "from": reviews,
            "window": [{
                "name": "is_first",
                "value": "rownum == 0",
                "edges": ["bug_id"],
                "sort": ["request_time"],
                "aggregate": "none"
            }]
        })

    with Timer("add {{num}} reviews to ES for block {{start}}", {"start": min(*bugs), "num": len(reviews)}):
        sink.extend({"json": convert.value2json(r)} for r in reviews)
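The start/end matching inside full_etl is effectively a merge of two time-sorted lists under the constraint that an end event must not precede its start. Below is a minimal standalone sketch of just that pairing step, with the fields trimmed down and no Elasticsearch involved; it is a sketch of the idea, not the module's code:

def pair_reviews(starts, ends):
    # Both lists sorted by time; each start consumes the first end at or after it.
    starts = sorted(starts, key=lambda s: s["request_time"])
    ends = sorted(ends, key=lambda e: e["modified_ts"])
    paired = []
    ei = 0
    for s in starts:
        while ei < len(ends) and ends[ei]["modified_ts"] < s["request_time"]:
            ei += 1
        if ei >= len(ends):
            break    # no matching end yet; the review is still open
        e = ends[ei]
        ei += 1
        s["review_time"] = e["modified_ts"]
        s["review_duration"] = e["modified_ts"] - s["request_time"]
        paired.append(s)
    return paired

print(pair_reviews(
    [{"request_time": 10}, {"request_time": 40}],
    [{"modified_ts": 30}, {"modified_ts": 70}],
))    # both starts matched: durations 20 and 30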
Code Example #42
    def _dispatch_work(self, source_block):
        """
        source_block POINTS TO THE bucket AND key TO PROCESS
        :return: False IF THERE IS NOTHING LEFT TO DO
        """
        source_keys = listwrap(coalesce(source_block.key, source_block.keys))

        if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
            source_block.bucket = source_block.bucket.bucket
        bucket = source_block.bucket
        work_actions = [
            w for w in self.settings.workers if w.source.bucket == bucket
        ]

        if not work_actions:
            Log.note(
                "No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
                bucket=source_block.bucket,
                message=source_block,
                action="skipping"
                if self.settings.keep_unknown_on_queue else "deleting")
            return not self.settings.keep_unknown_on_queue

        for action in work_actions:
            try:
                source_key = unicode(source_keys[0])
                if len(source_keys) > 1:
                    multi_source = action._source
                    source = ConcatSources(
                        [multi_source.get_key(k) for k in source_keys])
                    source_key = MIN(source_key)
                else:
                    source = action._source.get_key(source_key)
                    source_key = source.key

                Log.note("Execute {{action}} on bucket={{source}} key={{key}}",
                         action=action.name,
                         source=source_block.bucket,
                         key=source_key)

                if action.transform_type == "bulk":
                    old_keys = set()
                else:
                    old_keys = action._destination.keys(
                        prefix=source_block.key)

                new_keys = set(
                    action._transformer(source_key,
                                        source,
                                        action._destination,
                                        resources=self.resources,
                                        please_stop=self.please_stop))

                #VERIFY KEYS
                if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                    pass  # ok
                else:
                    etls = map(key2etl, new_keys)
                    etls = qb.sort(etls, "id")
                    for i, e in enumerate(etls):
                        if i != e.id:
                            Log.error(
                                "expecting keys to have dense order: {{ids}}",
                                ids=etls.id)
                    #VERIFY KEYS EXIST
                    if hasattr(action._destination, "get_key"):
                        for k in new_keys:
                            action._destination.get_key(k)

                for n in action._notify:
                    for k in new_keys:
                        n.add(k)

                if action.transform_type == "bulk":
                    continue

                # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
                # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
                # for n in new_keys:
                #     if not n.startswith(source_key):
                #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})",  new_key= n,  source_key= source_key)

                if not new_keys and old_keys:
                    Log.alert(
                        "Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}",
                        old_keys=old_keys,
                        source_key=source_key)
                    continue
                elif not new_keys:
                    Log.alert(
                        "Expecting some new keys after processing {{source_key}}",
                        old_keys=old_keys,
                        source_key=source_key)
                    continue

                for k in new_keys:
                    if len(k.split(".")) == 3 and action.destination.type != "test_result":
                        Log.error("two dots have not been needed yet, this is a consistency check")

                delete_me = old_keys - new_keys
                if delete_me:
                    if action.destination.bucket == "ekyle-test-result":
                        for k in delete_me:
                            action._destination.delete_key(k)
                    else:
                        Log.note("delete keys?\n{{list}}",
                                 list=sorted(delete_me))
                        # for k in delete_me:
                # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
                # AND NOT GOING TO AN S3 BUCKET
                if not action._notify and isinstance(
                        action._destination, (aws.s3.Bucket, S3Bucket)):
                    for k in old_keys | new_keys:
                        self.work_queue.add(
                            Dict(bucket=action.destination.bucket, key=k))
            except Exception, e:
                if "Key {{key}} does not exist" in e:
                    err = Log.warning
                elif "multiple keys in {{bucket}}" in e:
                    err = Log.warning
                    if source_block.bucket == "ekyle-test-result":
                        for k in action._source.list(
                                prefix=key_prefix(source_key)):
                            action._source.delete_key(strip_extension(k.key))
                elif "expecting keys to have dense order" in e:
                    err = Log.warning
                    if source_block.bucket == "ekyle-test-result":
                        # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                        self.work_queue.add({
                            "key": unicode(key_prefix(source_key)),
                            "bucket": "ekyle-pulse-logger"
                        })
                elif "Expecting a pure key" in e:
                    err = Log.warning
                else:
                    err = Log.error

                err(
                    "Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}",
                    {
                        "action": action.name,
                        "source": source_block.bucket,
                        "key": source_key,
                        "destination": coalesce(action.destination.name, action.destination.index)
                    }, e)
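The cleanup at the end of _dispatch_work is plain set arithmetic over destination keys: old_keys - new_keys are deletion candidates, and old_keys | new_keys are pushed back onto the work queue when nothing else is being notified and the destination is another S3 bucket. A self-contained sketch of just that bookkeeping, where the bucket name and queue shape are illustrative stand-ins:

old_keys = {"123.0", "123.1", "123.2"}    # keys already in the destination
new_keys = {"123.0", "123.1", "123.3"}    # keys produced by this ETL pass

delete_me = old_keys - new_keys           # no longer produced; candidates for deletion
requeue = old_keys | new_keys             # everything downstream should re-read

print(sorted(delete_me))                  # ['123.2']
work_queue = [{"bucket": "downstream-bucket", "key": k} for k in sorted(requeue)]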