Ejemplo n.º 1
0
def doc_to_column(doc):
    """
    Convert a stored column document into a `Column`, repairing known defects.

    The document is passed through `untyped()` and `wrap()`, a series of
    fixes is applied in order, and the result is handed to `Column(**doc)`.

    :param doc: description of one column (presumably as read back from ES -
                confirm against the caller)
    :return: the repaired `Column`; or `None` when any step raises, in which
             case a best-effort attempt is made to mark the column as deleted
    """
    try:
        doc = wrap(untyped(doc))

        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX: A MISSING last_updated IS BACKDATED BY `YEAR`
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX: A MISSING es_type CAN ONLY BE FILLED IN FOR OBJECT COLUMNS,
        # ANYTHING ELSE IS JUST REPORTED
        # NOTE(review): `== None` (NOT `is None`) IS KEPT ON PURPOSE; IT
        # PRESUMABLY ALSO MATCHES THE Null OBJECT - CONFIRM
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX: NESTED COLUMNS ALWAYS GET multi=1001, OTHERS KEEP THEIR VALUE
        doc.multi = 1001 if doc.es_type == "nested" else doc.multi

        # FIX: nested_path IS ALWAYS A TUPLE, AND A COLUMN WHOSE LAST PATH
        # STEP IS THE NESTED MARKER MUST BE DECLARED NESTED
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(
                doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = Date.now()

        # FIX: WHEN "." IS THE SECOND-TO-LAST ENTRY, DROP THE LAST ENTRY
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]

        # FIX: nested_path MUST AGREE WITH THE PATH DERIVED FROM es_column
        if untype_path(doc.es_column) == doc.es_column:
            # es_column CARRIES NO TYPE MARKERS: ONLY (".",) IS EXPECTED,
            # EXCEPT FOR THE "repo" INDEX, WHICH IS LEFT ALONE
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path

        # FIX: A COLUMN WHOSE LAST PATH STEP IS THE EXISTS MARKER IS AN
        # EXISTS COLUMN
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            doc.jx_type = EXISTS

        return Column(**doc)
    except Exception:
        # BEST-EFFORT CLEANUP, WHICH MUST NOT RAISE: `doc` MAY STILL BE THE RAW
        # ARGUMENT (IF wrap() FAILED), AND Column(**doc) MAY FAIL AGAIN FOR THE
        # SAME REASON IT FAILED ABOVE
        try:
            doc.nested_path = ["."]
            mark_as_deleted(Column(**doc))
        except Exception as cause:
            Log.warning("can not mark column as deleted", cause=cause)
        return None
Ejemplo n.º 2
0
    def _get_queue(self, row):
        """
        Return the queue that should receive `row`, creating the backing
        index and its threaded queue on first use.

        :param row: record with a `json` (serialized) or `value` property
        :return: the queue cached for the row's rollover interval;
                 `Null` when no timestamp can be extracted from the row;
                 `DATA_TOO_OLD` when the timestamp is earlier than
                 `Date.today() - self.rollover_max`
        """
        row = wrap(row)
        if row.json:
            # DECODE ONCE: KEEP THE VALUE, DROP THE SERIALIZED TEXT
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        # QUEUES ARE CACHED PER ROLLOVER INTERVAL, KEYED BY THE FLOORED TIMESTAMP
        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            # EXISTING INDEXES NAMED <settings.index><8 DIGITS>_<6 DIGITS>,
            # SORTED BY INDEX NAME
            candidates = wrap(sort_using_key(
                filter(
                    lambda r: re.match(
                        re.escape(self.settings.index) + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d$",
                        r['index']
                    ),
                    self.cluster.get_aliases()
                ),
                key=lambda r: r['index']
            ))
            # best = LAST CANDIDATE (IN NAME ORDER) DATED BEFORE timestamp;
            # THE DATE IS PARSED FROM THE LAST 15 CHARACTERS OF THE INDEX NAME
            best = None
            for c in candidates:
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                # NOTE(review): IF best IS None AND THE LAST CANDIDATE IS DATED
                # AFTER rounded_timestamp, THE NEXT BRANCH READS best.alias ON
                # None - LOOKS LIKE IT WOULD FAIL; CONFIRM
                if rounded_timestamp < wrap(last(candidates)).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        # ANY FAILURE OTHER THAN "ALREADY EXISTS" IS REPORTED
                        # (Log.error PRESUMABLY RAISES - CONFIRM); OTHERWISE
                        # START OVER, PRESUMABLY TO PICK UP THE EXISTING INDEX
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            def refresh(please_stop):
                # SET THE REFRESH INTERVAL FROM SETTINGS (DEFAULT 60 * 10
                # SECONDS); A FAILURE IS ONLY NOTED
                try:
                    es.set_refresh_interval(seconds=coalesce(Duration(self.settings.refresh_interval).seconds, 60 * 10), timeout=5)
                except Exception:
                    Log.note("Could not set refresh interval for {{index}}", index=es.settings.index)

            # RUN refresh ON ITS OWN THREAD
            Thread.run("refresh", refresh).release()

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            # PUBLISH THE NEW QUEUE UNDER THE LOCK
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
Ejemplo n.º 3
0
 def partial_eval(self):
     """
     Simplify this expression.

     The inner term is simplified first.  That simplified term is returned
     as-is when the original term is already a `LastOp`, when it is neither
     an OBJECT nor `many`, or when it is `NULL`.  A literal is reduced with
     `last()`; anything else is re-wrapped in a `LastOp` of this language.
     """
     simplified = self.term.partial_eval()

     # ALL THE CASES THAT HAND BACK THE SIMPLIFIED TERM UNCHANGED
     if (
         is_op(self.term, LastOp)
         or (simplified.type != OBJECT and not simplified.many)
         or simplified is NULL
     ):
         return simplified

     if is_literal(simplified):
         return last(simplified)

     return self.lang[LastOp(simplified)]
    def __init__(self, var):
        """
        Reference a property of a document.

        :param var:  DOT DELIMITED PATH INTO A DOCUMENT
        """
        Expression.__init__(self, None)
        self.var = get_property_name(var)

        # WHEN THE LAST STEP OF THE PATH IS KNOWN TO
        # inserter_type_to_json_type, IT DECIDES THE DATA TYPE
        final_step = last(split_field(var))
        known_type = inserter_type_to_json_type.get(final_step)
        if known_type:
            self.data_type = known_type
Ejemplo n.º 5
0
def doc_to_column(doc):
    """
    Convert a stored column document into a `Column`, repairing known defects.

    :param doc: description of one column; it is passed through `untyped()`
                and `wrap()` before the repairs are applied
    :return: the repaired `Column`; or `None` when any step raises, in which
             case a best-effort attempt is made to mark the column as deleted
    """
    try:
        doc = wrap(untyped(doc))

        # A MISSING last_updated IS BACKDATED BY `YEAR`
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # A MISSING es_type CAN ONLY BE FILLED IN FOR OBJECT COLUMNS
        # NOTE(review): `== None` (NOT `is None`) IS KEPT ON PURPOSE; IT
        # PRESUMABLY ALSO MATCHES THE Null OBJECT - CONFIRM
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # NESTED COLUMNS ALWAYS GET multi=1001, OTHERS KEEP THEIR VALUE
        doc.multi = 1001 if doc.es_type == "nested" else doc.multi

        # nested_path IS ALWAYS A TUPLE, AND A COLUMN WHOSE LAST PATH STEP IS
        # THE NESTED MARKER MUST BE DECLARED NESTED
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(
                doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = Date.now()

        # WHEN "." IS THE SECOND-TO-LAST ENTRY, DROP THE LAST ENTRY
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]

        # nested_path MUST AGREE WITH THE PATH DERIVED FROM es_column
        if untype_path(doc.es_column) == doc.es_column:
            # es_column CARRIES NO TYPE MARKERS: ONLY (".",) IS EXPECTED,
            # EXCEPT FOR THE "repo" INDEX, WHICH IS LEFT ALONE
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
        return Column(**doc)
    except Exception:
        # BEST-EFFORT CLEANUP, WHICH MUST NOT RAISE: `doc` MAY STILL BE THE RAW
        # ARGUMENT (IF wrap() FAILED), AND Column(**doc) MAY FAIL AGAIN FOR THE
        # SAME REASON IT FAILED ABOVE
        try:
            doc.nested_path = ["."]
            mark_as_deleted(Column(**doc))
        except Exception as cause:
            Log.warning("can not mark column as deleted", cause=cause)
        return None
Ejemplo n.º 6
0
    def __init__(self, var, type=None, multi=None):
        """
        Reference a property of a document.

        :param var:   DOT DELIMITED PATH INTO A DOCUMENT
        :param type:  JSON TYPE, IF KNOWN
        :param multi: NUMBER OF DISTINCT VALUES IN A SLOT
        """
        Expression.__init__(self, None)
        self.var = get_property_name(var)

        if type == None:
            # NO EXPLICIT TYPE: THE LAST STEP OF THE PATH DECIDES IT, BUT ONLY
            # WHEN inserter_type_to_json_type KNOWS THAT STEP
            inferred = inserter_type_to_json_type.get(last(split_field(var)))
            if inferred:
                self.data_type = inferred
        else:
            self.data_type = type

        # MORE THAN ONE VALUE PER SLOT MAKES THIS A "MANY" VARIABLE
        self._many = bool(multi and multi > 1)
def doc_to_column(doc):
    """
    Build a `Column` from a stored column document, patching known mistakes.

    Several of the repairs also stamp `last_updated` with the time captured
    when this call started.

    :param doc: description of one column; it is passed through `untyped()`
                and `to_data()` before the repairs are applied
    :return: the repaired `Column`, or `None` if anything raised (after a
             best-effort attempt to mark the column as deleted)
    """
    now = Date.now()
    try:
        doc = to_data(untyped(doc))

        # MISTAKES MADE IN THE PAST WHILE WRITING COLUMNS TO ES ARE PATCHED BELOW

        # PATCH: NO TIMESTAMP
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # PATCH: NO es_type (ONLY OBJECT COLUMNS CAN BE FILLED IN)
        es_type_missing = doc.es_type == None
        if es_type_missing and doc.jx_type == OBJECT:
            doc.es_type = "object"
        elif es_type_missing:
            Log.warning("{{doc}} has no es_type", doc=doc)

        # PATCH: MULTIPLICITY IS 1001 FOR NESTED, AND DEFAULTS TO 1
        if doc.es_type == "nested":
            doc.multi = 1001
        if doc.multi == None:
            doc.multi = 1

        # PATCH: COLUMN NAME ENDS WITH THE NESTED MARKER
        if doc.es_column.endswith("." + NESTED_TYPE):
            if doc.jx_type == OBJECT:
                doc.jx_type = NESTED
                doc.last_updated = now
            if doc.es_type == "nested":
                # NOTE(review): THIS REASSIGNS THE VALUE IT JUST TESTED FOR, SO
                # ITS ONLY EFFECT IS THE TIMESTAMP - WAS `!=` INTENDED?
                doc.es_type = "nested"
                doc.last_updated = now

        # PATCH: nested_path IS A TUPLE, AND A COLUMN WHOSE LAST STEP IS THE
        # NESTED MARKER IS DECLARED NESTED
        doc.nested_path = tuple(listwrap(doc.nested_path))
        final_step = last(split_field(doc.es_column))
        if final_step == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = now

        # PATCH: WHEN "." IS THE SECOND-TO-LAST ENTRY, DROP THE LAST ENTRY
        wanted_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]
            doc.last_updated = now

        # PATCH: nested_path MUST AGREE WITH THE PATH DERIVED FROM es_column
        column_is_untyped = untype_path(doc.es_column) == doc.es_column
        if column_is_untyped:
            # ONLY (".",) IS EXPECTED HERE; THE "repo" INDEX IS LEFT ALONE
            if doc.nested_path != (".", ) and doc.es_index not in {"repo"}:
                Log.note("not expected")
                doc.nested_path = wanted_path
                doc.last_updated = now
        elif doc.nested_path != wanted_path:
            doc.nested_path = wanted_path
            doc.last_updated = now

        # PATCH: EXISTS COLUMNS HAVE THE EXISTS TYPE AND A CARDINALITY
        if final_step == EXISTS_TYPE:
            if doc.jx_type != EXISTS:
                doc.jx_type = EXISTS
                doc.last_updated = now

            if doc.cardinality == None:
                doc.cardinality = 1
                doc.last_updated = now

        # PATCH: STRUCT COLUMNS HAVE CARDINALITY 0 OR 1
        if doc.jx_type in STRUCT and doc.cardinality not in [0, 1]:
            doc.cardinality = 1  # DO NOT KNOW IF EXISTS OR NOT
            doc.last_updated = now

        return Column(**doc)
    except Exception:
        # BEST EFFORT ONLY: A FAILURE TO MARK THE COLUMN IS IGNORED
        try:
            mark_as_deleted(Column(**doc), now)
        except Exception:
            pass
        return None
Ejemplo n.º 8
0
    def __init__(
            self,
            host,
            index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
            alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
            type=None,
            name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        """
        Open an Elasticsearch-backed container.

        In order: register this configuration as the default container
        configuration (when none is set), build the metadata namespace,
        resolve the name from the first of `alias`, `index`, `name`, open the
        index (or a read-only alias), decide typed mode, and - when `typed`
        is falsy - register one OBJECT column per query path.

        :param read_only: when true an `elasticsearch.Alias` is opened,
                          otherwise the index is fetched from the cluster
        :param typed: expected typed mode; `None` means use the detected mode,
                      any other value that disagrees with the detected mode is
                      reported through `Log.error`
        :param kwargs: stored as `self.settings` and forwarded to the
                       metadata namespace and the ES client
                       (NOTE(review): presumably carries every parameter
                       above, filled in by a decorator not visible here -
                       confirm)
        """
        Container.__init__(self)
        # THE FIRST CONTAINER CREATED SUPPLIES THE DEFAULT CONFIGURATION
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
        self.worker = None
        self.settings = kwargs
        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.name = name = self._namespace._find_alias(
            coalesce(alias, index, name))
        if read_only:
            self.es = elasticsearch.Alias(alias=name,
                                          index=None,
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._ensure_max_result_window_set(name)
        self.settings.type = self.es.settings.type
        self.stats = QueryStats(self.es.cluster)

        # TYPED MODE IS DETECTED BY A COLUMN WHOSE es_column IS EXISTS_TYPE
        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed

        # NOTE(review): THIS TESTS THE `typed` ARGUMENT, NOT self.typed, SO
        # WITH typed=None THE BLOCK RUNS EVEN WHEN TYPED MODE WAS DETECTED -
        # CONFIRM THAT IS INTENDED
        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                # FOLLOW THE PARENT LINKS: (v, parent, ..., '.')
                if v == '.':
                    return ('.', )
                return (v, ) + nested_path_of(all_paths[v])

            # EVERY DISTINCT STEP OF EVERY QUERY PATH, FEWEST FIELDS FIRST
            # (PRESUMABLY SO A PARENT IS REGISTERED BEFORE ITS CHILDREN)
            query_paths = sort_using_key(set(
                step for path in self.snowflake.query_paths for step in path),
                                         key=lambda p: len(split_field(p)))
            for step in query_paths:
                if step in all_paths:
                    continue
                else:
                    # PARENT = THE MOST SPECIFIC PATH ALREADY REGISTERED THAT
                    # startswith_field() ACCEPTS FOR step
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                nested_path = nested_path_of(p)
                try:
                    self.namespace.meta.columns.add(
                        Column(name=p,
                               es_column=p,
                               es_index=self.name,
                               es_type=OBJECT,
                               jx_type=OBJECT,
                               nested_path=nested_path,
                               multi=1001 if last(split_field(p))
                               == NESTED_TYPE else None,
                               last_updated=Date.now()))
                except Exception as e:
                    # RE-RAISED UNCHANGED
                    raise e
Ejemplo n.º 9
0
    def get_revision(self,
                     revision,
                     locale=None,
                     get_diff=False,
                     get_moves=True,
                     after=None):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision

        The ES cache is tried first; on a miss (or a hit without a push date)
        the revision is requested from hg, after a rate-limiting delay.

        :param revision: must have `changeset.id` and a `branch` (either a
                         name, or an object with a `name`)
        :param locale: defaults to `revision.branch.locale`, then
                       `DEFAULT_LOCALE`
        :param get_diff: when falsy, `changeset.diff` is cleared on the result
        :param get_moves: when falsy, `changeset.moves` is cleared on the
                          result
        :param after: forwarded to `_get_from_elasticsearch()`
        :return: the revision; or `Null` when the changeset id is missing or
                 is the text "None", the branch has no name, or the branch
                 can not be found
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision,
                                              locale=locale,
                                              get_diff=get_diff,
                                              get_moves=get_moves,
                                              after=after)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            DEBUG and Log.note(
                "Got hg ({{branch}}, {{locale}}, {{revision}}) from ES",
                branch=output.branch.name,
                locale=locale,
                revision=output.changeset.id,
            )
            # RECENT PUSHES GET THEIR PARENTS AND CHILDREN QUEUED ON self.todo
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents), None))
                self.todo.add((output.branch, listwrap(output.children), None))
            # A CACHED RECORD WITHOUT A PUSH DATE IS NOT RETURNED; FALL THROUGH TO HG
            if output.push.date:
                return output

        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        # THE NEXT CALL IS ALLOWED A RANDOM DELAY AFTER THE PREVIOUS MISS
        # (PRESUMABLY UP TO 2 * WAIT_AFTER_CACHE_MISS SECONDS - CONFIRM
        # AGAINST Random.float)
        next_cache_miss = self.last_cache_miss + (
            Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note(
                "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
                seconds=next_cache_miss - self.last_cache_miss,
            )
            Till(till=next_cache_miss.unix).wait()

        # CLEAN UP BRANCH NAME: branch MAY BE A NAME OR AN OBJECT WITH A name
        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}",
                      rev=found_revision.branch)

        # LOOK UP THE BRANCH FOR THIS LOCALE, THEN FOR THE DEFAULT LOCALE
        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name,
                                                       DEFAULT_LOCALE)]
            if not b:
                Log.warning(
                    "can not find branch ({{branch}}, {{locale}})",
                    branch=lower_name,
                    locale=locale,
                )
                return Null

        # REFRESH BRANCHES, IF TOO OLD
        if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        # FIND THE PUSH
        push = self._get_push(found_revision.branch,
                              found_revision.changeset.id)
        # ALL THREE URLS USE THE FIRST 12 CHARACTERS OF THE CHANGESET ID
        id12 = found_revision.changeset.id[0:12]

        url1 = found_revision.branch.url.rstrip(
            "/") + "/json-info?node=" + id12
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + id12
        url3 = (found_revision.branch.url.rstrip("/") +
                "/json-automationrelevance/" + id12)
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            automation_details = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
                automation_details = self._get_raw_json_rev(
                    url3, found_revision.branch)
            except Exception as e:
                # WHEN HG DENIES THE REVISION EXISTS, CONTINUE WITH A RECORD
                # THAT HOLDS ONLY THE NODE; ANY RESPONSE NOT YET RECEIVED
                # STAYS Null
                # NOTE(review): `in e` ASSUMES THE EXCEPTION SUPPORTS
                # CONTAINMENT TESTS - CONFIRM
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e

            # THE CHANGESET, AMONG THE AUTOMATION DETAILS, THAT MATCHES THIS REVISION
            raw_rev3_changeset = first(r for r in automation_details.changesets
                                       if r.node[:12] == id12)
            if last(automation_details.changesets) != raw_rev3_changeset:
                Log.note("interesting")

            # set_default() PRESUMABLY FILLS THE GAPS IN raw_rev1 FROM THE
            # LATER ARGUMENTS - CONFIRM
            output = self._normalize_revision(
                set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
                found_revision,
                push,
                get_diff,
                get_moves,
            )
            # RECENT PUSHES GET THEIR PARENTS, CHILDREN AND BACKED-OUT NODES QUEUED
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents), None))
                self.todo.add((output.branch, listwrap(output.children), None))
                self.todo.add((output.branch, listwrap(output.backsoutnodes),
                               output.push.date))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            return output
Ejemplo n.º 10
0
    def _get_from_hg(self,
                     revision,
                     locale=None,
                     get_diff=False,
                     get_moves=True):
        """
        Request a revision from hg (rate limited) and normalize it.

        :param revision: must have `changeset.id` and a `branch` (either a
                         name, or an object with a `name`)
        :param locale: used, together with the lower-cased branch name, to
                       look up the branch; `DEFAULT_LOCALE` is the fallback
        :param get_diff: when falsy, `changeset.diff` is cleared on the result
        :param get_moves: when falsy, `changeset.moves` is cleared on the
                          result
        :return: the normalized revision, or `Null` when the branch can not
                 be found
        """
        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        # THE NEXT CALL IS ALLOWED A RANDOM DELAY AFTER THE PREVIOUS MISS
        # (PRESUMABLY UP TO 2 * WAIT_AFTER_CACHE_MISS SECONDS - CONFIRM
        # AGAINST Random.float)
        next_cache_miss = self.last_cache_miss + (
            Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note(
                "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
                seconds=next_cache_miss - self.last_cache_miss,
            )
            Till(till=next_cache_miss.unix).wait()

        # CLEAN UP BRANCH NAME
        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}",
                      rev=found_revision.branch)

        # LOOK UP THE BRANCH FOR THIS LOCALE, THEN FOR THE DEFAULT LOCALE
        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name,
                                                       DEFAULT_LOCALE)]
            if not b:
                Log.warning(
                    "can not find branch ({{branch}}, {{locale}})",
                    branch=lower_name,
                    locale=locale,
                )
                return Null

        # REFRESH BRANCHES, IF TOO OLD
        if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        # FIND THE PUSH
        push = self._get_push(found_revision.branch,
                              found_revision.changeset.id)
        # ALL THREE REQUESTS USE THE FIRST 12 CHARACTERS OF THE CHANGESET ID
        id12 = found_revision.changeset.id[0:12]
        base_url = URL(found_revision.branch.url)

        with Explanation("get revision from {{url}}",
                         url=base_url,
                         debug=DEBUG):
            raw_rev2 = Null
            automation_details = Null
            try:
                raw_rev1 = self._get_raw_json_info((base_url / "json-info") +
                                                   {"node": id12})
                raw_rev2 = self._get_raw_json_rev(base_url / "json-rev" / id12)
                automation_details = self._get_raw_json_rev(
                    base_url / "json-automationrelevance" / id12)
            except Exception as e:
                # WHEN HG DENIES THE REVISION EXISTS, CONTINUE WITH A RECORD
                # THAT HOLDS ONLY THE NODE; ANY RESPONSE NOT YET RECEIVED
                # STAYS Null
                # NOTE(review): `in e` ASSUMES THE EXCEPTION SUPPORTS
                # CONTAINMENT TESTS - CONFIRM
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e

            # THE CHANGESET, AMONG THE AUTOMATION DETAILS, THAT MATCHES THIS REVISION
            raw_rev3_changeset = first(r for r in automation_details.changesets
                                       if r.node[:12] == id12)
            if last(automation_details.changesets) != raw_rev3_changeset:
                Log.note("interesting")

            # set_default() PRESUMABLY FILLS THE GAPS IN raw_rev1 FROM THE
            # LATER ARGUMENTS - CONFIRM
            output = self._normalize_revision(
                set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
                found_revision,
                push,
                get_diff,
                get_moves,
            )
            # RECENT PUSHES GET THEIR PARENTS, CHILDREN AND BACKED-OUT NODES QUEUED
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.extend([
                    (output.branch, listwrap(output.parents), None),
                    (output.branch, listwrap(output.children), None),
                    (
                        output.branch,
                        listwrap(output.backsoutnodes),
                        output.push.date,
                    ),
                ])

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None

        return output