def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        schema = json2value(value2json(SCHEMA), leaves=True)
        schema.mappings[type].properties["~N~"].type = "nested"
        self.es = Cluster(kwargs).get_or_create_index(
            schema=schema,
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example #2
def fix(source_key, rownum, line, source, sample_only_filter, sample_size):
    """
    :param source_key:
    :param rownum:
    :param line:
    :param source:
    :param sample_only_filter:
    :param sample_size:
    :return:  (row, no_more_data) TUPLE WHERE row IS {"value":<data structure>} OR {"json":<text line>}
    """
    value = json2value(line)

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(source_key, value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(int(1.0/coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(source_key, value, source)
        value = _fix(value)
    elif '"resource_usage":' in line:
        value = _fix(value)

    row = {"value": value}
    return row, False
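
A note on the sampling guard above: Random.int(n) is used like a uniform draw from [0, n), so with the default sample_size of 0.01 the != 0 branch fires about 99 times in 100, and only roughly 1% of matching sources fall through to full processing. A minimal standalone sketch of that one-in-N decision, using only the standard library (the helper name is illustrative, not part of the code above):

import random

def keep_source(sample_size=0.01):
    # True for roughly sample_size of all calls; mirrors the
    # Random.int(int(1.0 / sample_size)) == 0 case in fix() above.
    n = int(1.0 / sample_size)
    return random.randrange(n) == 0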
Example #3
    def output(*args):
        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {
                    k: v
                    for k, v in _cache.items() if v[0] == None or v[0] > now
                }
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(
                args, (Null, Null, Null, Null))
Example #4
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Emailer(self.settings) as emailer:
                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(d in params.params.error for d in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    emailer.send_email(from_address=self.settings.from_address,
                                       to_address=listwrap(to_address),
                                       subject=self.settings.subject,
                                       text_data="\n\n".join(content))

            self.accumulation = []
        except Exception as e:
            Log.warning("Could not send", e)
        finally:
            self.next_send = Date.now() + self.settings.average_interval * (
                2 * Random.float())
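
The finally clause above schedules the next send a uniformly random delay in [0, 2 * average_interval) from now, so repeated runs average out to average_interval while staying desynchronized across senders. A quick standard-library check of that expectation (a sketch, not the library's code):

import random

samples = [2 * random.random() for _ in range(100000)]
print(sum(samples) / len(samples))  # ~1.0, i.e. one average_interval on average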
Example #5
    def commit(self):
        with self.lock:
            if self.closed:
                Log.error("Queue is closed, commit not allowed")

            try:
                self._add_pending({"add": {"status.start": self.start}})
                for i in range(self.db.status.start, self.start):
                    self._add_pending({"remove": str(i)})

                if self.db.status.end - self.start < 10 or Random.range(
                        0, 1000) == 0:  # FORCE RE-WRITE TO LIMIT FILE SIZE
                    # SIMPLY RE-WRITE FILE
                    if DEBUG:
                        Log.note(
                            "Re-write {{num_keys}} keys to persistent queue",
                            num_keys=self.db.status.end - self.start)
                        for k in self.db.keys():
                            if k == "status" or int(k) >= self.db.status.start:
                                continue
                            Log.error("Not expecting {{key}}", key=k)
                    self._commit()
                    self.file.write(
                        mo_json.value2json({"add": self.db}) + "\n")
                else:
                    self._commit()
            except Exception as e:
                raise e
Example #8
    def _create_new_shard(self):
        primary_shard = self.container.create_table(
            table=self.short_name + "_" + "".join(Random.sample(ALLOWED, 20)),
            sharded=False,
            schema=self._flake.schema,
            kwargs=self.config,
        )
        self.shard = primary_shard.shard
Example #9
    def put(self, local, remote, use_sudo=False):
        if use_sudo:
            filename = "/tmp/" + Random.string(20, string.digits + 'ABCDEF')
            self.conn.put(File(local).abspath, filename)
            self.sudo("cp " + filename + " " + remote)
            self.sudo("rm " + filename)
        else:
            self.conn.put(File(local).abspath, remote)
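
The put() above stages the file under a random name in /tmp, then sudo-copies it into place and removes the staging copy, because the plain connection cannot write to privileged paths directly. A local-only sketch of the same staging idea using just the standard library (the function and paths are illustrative, not the API above):

import shutil
import subprocess
import uuid

def sudo_put(local, remote):
    # Stage under an unpredictable name, copy with elevated rights, clean up.
    staging = "/tmp/" + uuid.uuid4().hex
    shutil.copy(local, staging)
    subprocess.run(["sudo", "cp", staging, remote], check=True)
    subprocess.run(["sudo", "rm", staging], check=True)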
Example #10
def test_wrap_2():
    Log.alert("Random types")
    switch = [
        lambda: Random.int(20),
        lambda: Random.string(20),
        lambda: {"i": Random.int(2000)},
        lambda: Data(i=Random.int(2000)),
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}]
    ]

    inputs = [switch[min(len(switch) - 1, int(floor(-log(Random.float(), 2))))]() for i in range(NUM_INPUT)]

    for i in range(NUM_REPEAT):
        results = []
        gc.collect()
        with Timer("more string: to_data"):
            for v in inputs:
                results.append(to_data(v))

        results = []
        gc.collect()
        with Timer("more string: baseline"):
            for v in inputs:
                results.append(baseline(v))

        Log.note("Done {{i}} of {{num}}", i=i, num=NUM_REPEAT)
Example #11
    def output(*args, **kwargs):
        if kwargs:
            Log.error(
                "Sorry, caching only works with ordered parameter, not keyword arguments"
            )

        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {
                    k: v
                    for k, v in _cache.items()
                    if v.timeout == None or v.timeout > now
                }
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(
                args, (Null, Null, Null, Null))

        if now >= timeout:
            value = func(self, *args)
            with cache_store.locker:
                _cache[args] = CacheElement(now + cache_store.timeout, args,
                                            value, None)
            return value

        if value == None:
            if exception == None:
                try:
                    value = func(self, *args)
                    with cache_store.locker:
                        _cache[args] = CacheElement(now + cache_store.timeout,
                                                    args, value, None)
                    return value
                except Exception as e:
                    e = Except.wrap(e)
                    with cache_store.locker:
                        _cache[args] = CacheElement(now + cache_store.timeout,
                                                    args, None, e)
                    raise e
            else:
                raise exception
        else:
            return value
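
The wrapper above is the body of a memoizing decorator: results live in a per-object attribute, every entry carries a timeout, errors are cached as exceptions, and a 1-in-100 random draw evicts expired entries so no separate sweeper thread is needed. A self-contained sketch of the same pattern in plain Python, with thread-safety (the locker above) and error caching omitted; names like simple_cache and TIMEOUT are illustrative, not the library's API:

import random
import time

TIMEOUT = 60.0  # seconds each cached value stays fresh; illustrative

def simple_cache(func):
    def output(self, *args):
        store = getattr(self, "_cache", None)
        if store is None:
            store = {}
            setattr(self, "_cache", store)
        if random.randrange(100) == 0:
            # occasional sweep of expired entries, as in the code above
            now = time.time()
            store = {k: v for k, v in store.items() if v[0] > now}
            setattr(self, "_cache", store)
        hit = store.get(args)
        if hit is not None and hit[0] > time.time():
            return hit[1]
        value = func(self, *args)
        store[args] = (time.time() + TIMEOUT, value)
        return value
    return output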
Example #12
    def __enter__(self):
        if self.sample and Random.int(100) == 0:
            _Log.warning("acquire  lock {{name|quote}}", name=self.name)

        self.debug and _Log.note("acquire  lock {{name|quote}}",
                                 name=self.name)
        self.lock.acquire()
        self.debug and _Log.note("acquired lock {{name|quote}}",
                                 name=self.name)
        return self
Example #13
    def _daemon(self, please_stop):
        while not please_stop:
            with Explanation("looking for work"):
                try:
                    branch, revisions, after = self.todo.pop(till=please_stop)
                except Exception as e:
                    if please_stop:
                        break
                    else:
                        raise e
                if branch.name in DAEMON_DO_NO_SCAN:
                    continue
                revisions = set(revisions)

                # FIND THE REVISIONS ON THIS BRANCH
                for r in list(revisions):
                    try:
                        rev = self.get_revision(
                            Revision(branch=branch, changeset={"id": r}),
                            None,  # local
                            False,  # get_diff
                            True,  # get_moves
                        )
                        if after and after > rev.etl.timestamp:
                            rev = self._get_from_hg(revision=rev)

                        if DAEMON_DEBUG:
                            Log.note(
                                "found revision with push date {{date|datetime}}",
                                date=rev.push.date,
                            )
                        revisions.discard(r)

                        if rev.etl.timestamp > Date.now() - (
                                DAEMON_RECENT_HG_PULL * SECOND):
                            # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                            # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                            # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                            # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                            Till(seconds=Random.float(DAEMON_HG_INTERVAL * 2)).wait()

                    except Exception as e:
                        Log.warning(
                            "Scanning {{branch}} {{revision|left(12)}}",
                            branch=branch.name,
                            revision=r,
                            cause=e,
                        )
                        if "Read timed out" in e:
                            Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()

                # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
                for r in list(revisions):
                    self._find_revision(r)
Example #14
    def put(self, local, remote, use_sudo=False):
        if self.conn.command_cwds and not remote.startswith(("/", "~")):
            remote = self.conn.command_cwds[-1].rstrip("/'") + "/" + remote

        if use_sudo:
            filename = "/tmp/" + Random.hex(20)
            self.conn.put(File(local).abspath, filename)
            self.sudo("cp " + filename + " " + remote)
            self.sudo("rm " + filename)
        else:
            self.conn.put(File(local).abspath, remote)
Example #15
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep,
                                               MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        rollover_interval = coalesce(kwargs.rollover.interval,
                                     kwargs.rollover.max, "year")
        rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval,
                                "year")

        schema = set_default(
            kwargs.schema,
            {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}},
            json2value(value2json(SCHEMA), leaves=True),
        )

        self.es = RolloverIndex(
            rollover_field={"get": [{
                "first": "."
            }, {
                "literal": "timestamp"
            }]},
            rollover_interval=rollover_interval,
            rollover_max=rollover_max,
            schema=schema,
            limit_replicas=True,
            typed=True,
            read_only=False,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example #16
def fix(rownum, line, source, sample_only_filter, sample_size):
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}',
                        '{"code": "bb"}').replace('{"id": "tc"}',
                                                  '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                suite = mo_json.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if source.name.startswith("active-data-codecoverage"):
        d = convert.json2value(line)
        if d.source.file.total_covered > 0:
            return {"id": d._id, "json": line}, False
        else:
            return None, False

    if rownum == 0:
        value = mo_json.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        value = mo_json.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = mo_json.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
Example #17
    def get(self, remote, local, use_sudo=False):
        if self.conn.command_cwds and not remote.startswith(("/", "~")):
            remote = self.conn.command_cwds[-1].rstrip("/'") + "/" + remote

        if use_sudo:
            filename = "/tmp/" + Random.filename()
            self.sudo("cp " + remote + " " + filename)
            self.sudo("chmod a+r " + filename)
            self.conn.get(filename, File(local).abspath)
            self.sudo("rm " + filename)
        else:
            self.conn.get(remote, File(local).abspath)
Example #18
    def _get_from_elasticsearch(self, revision, locale=None, get_diff=False, get_moves=True):
        rev = revision.changeset.id
        if self.es.cluster.version.startswith("1.7."):
            query = {
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"term": {"changeset.id12": rev[0:12]}},
                        {"term": {"branch.name": revision.branch.name}},
                        {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                        {"range": {"etl.timestamp": {"gt": MIN_ETL_AGE}}}
                    ]}
                }},
                "size": 20
            }
        else:
            query = {
                "query": {"bool": {"must": [
                    {"term": {"changeset.id12": rev[0:12]}},
                    {"term": {"branch.name": revision.branch.name}},
                    {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                    {"range": {"etl.timestamp": {"gt": MIN_ETL_AGE}}}
                ]}},
                "size": 20
            }

        for attempt in range(3):
            try:
                with self.es_locker:
                    docs = self.es.search(query).hits.hits
                if len(docs) == 0:
                    return None
                best = docs[0]._source
                if len(docs) > 1:
                    for d in docs:
                        if d._id.endswith(d._source.branch.locale):
                            best = d._source
                    Log.warning("expecting no more than one document")
                return best
            except Exception as e:
                e = Except.wrap(e)
                if "EsRejectedExecutionException[rejected execution (queue capacity" in e:
                    (Till(seconds=Random.int(30))).wait()
                    continue
                else:
                    Log.warning("Bad ES call, waiting for {{num}} seconds", num=WAIT_AFTER_NODE_FAILURE, cause=e)
                    Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
                    continue

        Log.warning("ES did not deliver, fall back to HG")
        return None
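
The loop above makes three attempts: queue-capacity rejections get a short random sleep so stampeding clients spread out, other failures wait a fixed WAIT_AFTER_NODE_FAILURE, and exhaustion falls back to HG. The same retry-with-jitter shape as a standalone sketch (names are illustrative):

import random
import time

def retry_with_jitter(fetch, attempts=3, max_sleep=30):
    for _ in range(attempts):
        try:
            return fetch()
        except Exception:
            # a random sleep desynchronizes concurrent retries
            time.sleep(random.randint(0, max_sleep))
    return None  # caller falls back to another source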
Example #19
    def output(*args):
        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {
                    k: v
                    for k, v in _cache.items() if v[0] == None or v[0] > now
                }
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(
                args, (Null, Null, Null, Null))

        if now >= timeout:
            value = func(self, *args)
            with cache_store.locker:
                _cache[args] = (now + cache_store.timeout, args, value, None)
            return value

        if value == None:
            if exception == None:
                try:
                    value = func(self, *args)
                    with cache_store.locker:
                        _cache[args] = (now + cache_store.timeout, args, value,
                                        None)
                    return value
                except Exception as e:
                    e = Except.wrap(e)
                    with cache_store.locker:
                        _cache[args] = (now + cache_store.timeout, args, None,
                                        e)
                    raise e
            else:
                raise exception
        else:
            return value
Example #20
def test_wrap_2():
    switch = [
        lambda: {"i": Random.int(2000)},
        lambda: Data(i=Random.int(2000)),
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}],
    ]

    inputs = [
        switch[min(len(switch) - 1, int(floor(-log(Random.float(), 2))))]()
        for i in range(NUM_INPUT)
    ]

    for i in range(NUM_REPEAT):
        results = []
        gc.collect()
        with Profiler("more dict: slow_wrap"):
            for v in inputs:
                results.append(slow_wrap(v))

        results = []
        gc.collect()
        with Profiler("more dict: wrap"):
            for v in inputs:
                results.append(wrap(v))

        results = []
        gc.collect()
        with Profiler("more dict: baseline"):
            for v in inputs:
                results.append(baseline(v))

        Log.note("Done {{i}} of {{num}}", {"i": i, "num": NUM_REPEAT})
Example #21
def fix(rownum, line, source, sample_only_filter, sample_size):
    value = json2value(line)

    if value._id.startswith(("tc.97", "96", "bb.27")):
        # AUG 24, 25 2017 - included full diff with repo; too big to index
        try:
            data = json2value(line)
            repo = data.repo
            repo.etl = None
            repo.branch.last_used = None
            repo.branch.description = None
            repo.branch.etl = None
            repo.branch.parent_name = None
            repo.children = None
            repo.parents = None
            if repo.changeset.diff or data.build.repo.changeset.diff:
                Log.error("no diff allowed")
            else:
                assertAlmostEqual(minimize_repo(repo), repo)
        except Exception as e:
            if CAN_NOT_DECODE_JSON in e:
                raise e
            data.repo = minimize_repo(repo)
            data.build.repo = minimize_repo(data.build.repo)
            line = value2json(data)
    else:
        pass

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(value, source)
        value = _fix(value)
    elif line.find('"resource_usage":') != -1:
        value = _fix(value)

    row = {"value": value}
    return row, False
Example #22
    def test_id_vs_id(self):

        ops = [Op() for _ in range(200)]
        lang1 = {id(o): o for o in ops}

        sample = Random.sample(ops, 1000 * 1000)
        with Timer("using id()"):
            result1 = [lang1[id(o)] for o in sample]

        lang2 = [None] * (max(o.id for o in ops) + 1)
        for o in ops:
            lang2[o.id] = o
        # lang2 = tuple(lang2)

        with Timer("using o.id"):
            result2 = [lang2[o.id] for o in sample]
Example #23
def encrypt(text, _key, salt=None):
    """
    RETURN {"salt":s, "length":l, "data":d} -> JSON -> UTF8
    """

    if is_text(text):
        encoding = 'utf8'
        data = bytearray(text.encode("utf8"))
    elif is_binary(text):
        encoding = None
        if PY2:
            data = bytearray(text)
        else:
            data = text

    if _key is None:
        Log.error("Expecting a key")
    if is_binary(_key):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)
    output.encoding = encoding

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output, pretty=True).encode('utf8')

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
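
A hedged round-trip sketch for the function above, assuming decrypt is the inverse already used in its own DEBUG check, and that the 256-bit key expander expects a 32-byte key:

key = Random.bytes(32)                 # 32 bytes for AES-256, per the expander above
token = encrypt(u"hello world", key)   # UTF8 JSON: {"salt": ..., "length": ..., "data": ...}
assert decrypt(token, key) == u"hello world"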
Example #25
def encrypt(text, _key, salt=None):
    """
    RETURN {"salt":s, "length":l, "data":d} -> JSON -> UTF8
    """

    if isinstance(text, text_type):
        encoding = 'utf8'
        data = bytearray(text.encode("utf8"))
    elif isinstance(text, binary_type):
        encoding = None
        if PY2:
            data = bytearray(text)
        else:
            data = text

    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, binary_type):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)
    output.encoding = encoding

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output, pretty=True).encode('utf8')

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
Example #26
    def _daemon(self, please_stop):
        while not please_stop:
            with Explanation("looking for work"):
                try:
                    branch, revisions = self.todo.pop(till=please_stop)
                except Exception as e:
                    if please_stop:
                        break
                    else:
                        raise e
                if branch.name in DAEMON_DO_NO_SCAN:
                    continue
                revisions = set(revisions)

                # FIND THE REVISIONS ON THIS BRANCH
                for r in list(revisions):
                    try:
                        rev = self.get_revision(Revision(branch=branch, changeset={"id": r}))
                        if DAEMON_DEBUG:
                            Log.note("found revision with push date {{date|datetime}}", date=rev.push.date)
                        revisions.discard(r)

                        if rev.etl.timestamp > Date.now() - (DAEMON_RECENT_HG_PULL * SECOND):
                            # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                            # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                            # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                            # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                            Till(seconds=Random.float(DAEMON_HG_INTERVAL*2)).wait()

                    except Exception as e:
                        Log.warning(
                            "Scanning {{branch}} {{revision|left(12)}}",
                            branch=branch.name,
                            revision=r,
                            cause=e
                        )
                        if "Read timed out" in e:
                            Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()


                # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
                for r in list(revisions):
                    self._find_revision(r)
Example #29
    def device_register(self, path=None):
        """
        EXPECTING A SIGNED REGISTRATION REQUEST
        RETURN JSON WITH url FOR LOGIN
        """
        now = Date.now().unix
        request_body = request.get_data().strip()
        signed = json2value(request_body.decode("utf8"))
        command = json2value(base642bytes(signed.data).decode("utf8"))
        session.public_key = command.public_key
        rsa_crypto.verify(signed, session.public_key)

        self.session_manager.setup_session(session)
        session.expires = now + parse("10minute").seconds
        session.state = bytes2base64URL(Random.bytes(32))

        with self.device.db.transaction() as t:
            t.execute(
                sql_insert(
                    self.device.table,
                    {
                        "state": session.state,
                        "session_id": session.session_id
                    },
                ))
        response = value2json(
            Data(
                session_id=session.session_id,
                interval="5second",
                expiry=session.expires,
                url=URL(
                    self.device.home,
                    path=self.device.endpoints.login,
                    query={"state": session.state},
                ),
            ))

        return Response(response,
                        headers={"Content-Type": "application/json"},
                        status=200)
Example #30
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA   {"salt":s, "length":l, "data":d}
    """
    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
Example #31
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA   {"salt":s, "length":l, "data":d}
    """
    if not isinstance(text, text_type):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
Example #32
def es_bulksetop(esq, frum, query):
    abs_limit = MIN([query.limit, MAX_DOCUMENTS])
    guid = Random.base64(32, extra="-_")

    schema = query.frum.schema
    query_path = schema.query_path[0]
    new_select, split_select = get_selects(query)
    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = MIN([query.chunk_size, MAX_CHUNK_SIZE])
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)
    if not es_query.sort:
        es_query.sort = ["_doc"]

    formatter = formatters[query.format](abs_limit, new_select, query)

    Thread.run(
        "Download " + guid,
        extractor,
        guid,
        abs_limit,
        esq,
        es_query,
        formatter,
        parent_thread=Null,
    ).release()

    output = wrap({
        "url": URL_PREFIX / (guid + ".json"),
        "status": URL_PREFIX / (guid + ".status.json"),
        "meta": {
            "format": query.format,
            "es_query": es_query,
            "limit": abs_limit
        },
    })
    return output
Example #33
    def test_compare_isinstance_to_class_checks(self):
        num = 1 * 1000 * 1000
        options = {
            0: lambda: {},
            1: lambda: Data(),
            2: lambda: Null,
            3: lambda: 6,
            4: lambda: "string",
        }
        data = [options[Random.int(len(options))]() for _ in range(num)]

        with Timer("isinstance check") as i_time:
            i_result = [isinstance(d, Mapping) for d in data]

        with Timer("set check") as s_time:
            s_result = [d.__class__ in MAPPING_TYPES for d in data]

        with Timer("eq check") as e_time:
            e_result = [
                d.__class__ is Data or d.__class__ is dict for d in data
            ]

        with Timer("name check") as n_time:
            n_result = [
                is_instance(d, Data) or is_instance(d, dict) for d in data
            ]

        with Timer("check w method") as m_time:
            m_result = [is_mapping(d) for d in data]

        self.assertEqual(s_result, i_result)
        self.assertEqual(m_result, i_result)
        self.assertEqual(e_result, i_result)
        self.assertEqual(n_result, i_result)

        self.assertGreater(i_time.duration, s_time.duration)
        self.assertGreater(m_time.duration, s_time.duration)
Example #35
def complex_job(
    transactional_db, generic_reference_data, test_repository, extract_job_settings, now
):
    fc = FailureClassification.objects.create(id=1, name="not classified")
    repository_group = RepositoryGroup.objects.create(name="common")
    repo = Repository.objects.create(name="autoland", repository_group=repository_group)

    push = Push.objects.create(
        **{
            "author": "*****@*****.**",
            "repository": repo,
            "revision": "ae6bb3a1066959a8c43d003a3caab0af769455bf",
            "time": unix2datetime(1578427105).replace(tzinfo=None),
        }
    )

    Commit.objects.create(
        push=push,
        revision="ae6bb3a1066959a8c43d003a3caab0af769455bf",
        author="*****@*****.**",
        comments="no comment",
    )
    Commit.objects.create(
        push=push,
        revision="0123456789012345678901234567890123456789",
        author="*****@*****.**",
        comments="no comment2",
    )

    debug = Option.objects.create(name="debug")
    oc = OptionCollection.objects.create(option_collection_hash=Random.base64(5), option=debug)

    job = Job.objects.create(
        autoclassify_status=1,
        guid=Random.base64(20),
        repository=test_repository,
        push_id=push.id,
        signature=generic_reference_data.signature,
        build_platform=generic_reference_data.build_platform,
        machine_platform=generic_reference_data.machine_platform,
        machine=generic_reference_data.machine,
        option_collection_hash=oc.option_collection_hash,
        job_type=generic_reference_data.job_type,
        job_group=generic_reference_data.job_group,
        product=generic_reference_data.product,
        failure_classification_id=fc.id,
        who="*****@*****.**",
        reason="scheduled",
        result="success",
        state="completed",
        submit_time=unix2datetime(1578427253).replace(tzinfo=None),
        start_time=unix2datetime(1578430841).replace(tzinfo=None),
        last_modified=unix2datetime(1578432686.364459).replace(tzinfo=None),
        end_time=unix2datetime(1578432680).replace(tzinfo=None),
        tier=1,
    )

    text_log_step = TextLogStep.objects.create(
        job=job,
        **{
            "finished_line_number": 88739,
            "name": "Unnamed step",
            "result": 7,
            "started_line_number": 0,
        },
    )

    TextLogError.objects.create(
        step=text_log_step, line="line contents here", line_number=619845839
    )
    TextLogError.objects.create(step=text_log_step, line="ERROR! more line contents", line_number=6)

    TaskclusterMetadata.objects.create(job=job, retry_id=0, task_id="WWb9ExAvQUa78ku0DIxdSQ")

    JobLog.objects.create(
        **{
            "job_id": job.id,
            "name": "builds-4h",
            "status": 1,
            "url": "https://example.com/api/queue/v1/task/WWb9ExAvQUa78ku0DIxdSQ/runs/0/artifacts/public/logs/live_backing.log",
        }
    )
    job_logs1 = JobLog.objects.create(
        **{
            "job_id": job.id,
            "name": "errorsummary_json",
            "status": 1,
            "url": "https://example.com/api/queue/v1/task/WWb9ExAvQUa78ku0DIxdSQ/runs/0/artifacts/public/test_info/wpt_errorsummary.log",
        }
    )

    bcf = ClassifiedFailure.objects.create(**{"bug_number": 1234567,})
    bcf.created = Date("2020-01-17 12:00:00").datetime
    bcf.save()

    FailureLine.objects.create(
        job_log=job_logs1,
        **{
            "action": "test_groups",
            "best_classification": bcf,
            "best_is_verified": True,
            "repository": repo,
            "job_guid": job.guid,
            "line": 15,
            "modified": 0,
            "stackwalk_stderr": 1578432686,
            "stackwalk_stdout": 1578432686,
        },
    )
    FailureLine.objects.create(
        job_log=job_logs1,
        **{
            "action": "crash",
            "best_classification": bcf,
            "best_is_verified": False,
            "repository": repo,
            "job_guid": job.guid,
            "line": 24031,
            "modified": 0,
            "signature": "@ mozilla::dom::CustomElementData::SetCustomElementDefinition(mozilla::dom::CustomElementDefinition*)",
            "stackwalk_stderr": 1578432686,
            "stackwalk_stdout": 1578432686,
            "test": "/custom-elements/upgrading.html",
        },
    )

    return job
Example #36
def unique_name():
    return Random.string(20)
Example #37
    def _scan_database(self):
        # GET ALL RELATIONS
        raw_relations = self.db.query(
            """
            SELECT
                table_schema,
                table_name,
                referenced_table_schema,
                referenced_table_name,
                referenced_column_name,
                constraint_name,
                column_name,
                ordinal_position
            FROM
                information_schema.key_column_usage
            WHERE
                referenced_column_name IS NOT NULL
            """,
            param=self.settings.database,
        )

        if not raw_relations:
            Log.error("No relations in the database")

        for r in self.settings.add_relations:
            try:
                lhs, rhs = map(strings.trim, r.split("->"))
                lhs = lhs.split(".")
                if len(lhs) == 2:
                    lhs = [self.settings.database.schema] + lhs
                rhs = rhs.split(".")
                if len(rhs) == 2:
                    rhs = [self.settings.database.schema] + rhs
                to_add = Data(
                    ordinal_position=1,  # CAN ONLY HANDLE 1-COLUMN RELATIONS
                    table_schema=lhs[0],
                    table_name=lhs[1],
                    column_name=lhs[2],
                    referenced_table_schema=rhs[0],
                    referenced_table_name=rhs[1],
                    referenced_column_name=rhs[2],
                )

                # CHECK IF EXISTING
                if jx.filter(raw_relations, {"eq": to_add}):
                    Log.note("Relation {{relation}} already exists",
                             relation=r)
                    continue

                to_add.constraint_name = Random.hex(20)
                raw_relations.append(to_add)
            except Exception as e:
                Log.error("Could not parse {{line|quote}}", line=r, cause=e)

        relations = jx.select(
            raw_relations,
            [
                {"name": "constraint.name", "value": "constraint_name"},
                {"name": "table.schema", "value": "table_schema"},
                {"name": "table.name", "value": "table_name"},
                {"name": "column.name", "value": "column_name"},
                {"name": "referenced.table.schema", "value": "referenced_table_schema"},
                {"name": "referenced.table.name", "value": "referenced_table_name"},
                {"name": "referenced.column.name", "value": "referenced_column_name"},
                {"name": "ordinal_position", "value": "ordinal_position"},
            ],
        )

        # GET ALL TABLES
        raw_tables = self.db.query("""
            SELECT
                t.table_schema,
                t.table_name,
                c.constraint_name,
                c.constraint_type,
                k.column_name,
                k.ordinal_position
            FROM
                information_schema.tables t
            LEFT JOIN
                information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
            LEFT JOIN
                information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
            ORDER BY
                t.table_schema,
                t.table_name,
                c.constraint_name,
                k.ordinal_position,
                k.column_name
            """)

        # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
        tables = UniqueIndex(keys=["name", "schema"])
        for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
            c = wrap(list(c))
            best_index = Null
            is_referenced = False
            is_primary = False
            for g, w in jx.groupby(c, "constraint_name"):
                if not g.constraint_name:
                    continue
                w = list(w)
                ref = False
                for r in relations:
                    if (r.table.name == t.table_name
                            and r.table.schema == t.table_schema
                            and r.constraint.name == g.constraint_name):
                        ref = True
                is_prime = w[0].constraint_type == "PRIMARY"

                reasons_this_one_is_better = [
                    best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                    is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                    is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                    is_primary == is_prime and ref == is_referenced and len(w) < len(best_index),  # THE SHORTER THE TUPLE, THE BETTER
                ]
                if any(reasons_this_one_is_better):
                    is_primary = is_prime
                    is_referenced = ref
                    best_index = w

            tables.add({
                "name": t.table_name,
                "schema": t.table_schema,
                "id": [b.column_name for b in best_index],
            })

        fact_table = tables[self.settings.fact_table,
                            self.settings.database.schema]
        ids_table = {
            "alias": "t0",
            "name": "__ids__",
            "schema": fact_table.schema,
            "id": fact_table.id,
        }
        relations.extend(
            wrap({
                "constraint": {"name": "__link_ids_to_fact_table__"},
                "table": ids_table,
                "column": {"name": c},
                "referenced": {"table": fact_table, "column": {"name": c}},
                "ordinal_position": i,
            })
            for i, c in enumerate(fact_table.id)
        )
        tables.add(ids_table)

        # GET ALL COLUMNS
        raw_columns = self.db.query("""
            SELECT
                column_name,
                table_schema,
                table_name,
                ordinal_position,
                data_type
            FROM
                information_schema.columns
            """)

        reference_only_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 2
        ]
        reference_all_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 1
        ]
        foreign_column_table_schema_triples = {
            (r.column.name, r.table.name, r.table.schema)
            for r in relations
        }
        referenced_column_table_schema_triples = {
            (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
            for r in relations
        }
        related_column_table_schema_triples = (
            foreign_column_table_schema_triples | referenced_column_table_schema_triples
        )

        columns = UniqueIndex(["column.name", "table.name", "table.schema"])
        for c in raw_columns:
            if c.table_name in reference_only_tables:
                if c.table_name + "." + c.column_name in self.settings.reference_only:
                    include = True
                    reference = True
                    foreign = False
                elif c.column_name in tables[(c.table_name,
                                              c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = False
                    foreign = False
                else:
                    include = False
                    reference = False
                    foreign = False
            elif c.table_name in reference_all_tables:
                # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
                if c.column_name in tables[(c.table_name, c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = True
                    foreign = False
                elif (
                        c.column_name,
                        c.table_name,
                        c.table_schema,
                ) in foreign_column_table_schema_triples:
                    include = False
                    reference = False
                    foreign = True
                else:
                    include = True
                    reference = False
                    foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            elif (
                    c.column_name,
                    c.table_name,
                    c.table_schema,
            ) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            elif (
                    c.column_name,
                    c.table_name,
                    c.table_schema,
            ) in referenced_column_table_schema_triples:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = True
                reference = False
                foreign = False

            rel = {
                "column": {
                    "name": c.column_name,
                    "type": c.data_type
                },
                "table": {
                    "name": c.table_name,
                    "schema": c.table_schema
                },
                "ordinal_position": c.ordinal_position,
                "is_id": c.column_name
                in tables[(c.table_name, c.table_schema)].id,
                "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
                "reference":
                reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
                "foreign":
                foreign,  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
            }
            columns.add(rel)

        # ITERATE OVER ALL PATHS
        todo = FlatList()
        output_columns = FlatList()
        nested_path_to_join = {}
        all_nested_paths = [["."]]

        def follow_paths(position, path, nested_path, done_relations,
                         no_nested_docs):
            if position.name in self.settings.exclude:
                return

            if self.path_not_allowed(path):
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW AN ERROR IF IT FAILS)
                self.db.query(
                    ConcatSQL(
                        SQL_SELECT,
                        SQL_STAR,
                        SQL_FROM,
                        quote_column(position.schema, position.name),
                        SQL_LIMIT,
                        SQL_ONE,
                    ))

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return
            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            ###############################################################################
            # INNER OBJECTS
            ###############################################################################
            referenced_tables = list(
                sort_using_key(
                    jx.groupby(
                        jx.filter(
                            relations,
                            {
                                "eq": {
                                    "table.name": position.name,
                                    "table.schema": position.schema,
                                }
                            },
                        ),
                        "constraint.name",
                    ),
                    key=lambda p: first(p[1]).column.name,
                ))
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns
                       if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)

                alias = "t" + text(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path,
                })

                # HANDLE THE COMMON *id SUFFIX
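                # e.g. COLUMN "owner_id" REFERENCING TABLE "owner" YIELDS STEP "owner"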
                name = []
                for cname, tname in zip(
                        constraint_columns.column.name,
                        constraint_columns.referenced.table.name,
                ):
                    if cname.startswith(tname):
                        name.append(tname)
                    elif cname.endswith("_id"):
                        name.append(cname[:-3])
                    else:
                        name.append(cname)

                relation_string = many_to_one_string(constraint_columns[0])
                step = "/".join(name)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                if referenced_column_path in reference_only_tables:
                    continue

                col_pointer_name = relative_field(referenced_column_path,
                                                  nested_path[0])
                for col in columns:
                    if (col.table.name
                            == constraint_columns[0].referenced.table.name
                            and col.table.schema
                            == constraint_columns[0].referenced.table.schema):
                        col_full_name = concat_field(
                            col_pointer_name, literal_field(col.column.name))

                        if (col.is_id and len(nested_path) == 1
                                and col.table.name == fact_table.name
                                and col.table.schema == fact_table.schema):
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                True,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name,
                            })
                        elif col.column.name == constraint_columns[
                                0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_pointer_name
                                if not self.settings.show_foreign_keys else
                                col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name,
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(
                    Data(
                        position=copy(constraint_columns[0].referenced.table),
                        path=referenced_column_path,
                        nested_path=nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs,
                    ))
            ###############################################################################
            # NESTED OBJECTS
            ###############################################################################
            if not no_nested_docs:
                nesting_tables = list(
                    sort_using_key(
                        jx.groupby(
                            jx.filter(
                                relations,
                                {
                                    "eq": {
                                        "referenced.table.name": position.name,
                                        "referenced.table.schema":
                                        position.schema,
                                    }
                                },
                            ),
                            "constraint.name",
                        ),
                        key=lambda p: (first(p[1]).table.name,
                                       first(p[1]).column.name),
                    ))

                for g, constraint_columns in nesting_tables:
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    relation_string = one_to_many_string(constraint_columns[0])
                    step = "/".join(many_table)
                    if len(constraint_columns) == 1:
                        step = self.name_relations.get(relation_string, step)

                    referenced_column_path = concat_field(path, step)
                    if self.path_not_allowed(referenced_column_path):
                        continue

                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    if referenced_column_path in nested_path_to_join:
                        Log.error(
                            "{{path}} already exists, try adding entry to name_relations",
                            path=referenced_column_path,
                        )
                    one_to_many_joins = nested_path_to_join[
                        referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(
                        set_default(
                            {},
                            g,
                            {
                                "children": True,
                                "join_columns": constraint_columns,
                                "path": path,
                                "nested_path": nested_path,
                            },
                        ))
                    for col in columns:
                        if (col.table.name == constraint_columns[0].table.name
                                and col.table.schema
                                == constraint_columns[0].table.schema):
                            col_full_name = join_field(
                                split_field(referenced_column_path)
                                [len(split_field(new_nested_path[0])):] +
                                [literal_field(col.column.name)])

                            if col.column.name == constraint_columns[
                                    0].column.name:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None,
                                })
                            elif col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None,
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if col.include else None,
                                })

                    todo.append(
                        Data(
                            position=constraint_columns[0].table,
                            path=referenced_column_path,
                            nested_path=new_nested_path,
                            done_relations=copy(done_relations),
                            no_nested_docs=no_nested_docs,
                        ))

        path = "."
        nested_path = [path]
        nested_path_to_join["."] = [{
            "path":
            path,
            "join_columns": [{
                "referenced": {
                    "table": ids_table
                }
            }],
            "nested_path":
            nested_path,
        }]

        todo.append(
            Data(
                position=ids_table,
                path=path,
                nested_path=nested_path,
                done_relations=set(),
                no_nested_docs=False,
            ))

        while todo:
            item = todo.pop(0)
            follow_paths(**item)

        self.all_nested_paths = all_nested_paths
        self.nested_path_to_join = nested_path_to_join
        self.columns = output_columns
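
The `todo` list plus `follow_paths` above is a plain worklist traversal: each table position is popped, its many-to-one and one-to-many relations are expanded, and every newly reached position is pushed back together with a copy of the constraints already followed, so revisiting the same constraint stops a cycle. A minimal standalone sketch of that pattern, with a hypothetical `get_relations` standing in for the information_schema lookups:

from collections import deque

def walk_relations(root, get_relations):
    # get_relations(table) -> iterable of (constraint_name, child_table); hypothetical stand-in
    todo = deque([(root, ".", frozenset())])  # (position, path, constraints already followed)
    paths = []
    while todo:
        position, path, done = todo.popleft()
        paths.append(path)
        for constraint, child in get_relations(position):
            if constraint in done:
                # FOLLOWING THE SAME CONSTRAINT TWICE WOULD LOOP FOREVER
                continue
            todo.append((child, concat(path, child), done | {constraint}))
    return paths

def concat(path, step):
    # LOOSELY MIRRORS concat_field; ILLUSTRATIVE ONLY
    return step if path == "." else path + "." + step
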
Example #38
    def _get_from_elasticsearch(
            self,
            revision,
            locale=None,
            get_diff=False,
            get_moves=True,
            after=None,  # RETURN RECORDS ETLed AFTER GIVEN TIME
    ):
        """
        MAKE CALL TO ES
        """
        rev = revision.changeset.id
        if self.repo.cluster.version.startswith("1.7."):
            query = {
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": {
                            "and": [
                                {
                                    "term": {
                                        "changeset.id12": rev[0:12]
                                    }
                                },
                                {
                                    "term": {
                                        "branch.name": revision.branch.name
                                    }
                                },
                                {
                                    "term": {
                                        "branch.locale":
                                        coalesce(
                                            locale,
                                            revision.branch.locale,
                                            DEFAULT_LOCALE,
                                        )
                                    }
                                },
                                {
                                    "range": {
                                        "etl.timestamp": {
                                            "gt": Date.max(after, MIN_ETL_AGE)
                                        }
                                    }
                                },
                            ]
                        },
                    }
                },
                "size": 20,
            }
        else:
            query = {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "term": {
                                    "changeset.id12": rev[0:12]
                                }
                            },
                            {
                                "term": {
                                    "branch.name": revision.branch.name
                                }
                            },
                            {
                                "term": {
                                    "branch.locale":
                                    coalesce(locale, revision.branch.locale,
                                             DEFAULT_LOCALE)
                                }
                            },
                            {
                                "range": {
                                    "etl.timestamp": {
                                        "gt": Date.max(after, MIN_ETL_AGE)
                                    }
                                }
                            },
                        ]
                    }
                },
                "size": 20,
            }

        for attempt in range(3):
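            # QUEUE-CAPACITY REJECTIONS GET A SHORT RANDOM BACKOFF; OTHER FAILURES WAIT WAIT_AFTER_NODE_FAILURE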
            try:
                if get_moves:
                    with self.moves_locker:
                        docs = self.moves.search(query).hits.hits
                else:
                    with self.repo_locker:
                        docs = self.repo.search(query).hits.hits
                if len(docs) == 0:
                    return None
                best = docs[0]._source
                if len(docs) > 1:
                    for d in docs:
                        if d._id.endswith(d._source.branch.locale):
                            best = d._source
                    Log.warning("expecting no more than one document")
                return best
            except Exception as e:
                e = Except.wrap(e)
                if ("EsRejectedExecutionException[rejected execution (queue capacity"
                        in e):
                    (Till(seconds=Random.int(30))).wait()
                    continue
                else:
                    Log.warning(
                        "Bad ES call, waiting for {{num}} seconds",
                        num=WAIT_AFTER_NODE_FAILURE,
                        cause=e,
                    )
                    Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
                    continue

        Log.warning("ES did not deliver, fall back to HG")
        return None
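
The only difference between the two branches above is the query envelope: Elasticsearch 1.x expects the term and range clauses under `query.filtered.filter.and`, while later versions accept the same clauses under `query.bool.must`. A small helper that builds either shape from one clause list (a sketch, not part of the original module):

def wrap_for_version(clauses, version, size=20):
    # clauses: LIST OF term/range DICTS, IDENTICAL FOR BOTH VERSIONS
    if version.startswith("1."):
        body = {"filtered": {"query": {"match_all": {}}, "filter": {"and": clauses}}}
    else:
        body = {"bool": {"must": clauses}}
    return {"query": body, "size": size}
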
Example #39
def es_bulkaggsop(esq, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY

    chunk_size = min(coalesce(query.chunk_size, MAX_CHUNK_SIZE), MAX_CHUNK_SIZE)
    schema = frum.schema
    query_path = first(schema.query_path)
    selects = listwrap(query.select)

    variable = first(query.groupby).value
    # FIND CARDINALITY

    cardinality_check = Timer(
        "Get cardinality for {{column}}", param={"column": variable.var}
    )

    with cardinality_check:
        columns = schema.leaves(variable.var)
        if len(columns) != 1:
            Log.error(
                "too many columns to bulk groupby:\n{{columns|json}}", columns=columns
            )
        column = first(columns)

        if query.where is TRUE:
            cardinality = column.cardinality
            if cardinality == None:
                esq.namespace._update_cardinality(column)
                cardinality = column.cardinality
        else:
            cardinality = esq.query(
                {
                    "select": {
                        "name": "card",
                        "value": variable,
                        "aggregate": "cardinality",
                    },
                    "from": frum.name,
                    "where": query.where,
                    "format": "cube",
                }
            ).card

        num_partitions = (cardinality + chunk_size - 1) // chunk_size

        if num_partitions > MAX_PARTITIONS:
            Log.error("Requesting more than {{num}} partitions", num=num_partitions)

        acc, decoders, es_query = build_es_query(selects, query_path, schema, query)
        guid = Random.base64(32, extra="-_")
        abs_limit = mo_math.MIN((query.limit, first(query.groupby).domain.limit))
        formatter = formatters[query.format](abs_limit)

        Thread.run(
            "extract to " + guid + ".json",
            extractor,
            guid,
            num_partitions,
            esq,
            query,
            selects,
            query_path,
            schema,
            chunk_size,
            cardinality,
            abs_limit,
            formatter,
            parent_thread=Null,
        )

    output = wrap(
        {
            "url": URL_PREFIX / (guid + ".json"),
            "status": URL_PREFIX / (guid + ".status.json"),
            "meta": {
                "format": query.format,
                "timing": {"cardinality_check": cardinality_check.duration},
                "es_query": es_query,
                "num_partitions": num_partitions,
                "cardinality": cardinality,
            },
        }
    )
    return output
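
Note that `num_partitions` is the ceiling of `cardinality / chunk_size`, computed with the integer idiom `(a + b - 1) // b`; for example, a cardinality of 2501 at a chunk size of 1000 requests 3 partitions:

def partitions_needed(cardinality, chunk_size):
    # CEILING DIVISION WITHOUT FLOATS: (a + b - 1) // b
    return (cardinality + chunk_size - 1) // chunk_size

assert partitions_needed(2501, 1000) == 3
assert partitions_needed(3000, 1000) == 3
assert partitions_needed(3001, 1000) == 4
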
Example #40
    def get_revision(self, revision, locale=None, get_diff=False, get_moves=True):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            DEBUG and Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))
            if output.push.date:
                return output

        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note("delaying next hg call for {{seconds|round(decimal=1)}}", seconds=next_cache_miss - self.last_cache_miss)
            Till(till=next_cache_miss.unix).wait()

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text_type, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
            if not b:
                Log.warning("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)
                return Null

        if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch, found_revision.changeset.id)

        url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12]
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12]
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e
            output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff, get_moves)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            return output
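
The cache-miss throttle above delays the next hg call by a uniformly random interval up to twice WAIT_AFTER_CACHE_MISS; the mean delay is WAIT_AFTER_CACHE_MISS, so the average call rate stays on target while the jitter keeps retries from synchronizing. A self-contained sketch of the same idea using only the standard library (the class name and the constant's value are illustrative, not from the original module):

import random
import time

WAIT_AFTER_CACHE_MISS = 10  # TARGET AVERAGE SECONDS BETWEEN CACHE-MISS CALLS (illustrative)

class MissThrottle:
    def __init__(self):
        self.next_allowed = 0.0

    def wait_if_needed(self):
        now = time.monotonic()
        if now < self.next_allowed:
            time.sleep(self.next_allowed - now)
        # UNIFORM JITTER IN [0, 2*WAIT] HAS MEAN WAIT, SO THROUGHPUT MATCHES THE TARGET RATE
        self.next_allowed = time.monotonic() + random.uniform(0, WAIT_AFTER_CACHE_MISS * 2)
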
Example #41
    def _install_es(self, gigabytes):
        volumes = self.instance.markup.drives

        if not fabric_files.exists("/usr/local/elasticsearch"):
            with cd("/home/ec2-user/"):
                run("mkdir -p temp")

            if not File(LOCAL_JRE).exists:
                Log.error(
                    "Expecting {{file}} on manager to spread to ES instances",
                    file=LOCAL_JRE)
            with cd("/home/ec2-user/temp"):
                run('rm -f ' + JRE)
                put("resources/" + JRE, JRE)
                sudo("rpm -i " + JRE)
                sudo(
                    "alternatives --install /usr/bin/java java /usr/java/default/bin/java 20000"
                )
                run("export JAVA_HOME=/usr/java/default")

            with cd("/home/ec2-user/"):
                run('wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.7.1.tar.gz'
                    )
                run('tar zxfv elasticsearch-1.7.1.tar.gz')
                sudo('mkdir /usr/local/elasticsearch')
                sudo('cp -R elasticsearch-1.7.1/* /usr/local/elasticsearch/')

            with cd('/usr/local/elasticsearch/'):
                # BE SURE TO MATCH THE PLUGIN WITH THE ES VERSION
                # https://github.com/elasticsearch/elasticsearch-cloud-aws
                sudo(
                    'bin/plugin -install elasticsearch/elasticsearch-cloud-aws/2.7.1'
                )

            #REMOVE THESE FILES, WE WILL REPLACE THEM WITH THE CORRECT VERSIONS AT THE END
            sudo("rm -f /usr/local/elasticsearch/config/elasticsearch.yml")
            sudo("rm -f /usr/local/elasticsearch/bin/elasticsearch.in.sh")

        self.conn = self.instance.connection

        # MOUNT AND FORMAT THE EBS VOLUMES (list with `lsblk`)
        for i, k in enumerate(volumes):
            if not fabric_files.exists(k.path):
                with fabric_settings(warn_only=True):
                    sudo('sudo umount ' + k.device)

                sudo('yes | sudo mkfs -t ext4 ' + k.device)
                sudo('mkdir ' + k.path)
                sudo('sudo mount ' + k.device + ' ' + k.path)

                # ADD TO /etc/fstab SO THE MOUNT SURVIVES REBOOT
                sudo("sed -i '$ a\\" + k.device + "   " + k.path +
                     "       ext4    defaults,nofail  0   2' /etc/fstab")

        # TEST IT IS WORKING
        sudo('mount -a')

        # INCREASE THE FILE HANDLE LIMITS
        with cd("/home/ec2-user/"):
            File("./results/temp/sysctl.conf").delete()
            get("/etc/sysctl.conf",
                "./results/temp/sysctl.conf",
                use_sudo=True)
            lines = File("./results/temp/sysctl.conf").read()
            if lines.find("fs.file-max = 100000") == -1:
                lines += "\nfs.file-max = 100000"
            lines = lines.replace("net.bridge.bridge-nf-call-ip6tables = 0",
                                  "")
            lines = lines.replace("net.bridge.bridge-nf-call-iptables = 0", "")
            lines = lines.replace("net.bridge.bridge-nf-call-arptables = 0",
                                  "")
            File("./results/temp/sysctl.conf").write(lines)
            put("./results/temp/sysctl.conf",
                "/etc/sysctl.conf",
                use_sudo=True)

        sudo("sysctl -p")

        # INCREASE FILE HANDLE PERMISSIONS
        sudo("sed -i '$ a\\root soft nofile 50000' /etc/security/limits.conf")
        sudo("sed -i '$ a\\root hard nofile 100000' /etc/security/limits.conf")
        sudo("sed -i '$ a\\root memlock unlimited' /etc/security/limits.conf")

        sudo(
            "sed -i '$ a\\ec2-user soft nofile 50000' /etc/security/limits.conf"
        )
        sudo(
            "sed -i '$ a\\ec2-user hard nofile 100000' /etc/security/limits.conf"
        )
        sudo(
            "sed -i '$ a\\ec2-user memlock unlimited' /etc/security/limits.conf"
        )

        # A FRESH LOGIN IS REQUIRED BEFORE THE NEW FILE HANDLE LIMITS TAKE EFFECT
        # sudo("sudo -i -u ec2-user")

        if not fabric_files.exists("/data1/logs"):
            sudo('mkdir /data1/logs')
            sudo('mkdir /data1/heapdump')

            #INCREASE NUMBER OF FILE HANDLES
            # sudo("sysctl -w fs.file-max=64000")
        # COPY CONFIG FILE TO ES DIR
        if not fabric_files.exists(
                "/usr/local/elasticsearch/config/elasticsearch.yml"):
            yml = File("./examples/config/es_config.yml").read().replace(
                "\r", "")
            yml = expand_template(
                yml, {
                    "id":
                    Random.hex(length=8),
                    "data_paths":
                    ",".join("/data" + unicode(i + 1)
                             for i, _ in enumerate(volumes))
                })
            File("./results/temp/elasticsearch.yml").write(yml)
            put("./results/temp/elasticsearch.yml",
                '/usr/local/elasticsearch/config/elasticsearch.yml',
                use_sudo=True)

        # FOR SOME REASON THE export COMMAND DOES NOT SEEM TO WORK
        # THIS SCRIPT SETS THE ES_MIN_MEM/ES_MAX_MEM EXPLICITLY
        if not fabric_files.exists(
                "/usr/local/elasticsearch/bin/elasticsearch.in.sh"):
            sh = File("./examples/config/es_run.sh").read().replace("\r", "")
            sh = expand_template(sh, {"memory": unicode(int(gigabytes / 2))})
            File("./results/temp/elasticsearch.in.sh").write(sh)
            with cd("/home/ec2-user"):
                put("./results/temp/elasticsearch.in.sh",
                    './temp/elasticsearch.in.sh',
                    use_sudo=True)
                sudo(
                    "cp -f ./temp/elasticsearch.in.sh /usr/local/elasticsearch/bin/elasticsearch.in.sh"
                )
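
Both config files above are produced the same way: read a template, substitute the {{name}} placeholders, write the result locally, and `put` it onto the instance. A minimal stand-in for that `expand_template` step, assuming only the simple {{name}} placeholder syntax seen here (the real helper also supports formatting expressions such as {{seconds|round(decimal=1)}}):

import re

def expand_template(template, params):
    # REPLACE EVERY {{name}} WITH str(params[name]); LEAVE UNKNOWN NAMES UNTOUCHED
    def lookup(match):
        key = match.group(1)
        return str(params[key]) if key in params else match.group(0)
    return re.sub(r"\{\{(\w+)\}\}", lookup, template)

print(expand_template("memory={{memory}}g", {"memory": 15}))  # memory=15g
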
Example #42
    def _scan_database(self):
        # GET ALL RELATIONS
        raw_relations = self.db.query("""
            SELECT
                table_schema,
                table_name,
                referenced_table_schema,
                referenced_table_name,
                referenced_column_name,
                constraint_name,
                column_name,
                ordinal_position
            FROM
                information_schema.key_column_usage
            WHERE
                referenced_column_name IS NOT NULL
        """,
                                      param=self.settings.database)

        if not raw_relations:
            Log.error("No relations in the database")

        for r in self.settings.add_relations:
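            # EACH ENTRY HAS THE FORM "schema.table.column -> schema.table.column"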
            try:
                a, b = map(strings.trim, r.split("->"))
                a = a.split(".")
                b = b.split(".")
                raw_relations.append(
                    Data(table_schema=a[0],
                         table_name=a[1],
                         referenced_table_schema=b[0],
                         referenced_table_name=b[1],
                         referenced_column_name=b[2],
                         constraint_name=Random.hex(20),
                         column_name=a[2],
                         ordinal_position=1))
            except Exception as e:
                Log.error("Could not parse {{line|quote}}", line=r, cause=e)

        relations = jx.select(raw_relations,
                              [{
                                  "name": "constraint.name",
                                  "value": "constraint_name"
                              }, {
                                  "name": "table.schema",
                                  "value": "table_schema"
                              }, {
                                  "name": "table.name",
                                  "value": "table_name"
                              }, {
                                  "name": "column.name",
                                  "value": "column_name"
                              }, {
                                  "name": "referenced.table.schema",
                                  "value": "referenced_table_schema"
                              }, {
                                  "name": "referenced.table.name",
                                  "value": "referenced_table_name"
                              }, {
                                  "name": "referenced.column.name",
                                  "value": "referenced_column_name"
                              }, {
                                  "name": "ordinal_position",
                                  "value": "ordinal_position"
                              }])

        # GET ALL TABLES
        raw_tables = self.db.query("""
            SELECT
                t.table_schema,
                t.table_name,
                c.constraint_name,
                c.constraint_type,
                k.column_name,
                k.ordinal_position
            FROM
                information_schema.tables t
            LEFT JOIN
                information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
            LEFT JOIN
                information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
            ORDER BY
                t.table_schema,
                t.table_name,
                c.constraint_name,
                k.ordinal_position,
                k.column_name
        """,
                                   param=self.settings.database)

        # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
        tables = UniqueIndex(keys=["name", "schema"])
        for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
            c = wrap(list(c))
            best_index = Null
            is_referenced = False
            is_primary = False
            for g, w in jx.groupby(c, "constraint_name"):
                if not g.constraint_name:
                    continue
                w = list(w)
                ref = False
                for r in relations:
                    if r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name:
                        ref = True
                is_prime = w[0].constraint_type == "PRIMARY KEY"

                reasons_this_one_is_better = [
                    best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                    is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                    is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                    is_primary == is_prime and ref == is_referenced and len(w) < len(best_index),  # THE SHORTER THE TUPLE, THE BETTER
                ]
                if any(reasons_this_one_is_better):
                    is_primary = is_prime
                    is_referenced = ref
                    best_index = w

            tables.add({
                "name": t.table_name,
                "schema": t.table_schema,
                "id": [b.column_name for b in best_index]
            })

        fact_table = tables[self.settings.fact_table,
                            self.settings.database.schema]
        ids_table = {
            "alias": "t0",
            "name": "__ids__",
            "schema": fact_table.schema,
            "id": fact_table.id
        }
        relations.extend(
            wrap({
                "constraint": {
                    "name": "__link_ids_to_fact_table__"
                },
                "table": ids_table,
                "column": {
                    "name": c
                },
                "referenced": {
                    "table": fact_table,
                    "column": {
                        "name": c
                    }
                },
                "ordinal_position": i
            }) for i, c in enumerate(fact_table.id))
        tables.add(ids_table)

        # GET ALL COLUMNS
        raw_columns = self.db.query("""
            SELECT
                column_name,
                table_schema,
                table_name,
                ordinal_position,
                data_type
            FROM
                information_schema.columns
        """,
                                    param=self.settings.database)

        reference_only_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 2
        ]
        reference_all_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 1
        ]
        foreign_column_table_schema_triples = {
            (r.column.name, r.table.name, r.table.schema)
            for r in relations
        }
        referenced_column_table_schema_triples = {
            (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
            for r in relations
        }
        related_column_table_schema_triples = (
            foreign_column_table_schema_triples
            | referenced_column_table_schema_triples
        )

        columns = UniqueIndex(["column.name", "table.name", "table.schema"])
        for c in raw_columns:
            if c.table_name in reference_only_tables:
                if c.table_name + "." + c.column_name in self.settings.reference_only:
                    include = True
                    reference = True
                    foreign = False
                elif c.column_name in tables[(c.table_name,
                                              c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = False
                    foreign = False
                else:
                    include = False
                    reference = False
                    foreign = False
            elif c.table_name in reference_all_tables:
                # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
                if c.column_name in tables[(c.table_name, c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = True
                    foreign = False
                elif (c.column_name, c.table_name,
                      c.table_schema) in foreign_column_table_schema_triples:
                    include = False
                    reference = False
                    foreign = True
                else:
                    include = True
                    reference = False
                    foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            elif (c.column_name, c.table_name,
                  c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            elif (c.column_name, c.table_name,
                  c.table_schema) in referenced_column_table_schema_triples:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = True
                reference = False
                foreign = False

            rel = {
                "column": {
                    "name": c.column_name,
                    "type": c.data_type
                },
                "table": {
                    "name": c.table_name,
                    "schema": c.table_schema
                },
                "ordinal_position": c.ordinal_position,
                "is_id": c.column_name
                in tables[(c.table_name, c.table_schema)].id,
                "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
                "reference":
                reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
                "foreign": foreign  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
            }
            columns.add(rel)

        # ITERATE OVER ALL PATHS
        todo = FlatList()
        output_columns = FlatList()
        nested_path_to_join = {}
        all_nested_paths = [["."]]

        def follow_paths(position, path, nested_path, done_relations,
                         no_nested_docs):
            if position.name in self.settings.exclude:
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW AN ERROR IF IT FAILS)
                self.db.query("SELECT * FROM " +
                              quote_column(position.name, position.schema) +
                              " LIMIT 1")

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return

            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            # INNER OBJECTS
            referenced_tables = list(
                jx.groupby(
                    jx.filter(
                        relations, {
                            "eq": {
                                "table.name": position.name,
                                "table.schema": position.schema
                            }
                        }), "constraint.name"))
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns
                       if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)

                alias = "t" + text_type(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                })

                # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
                # HANDLE THE COMMON *id SUFFIX
                name = []
                for a, b in zip(constraint_columns.column.name,
                                constraint_columns.referenced.table.name):
                    if a.startswith(b):
                        name.append(b)
                    elif a.endswith("_id"):
                        name.append(a[:-3])
                    else:
                        name.append(a)
                referenced_column_path = join_field(
                    split_field(path) + ["/".join(name)])
                col_pointer_name = relative_field(referenced_column_path,
                                                  nested_path[0])
                # insert into nested1 VALUES (100, 10, 'aaa', -1);
                # id.about.time.nested1 .ref=10
                # id.about.time.nested1 .ref.name
                for col in columns:
                    if (col.table.name == constraint_columns[0].referenced.table.name
                            and col.table.schema == constraint_columns[0].referenced.table.schema):
                        col_full_name = concat_field(
                            col_pointer_name, literal_field(col.column.name))

                        if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                True,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                            })
                        elif col.column.name == constraint_columns[
                                0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_pointer_name
                                if not self.settings.show_foreign_keys else
                                col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(
                    Data(position=copy(constraint_columns[0].referenced.table),
                         path=referenced_column_path,
                         nested_path=nested_path,
                         done_relations=copy(done_relations),
                         no_nested_docs=no_nested_docs))

            # NESTED OBJECTS
            if not no_nested_docs:
                for g, constraint_columns in jx.groupby(
                        jx.filter(
                            relations, {
                                "eq": {
                                    "referenced.table.name": position.name,
                                    "referenced.table.schema": position.schema
                                }
                            }), "constraint.name"):
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    referenced_column_path = join_field(
                        split_field(path) + ["/".join(many_table)])
                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    # if new_path not in self.settings.include:
                    #     Log.note("Exclude nested path {{path}}", path=new_path)
                    #     continue
                    one_to_many_joins = nested_path_to_join[
                        referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text_type(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(
                        set_default({}, g, {
                            "children": True,
                            "join_columns": constraint_columns,
                            "path": path,
                            "nested_path": nested_path
                        }))
                    # insert into nested1 VALUES (100, 10, 'aaa', -1); # id.about.time.nested1 .ref=10# id.about.time.nested1 .ref.name
                    for col in columns:
                        if (
                            col.table.name == constraint_columns[0].table.name
                            and col.table.schema == constraint_columns[0].table.schema
                        ):
                            col_full_name = join_field(
                                split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                                + [literal_field(col.column.name)]
                            )

                            if col.column.name == constraint_columns[0].column.name or col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text_type(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if self.settings.show_foreign_keys else None
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text_type(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if col.include else None
                                })

                    todo.append(
                        Data(position=constraint_columns[0].table,
                             path=referenced_column_path,
                             nested_path=new_nested_path,
                             done_relations=copy(done_relations),
                             no_nested_docs=no_nested_docs))

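        # SEED THE TRAVERSAL AT THE ROOT: PATH "." IS ids_table ITSELF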
        path = "."
        nested_path = [path]
        nested_path_to_join["."] = [{
            "path": path,
            "join_columns": [{"referenced": {"table": ids_table}}],
            "nested_path": nested_path
        }]

        todo.append(
            Data(position=ids_table,
                 path=path,
                 nested_path=nested_path,
                 done_relations=set(),
                 no_nested_docs=False))

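        # BREADTH-FIRST WALK OVER THE FOREIGN-KEY GRAPH (todo IS USED AS A FIFO QUEUE)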
        while todo:
            item = todo.pop(0)
            follow_paths(**item)

        self.all_nested_paths = all_nested_paths
        self.nested_path_to_join = nested_path_to_join
        self.columns = output_columns
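
A minimal sketch (not part of the original code) of how the structures assembled above might be consumed: every entry in output_columns already carries its "t{n}" table alias and "c{n}" column alias, so a SELECT clause can be composed directly from the list. The "name" key below is a simplification; the real entries store full column objects under "column".

def select_clause(output_columns):
    # hypothetical consumer: render one aliased column per entry
    parts = [
        "{t}.{n} AS {c}".format(t=col["table_alias"], n=col["name"], c=col["column_alias"])
        for col in output_columns
    ]
    return "SELECT " + ", ".join(parts)

print(select_clause([
    {"table_alias": "t0", "name": "id", "column_alias": "c0"},
    {"table_alias": "t1", "name": "name", "column_alias": "c1"},
]))
# SELECT t0.id AS c0, t1.name AS c1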
Example #43
def random_id():
    return Random.hex(40)
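
A quick usage sketch, assuming Random is the mo-math helper whose hex(length) returns a random hexadecimal string of the given length:

key = random_id()  # e.g. 'a3f09c...' (40 hex characters)
assert len(key) == 40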
Example #44
    def _install_es(self, gigabytes):
        volumes = self.instance.markup.drives

        if not fabric_files.exists("/usr/local/elasticsearch"):
            with cd("/home/ec2-user/"):
                run("mkdir -p temp")

            if not File(LOCAL_JRE).exists:
                Log.error("Expecting {{file}} on manager to spread to ES instances", file=LOCAL_JRE)
            with cd("/home/ec2-user/temp"):
                run('rm -f '+JRE)
                put("resources/"+JRE, JRE)
                sudo("rpm -i "+JRE)
                sudo("alternatives --install /usr/bin/java java /usr/java/default/bin/java 20000")
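                # NOTE: EACH fabric run() STARTS A FRESH SHELL, SO THIS export DOES NOT PERSIST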
                run("export JAVA_HOME=/usr/java/default")

            with cd("/home/ec2-user/"):
                run('wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.7.1.tar.gz')
                run('tar zxfv elasticsearch-1.7.1.tar.gz')
                sudo('mkdir /usr/local/elasticsearch')
                sudo('cp -R elasticsearch-1.7.1/* /usr/local/elasticsearch/')

            with cd('/usr/local/elasticsearch/'):
                # BE SURE TO MATCH THE PLUGIN WITH THE ES VERSION
                # https://github.com/elasticsearch/elasticsearch-cloud-aws
                sudo('bin/plugin -install elasticsearch/elasticsearch-cloud-aws/2.7.1')

            # REMOVE THESE FILES; WE WILL REPLACE THEM WITH THE CORRECT VERSIONS AT THE END
            sudo("rm -f /usr/local/elasticsearch/config/elasticsearch.yml")
            sudo("rm -f /usr/local/elasticsearch/bin/elasticsearch.in.sh")

        self.conn = self.instance.connection

        # MOUNT AND FORMAT THE EBS VOLUMES (list with `lsblk`)
        for k in volumes:
            if not fabric_files.exists(k.path):
                with fabric_settings(warn_only=True):
                    sudo('umount '+k.device)

                sudo('yes | mkfs -t ext4 '+k.device)
                sudo('mkdir '+k.path)
                sudo('mount '+k.device+' '+k.path)

                # ADD TO /etc/fstab SO THE MOUNT SURVIVES A REBOOT
                sudo("sed -i '$ a\\"+k.device+"   "+k.path+"       ext4    defaults,nofail  0   2' /etc/fstab")
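                # THE APPENDED fstab LINE LOOKS LIKE (example values): /dev/xvdb   /data1       ext4    defaults,nofail  0   2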

        # MOUNT EVERYTHING IN fstab NOW, WHICH ALSO VERIFIES THE NEW ENTRIES
        sudo('mount -a')

        # INCREASE THE SYSTEM-WIDE FILE HANDLE LIMIT (fs.file-max)
        with cd("/home/ec2-user/"):
            File("./results/temp/sysctl.conf").delete()
            get("/etc/sysctl.conf", "./results/temp/sysctl.conf", use_sudo=True)
            lines = File("./results/temp/sysctl.conf").read()
            if "fs.file-max = 100000" not in lines:
                lines += "\nfs.file-max = 100000"
            lines = lines.replace("net.bridge.bridge-nf-call-ip6tables = 0", "")
            lines = lines.replace("net.bridge.bridge-nf-call-iptables = 0", "")
            lines = lines.replace("net.bridge.bridge-nf-call-arptables = 0", "")
            File("./results/temp/sysctl.conf").write(lines)
            put("./results/temp/sysctl.conf", "/etc/sysctl.conf", use_sudo=True)

        sudo("sysctl -p")

        # INCREASE FILE HANDLE AND MEMORY-LOCK LIMITS FOR root AND ec2-user
        sudo("sed -i '$ a\\root soft nofile 50000' /etc/security/limits.conf")
        sudo("sed -i '$ a\\root hard nofile 100000' /etc/security/limits.conf")
        sudo("sed -i '$ a\\root memlock unlimited' /etc/security/limits.conf")

        sudo("sed -i '$ a\\ec2-user soft nofile 50000' /etc/security/limits.conf")
        sudo("sed -i '$ a\\ec2-user hard nofile 100000' /etc/security/limits.conf")
        sudo("sed -i '$ a\\ec2-user memlock unlimited' /etc/security/limits.conf")

        # A FRESH LOGIN IS NEEDED BEFORE THE NEW limits.conf VALUES TAKE EFFECT
        # sudo("sudo -i -u ec2-user")

        if not fabric_files.exists("/data1/logs"):
            sudo('mkdir /data1/logs')
            sudo('mkdir /data1/heapdump')

            # INCREASE NUMBER OF FILE HANDLES
            # sudo("sysctl -w fs.file-max=64000")
        # COPY CONFIG FILE TO ES DIR
        if not fabric_files.exists("/usr/local/elasticsearch/config/elasticsearch.yml"):
            yml = File("./examples/config/es_config.yml").read().replace("\r", "")
            yml = expand_template(yml, {
                "id": Random.hex(length=8),
                "data_paths": ",".join("/data"+unicode(i+1) for i, _ in enumerate(volumes))
            })
            File("./results/temp/elasticsearch.yml").write(yml)
            put("./results/temp/elasticsearch.yml", '/usr/local/elasticsearch/config/elasticsearch.yml', use_sudo=True)

        # FOR SOME REASON THE export COMMAND DOES NOT SEEM TO WORK
        # THIS SCRIPT SETS THE ES_MIN_MEM/ES_MAX_MEM EXPLICITLY
        if not fabric_files.exists("/usr/local/elasticsearch/bin/elasticsearch.in.sh"):
            sh = File("./examples/config/es_run.sh").read().replace("\r", "")
            sh = expand_template(sh, {"memory": unicode(int(gigabytes/2))})
            File("./results/temp/elasticsearch.in.sh").write(sh)
            with cd("/home/ec2-user"):
                put("./results/temp/elasticsearch.in.sh", './temp/elasticsearch.in.sh', use_sudo=True)
                sudo("cp -f ./temp/elasticsearch.in.sh /usr/local/elasticsearch/bin/elasticsearch.in.sh")
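
For reference, expand_template does simple moustache-style substitution on {{name}} placeholders. A minimal sketch, assuming the mo_logs import path and a hypothetical one-line stand-in for the real es_run.sh template (not shown here):

from mo_logs.strings import expand_template  # assumed import path

template = "ES_HEAP_SIZE={{memory}}g"  # hypothetical stand-in for es_run.sh
print(expand_template(template, {"memory": 15}))
# ES_HEAP_SIZE=15g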