Example #1
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    "   path TEXT PRIMARY KEY, "
                    "   headers TEXT, "
                    "   response TEXT, "
                    "   timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)
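The shape above recurs throughout these examples: a bounded Queue that producers fill and a Thread.run worker drains, with THREAD_STOP as the shutdown sentinel. A minimal sketch of that shape, assuming the iteration and sentinel behavior the later examples rely on; the queue name, size, and handler below are made up for illustration and are not taken from the class above:

from mo_logs import Log
from mo_threads import Queue, Thread, THREAD_STOP

todo = Queue("demo todo", max=1000)      # max bounds the backlog of pending work

def worker(please_stop):
    # Thread.run passes a please_stop Signal; iterating the queue ends at THREAD_STOP
    for item in todo:
        if please_stop:
            break
        Log.note("handled {{item}}", item=item)

thread = Thread.run("demo worker", worker)
todo.extend(range(10))
todo.add(THREAD_STOP)                    # sentinel: worker drains the rest, then exits
thread.join()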
Example #2
    def __init__(self, conn=None, tuid_service=None, kwargs=None):
        try:
            self.config = kwargs

            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(
                kwargs=self.config.hg_cache,
                use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                database=None,
                hg=None,
                kwargs=self.config,
                conn=self.conn,
                clogger=self)
            self.rev_locker = Lock()
            self.working_locker = Lock()

            self.init_db()
            self.next_revnum = coalesce(
                self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(
                name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")
            self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.",
                         minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query(
                        "SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(MINIMUM_PERMANENT_CSETS - numrevs,
                                    oldest_rev,
                                    timestamp=False)

            Log.note(
                "Table is filled with atleast {{minim}} entries. Starting workers...",
                minim=MINIMUM_PERMANENT_CSETS)

            Thread.run('clogger-tip', self.fill_forward_continuous)
            Thread.run('clogger-backfill', self.fill_backward_with_list)
            Thread.run('clogger-maintenance', self.csetLog_maintenance)
            Thread.run('clogger-deleter', self.csetLog_deleter)

            Log.note("Started clogger workers.")
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
Example #3
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
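This example and #15, #24, and #25 all create their todo queue with unique=True. A small hypothetical demonstration of what that flag appears to be relied on for here, deduplicating work that is already pending; the item names are invented:

from mo_threads import Queue

todo = Queue("refresh metadata", max=100000, unique=True)
todo.add("index-a")
todo.add("index-a")        # already pending, so the queue keeps only one copy
todo.add("index-b")
print(todo.pop_all())      # expected: ["index-a", "index-b"]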
Example #4
    def stop(cls):
        """
        DECONSTRUCTS ANY LOGGING, AND RETURNS TO DIRECT-TO-stdout LOGGING
        EXECUTING MULTIPLE TIMES IN A ROW IS SAFE, IT HAS NO NET EFFECT, IT STILL LOGS TO stdout
        :return: NOTHING
        """

        from mo_threads import profiles

        if cls.cprofiler and hasattr(cls, "settings"):
            if cls.cprofiler == None:
                from mo_threads import Queue

                cls.cprofiler_stats = Queue(
                    "cprofiler stats"
                )  # ACCUMULATION OF STATS FROM ALL THREADS

            import pstats
            cls.cprofiler_stats.add(pstats.Stats(cls.cprofiler))
            write_profile(cls.settings.cprofile, cls.cprofiler_stats.pop_all())

        if profiles.ON and hasattr(cls, "settings"):
            profiles.write(cls.settings.profile)
        cls.main_log.stop()
        cls.main_log = StructuredLogger_usingStream(sys.stdout)
Example #5
    def __init__(self, stream):
        assert stream

        if is_text(stream):
            name = stream
            stream = self.stream = eval(stream)
            if name.startswith("sys.") and PY3:
                self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))
        else:
            name = "stream"
            self.stream = stream

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        def utf8_appender(value):
            if is_text(value):
                value = value.encode('utf8')
            self.stream.write(value)

        appender = utf8_appender

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
Example #6
    def _find_revision(self, revision):
        please_stop = False
        locker = Lock()
        output = []
        queue = Queue("branches", max=2000)
        queue.extend(b for b in self.branches if b.locale == DEFAULT_LOCALE and b.name in ["try", "mozilla-inbound", "autoland"])
        queue.add(THREAD_STOP)

        problems = []
        def _find(please_stop):
            for b in queue:
                if please_stop:
                    return
                try:
                    url = b.url + "json-info?node=" + revision
                    rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                    with locker:
                        output.append(rev)
                    Log.note("Revision found at {{url}}", url=url)
                except Exception as f:
                    problems.append(f)

        threads = []
        for i in range(3):
            threads.append(Thread.run("find changeset " + text_type(i), _find, please_stop=please_stop))

        for t in threads:
            with assert_no_exception:
                t.join()

        return output
Example #7
    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue("Queue for " + self.__class__.__name__,
                           max=10000,
                           silent=True,
                           allow_add_after_close=True)
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    Till(seconds=1).wait()
                    logs = self.queue.pop_all()
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            finally:
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker,
                             logger)
        self.thread.parent.remove_child(
            self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
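The worker above is the batching idiom most of the logger examples use: sleep briefly, drain the whole backlog with pop_all(), handle it, and treat THREAD_STOP as the request to shut down. A stripped-down, hypothetical version outside any class, with print() standing in for logger.write():

from mo_threads import Queue, Thread, Till, THREAD_STOP

queue = Queue("batched work", max=10000, silent=True)

def drain(please_stop):
    while not please_stop:
        Till(seconds=1).wait()           # wake once per second
        for item in queue.pop_all():     # take everything queued since the last pass
            if item is THREAD_STOP:
                please_stop.go()         # finish this batch, then let the loop exit
            else:
                print(item)

worker = Thread.run("drain", drain)
queue.extend(["a", "b", "c"])
queue.add(THREAD_STOP)
worker.join()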
Example #8
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds

        self.es = Cluster(kwargs).get_or_create_index(
            schema=json2value(value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example #9
    def __init__(self, stream):
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
Example #10
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        refresh_interval="1second",
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep,
                                               MINUTE)).seconds
        kwargs.host = randoms.sample(listwrap(host), 1)[0]

        rollover_interval = coalesce(kwargs.rollover.interval,
                                     kwargs.rollover.max, "year")
        rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval,
                                "year")

        schema = set_default(
            kwargs.schema,
            {
                "mappings": {
                    kwargs.type: {
                        "properties": {
                            "~N~": {
                                "type": "nested"
                            }
                        }
                    }
                }
            },
            json2value(value2json(SCHEMA), leaves=True),
        )

        self.es = RolloverIndex(
            rollover_field={"get": [{
                "first": "."
            }, {
                "literal": "timestamp"
            }]},
            rollover_interval=rollover_interval,
            rollover_max=rollover_max,
            schema=schema,
            limit_replicas=True,
            typed=True,
            read_only=False,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example #11
def update_local_database(config, deviant_summary, candidates, since):
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {
            "and": [
                {
                    "in": {
                        "signature_hash": candidates.signature_hash
                    }
                },
                {
                    "exists": "num_pushes"
                },
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #12
def _late_import():
    global _Log

    from mo_logs import Log as _Log
    from mo_threads import Queue

    if _Log.cprofiler_stats == None:
        _Log.cprofiler_stats = Queue(
            "cprofiler stats")  # ACCUMULATION OF STATS FROM ALL THREADS
Example #13
    def __init__(self, es_cluster):
        Table.__init__(self, META_COLUMNS_NAME)
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.dirty = False
        self.es_cluster = es_cluster
        self.es_index = None
        self.last_load = Null
        self.for_es_update = Queue(
            "update columns to es"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        self.delete_queue = Queue(
            "delete columns from es")  # CONTAINS (es_index, after) PAIRS
        Thread.run("update " + META_COLUMNS_NAME,
                   self._update_from_es,
                   parent_thread=MAIN_THREAD).release()
        Thread.run("delete columns",
                   self._delete_columns,
                   parent_thread=MAIN_THREAD).release()
Example #14
    def __init__(self, filename=None, db=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG
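The comment on self.queue above ("HOLD (command, result, signal) PAIRS") describes a request/response handshake: a caller enqueues the work together with a Signal, and the single db thread fills in the result and fires the Signal. A hypothetical sketch of that handshake, not the real Sqlite worker: execute() and the tuple layout are invented here, and it assumes Signal.wait() blocks until go() is called:

from mo_threads import Queue, Signal, Thread, THREAD_STOP

commands = Queue("sql commands")       # HOLD (command, result, signal) TUPLES

def execute(command):
    return "rows for " + command       # stand-in for the real sqlite call

def worker(please_stop):
    # a single thread owns the database; iteration ends at THREAD_STOP
    for command, result, done in commands:
        result.append(execute(command))
        done.go()                      # wake the waiting caller

db_thread = Thread.run("sqlite db thread", worker)

result, done = [], Signal()
commands.add(("SELECT 1", result, done))
done.wait()                            # block until the db thread has answered
print(result[0])
commands.add(THREAD_STOP)
db_thread.join()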
Example #15
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #16
    def __init__(self, db):
        Table.__init__(self, META_COLUMNS_NAME)
        self.data = {}  # MAP FROM fact_name TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.dirty = False
        self.db = db
        self.es_index = None
        self.last_load = Null
        self.todo = Queue(
            "update columns to es"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._snowflakes = Data()
        self._load_from_database()
Example #17
    def __init__(self, filename=None, db=None, get_trace=None, upgrade=True, load_functions=False, kwargs=None):
        """
        :param filename:  FILE TO USE FOR DATABASE
        :param db: AN EXISTING sqlite3 DB YOU WOULD LIKE TO USE (INSTEAD OF USING filename)
        :param get_trace: GET THE STACK TRACE AND THREAD FOR EVERY DB COMMAND (GOOD FOR DEBUGGING)
        :param upgrade: REPLACE PYTHON sqlite3 DLL WITH MORE RECENT ONE, WITH MORE FUNCTIONS (NOT WORKING)
        :param load_functions: LOAD EXTENDED MATH FUNCTIONS (MAY REQUIRE upgrade)
        :param kwargs:
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.settings = kwargs
        self.filename = File(filename).abspath
        if known_databases.get(self.filename):
            Log.error("Not allowed to create more than one Sqlite instance for {{file}}", file=self.filename)

        # SETUP DATABASE
        DEBUG and Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
        try:
            if db == None:
                self.db = sqlite3.connect(
                    database=coalesce(self.filename, ":memory:"),
                    check_same_thread=False,
                    isolation_level=None
                )
            else:
                self.db = db
        except Exception as e:
            Log.error("could not open file {{filename}}", filename=self.filename, cause=e)
        load_functions and self._load_functions()

        self.locker = Lock()
        self.available_transactions = []  # LIST OF ALL THE TRANSACTIONS BEING MANAGED
        self.queue = Queue("sql commands")   # HOLD (command, result, signal, stacktrace) TUPLES

        self.get_trace = coalesce(get_trace, TRACE)
        self.upgrade = upgrade
        self.closed = False

        # WORKER VARIABLES
        self.transaction_stack = []  # THE TRANSACTION OBJECT WE HAVE PARTIALLY RUN
        self.last_command_item = None  # USE THIS TO HELP BLAME current_transaction FOR HANGING ON TOO LONG
        self.too_long = None
        self.delayed_queries = []
        self.delayed_transactions = []
        self.worker = Thread.run("sqlite db thread", self._worker)

        DEBUG and Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0])
Example #18
    def __init__(
        self,
        name,
        broker=None,
        include=None,
        **kwargs
    ):
        self.Task = MethodCaller

        self.name = name
        self.request_queue = Queue(name=name+" requests")
        self.response_queue = Queue(name=name+" responses")
        self.kwargs = kwargs
        self.include = include
        self.broker = broker
        self._config = {}
        self._tasks = {}
        self.on_init()
        self.response_worker = Thread.run("response worker", self._response_worker)
        self.responses = {}
        self.responses_lock = Lock()
        self.id_lock = Lock()
        self.next_id = 1
        self.worker = Worker(self.request_queue, self.response_queue, celery=self)
Example #19
    def __init__(self, name):
        Table.__init__(self, "meta.columns")
        self.db_file = File("metadata." + name + ".sqlite")
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.db = sqlite3.connect(
            database=self.db_file.abspath, check_same_thread=False, isolation_level=None
        )
        self.last_load = Null
        self.todo = Queue(
            "update columns to db"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        Thread.run("update " + name, self._db_worker)
Example #20
    def __init__(self, logger, period=PERIOD):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.logger = logger
        self.queue = Queue(
            "Queue for " + self.__class__.__name__,
            max=10000,
            silent=True,
            allow_add_after_close=True,
        )
        self.thread = Thread("Thread for " + self.__class__.__name__, worker,
                             logger, self.queue, period)
        # worker WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.parent.remove_child(self.thread)
        self.thread.start()
Example #21
def update_local_database():
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [{
                "in": {
                    "id": candidates.id
                }
            }, {
                "exists": "num_pushes"
            }]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #22
    def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.settings = kwargs
        self.filename = File(filename).abspath
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = TRACE
        self.upgrade = upgrade
        self.closed = False
        if DEBUG:
            Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0])
Example #23
    def stop(cls):
        from mo_logs import profiles

        if cls.cprofiler and hasattr(cls, "settings"):
            if cls.cprofiler == None:
                from mo_threads import Queue

                cls.cprofiler_stats = Queue(
                    "cprofiler stats"
                )  # ACCUMULATION OF STATS FROM ALL THREADS

            import pstats
            cls.cprofiler_stats.add(pstats.Stats(cls.cprofiler))
            write_profile(cls.settings.cprofile, cls.cprofiler_stats.pop_all())

        if profiles.ON and hasattr(cls, "settings"):
            profiles.write(cls.settings.profile)
        cls.main_log.stop()
        cls.main_log = StructuredLogger_usingStream(sys.stdout)
Example #24
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.settings = kwargs
        self.too_old = TOO_OLD
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Data()
        self.meta.columns = ColumnList(self.es_cluster)
        self.meta.columns.extend(META_TABLES_DESC.columns)
        self.meta.tables = ListContainer(META_TABLES_NAME, [], jx_base.Schema(".", META_TABLES_DESC.columns))
        self.meta.tables.extend([META_COLUMNS_DESC, META_TABLES_DESC])
        self.alias_to_query_paths = {}
        for i, settings in self.es_cluster.get_metadata().indices.items():
            if len(settings.aliases) == 0:
                alias = i
            elif len(settings.aliases) == 1:
                alias = first(settings.aliases)
            else:
                Log.error("expecting only one alias per index")

            desc = TableDesc(
                name=alias,
                url=None,
                query_path=ROOT_PATH,
                last_updated=Date.MIN,
                columns=[]
            )
            self.meta.tables.add(desc)
            self.alias_to_query_paths[alias] = [desc.query_path]
            self.alias_to_query_paths[self._find_alias(alias)] = [desc.query_path]

        # WE MUST PAUSE?

        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return
Example #25
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #26
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.schema = SnowflakeSchema(self.settings.snowflake)
        self._extract = extract = kwargs.extract

        # SOME PREP
        get_git_revision()

        # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
        with MySQL(**kwargs.snowflake.database) as db:
            processes = None
            try:
                processes = jx.filter(
                    db.query("show processlist"), {
                        "and": [{
                            "neq": {
                                "Command": "Sleep"
                            }
                        }, {
                            "neq": {
                                "Info": "show processlist"
                            }
                        }]
                    })
            except Exception as e:
                Log.warning("no database", cause=e)

            if processes:
                if DEBUG:
                    Log.warning("Processes are running\n{{list|json}}",
                                list=processes)
                else:
                    Log.error("Processes are running\n{{list|json}}",
                              list=processes)

        extract.type = listwrap(extract.type)
        extract.start = listwrap(extract.start)
        extract.batch = listwrap(extract.batch)
        extract.field = listwrap(extract.field)
        if any(
                len(extract.type) != len(other)
                for other in [extract.start, extract.batch, extract.field]):
            Log.error(
                "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object"
            )
        for i, t in enumerate(extract.type):
            if t == "time":
                extract.start[i] = Date(extract.start[i])
                extract.batch[i] = Duration(extract.batch[i])
            elif t == "number":
                pass
            else:
                Log.error('Expecting `extract.type` to be "number" or "time"')

        extract.threads = coalesce(extract.threads, 1)
        self.done_pulling = Signal()
        self.queue = Queue("all batches",
                           max=2 * coalesce(extract.threads, 1),
                           silent=True)

        self.bucket = s3.Bucket(self.settings.destination)
        self.notify = aws.Queue(self.settings.notify)
        Thread.run("get records", self.pull_all_remaining)
Example #27
    def __init__(
        self,
        table,
        typed,
        read_only,
        sharded,
        container,
        id=Null,
        partition=Null,
        cluster=Null,
        top_level_fields=Null,
        kwargs=None,
    ):
        self.short_name = table
        self.typed = typed
        self.read_only = read_only
        self.cluster = cluster
        self.id = id
        self.top_level_fields = top_level_fields
        self.config = Data(  # USED TO REPLICATE THIS
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            id=id,
            partition=partition,
            cluster=cluster,
            top_level_fields=top_level_fields,
        )

        esc_name = escape_name(table)
        self.full_name = container.full_name + esc_name
        self.alias_view = alias_view = container.client.get_table(text(self.full_name))
        self.partition = partition
        self.container = container

        if not sharded:
            if not read_only and alias_view.table_type == "VIEW":
                Log.error("Expecting a table, not a view")
            self.shard = alias_view
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        else:
            if alias_view.table_type != "VIEW":
                Log.error("Sharded tables require a view")
            current_view = container.client.get_table(text(self.full_name))
            view_sql = current_view.view_query
            shard_name = _extract_primary_shard_name(view_sql)
            try:
                self.shard = container.client.get_table(
                    text(container.full_name + shard_name)
                )
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
            except Exception as e:
                Log.warning("view {{name}} is invalid", name=shard_name, cause=e)
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
                # REMOVE STALE VIEW
                container.client.delete_table(current_view)

                # MAKE NEW VIEW POINTING TO NEW SHARD
                self._create_new_shard()
                container.create_view(
                    self.full_name,
                    self.container.full_name + ApiName(self.shard.table_id),
                )

        self.last_extend = Date.now() - EXTEND_LIMIT
        self.extend_locker = Lock()
        self.extend_queue = Queue("wait for extend")
Example #28
 def __init__(self, name=None):
     queue_name = "log messages to queue"
     if name:
         queue_name += " " + name
     self.queue = Queue(queue_name)
Example #29
    def __init__(self,
                 conn=None,
                 tuid_service=None,
                 start_workers=True,
                 new_table=False,
                 kwargs=None):
        try:
            self.config = kwargs
            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(
                kwargs=self.config.hg_cache,
                use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self)
            self.rev_locker = Lock()
            self.working_locker = Lock()

            if new_table:
                with self.conn.transaction() as t:
                    t.execute("DROP TABLE IF EXISTS csetLog")

            self.init_db()
            self.next_revnum = coalesce(
                self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(
                name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

            if 'tuid' in self.config:
                self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            self.backfill_thread = None
            self.tipfill_thread = None
            self.deletion_thread = None
            self.maintenance_thread = None

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.",
                         minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query(
                        "SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(MINIMUM_PERMANENT_CSETS - numrevs,
                                    oldest_rev,
                                    timestamp=False)

            Log.note("Table is filled with atleast {{minim}} entries.",
                     minim=MINIMUM_PERMANENT_CSETS)

            if start_workers:
                self.start_workers()
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))