Example #1
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    try:
        es = elasticsearch.Cluster(kwargs=branches).get_index(kwargs=branches, read_only=False)

        query = {
            "query": {"match_all": {}},
            "size": 10000
        }

        found_branches = es.search(query).hits.hits._source
        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = elasticsearch.Cluster(kwargs=branches).get_or_create_index(kwargs=branches)
            es.add_alias()
            return get_branches(kwargs=kwargs)
        Log.error("problem getting branches", cause=e)
Example #2
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output
Example #3
    def __init__(self,
                 host,
                 index,
                 type=None,
                 alias=None,
                 name=None,
                 port=9200,
                 read_only=True,
                 typed=None,
                 settings=None):
        Container.__init__(self, None)
        if not containers.config.default:
            containers.config.default.settings = settings
        self.settings = settings
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index),
                                           settings=settings)
        else:
            self._es = elasticsearch.Cluster(settings=settings).get_index(
                read_only=read_only, settings=settings)

        self.meta = FromESMetadata(settings=settings)
        self.settings.type = self._es.settings.type
        self.edges = Dict()
        self.worker = None
        if typed == None:
            self._columns = self.get_columns(table_name=index)
            # SWITCH ON TYPED MODE
            self.typed = any(c.name in ("$value", "$object")
                             for c in self._columns)
        else:
            self.typed = typed
Example #4
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({"from": "branches", "format": "list", "limit": 10000}).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(kwargs)
        Log.error("problem getting branches", cause=e)
Example #5
    def __init__(
        self,
        rollover_field,  # the FIELD with a timestamp to use for determining which index to push to
        rollover_interval,  # duration between roll-over to new index
        rollover_max,  # remove old indexes, do not add old records
        schema,  # es schema
        queue_size=10000,  # number of documents to queue in memory
        batch_size=5000,  # number of documents to push at once
        typed=None,  # indicate if we expect typed json
        kwargs=None  # plus additional ES settings
    ):
        if kwargs.tjson != None:
            Log.error("not expected")
        if typed == None:
            Log.error("not expected")

        schema.settings.index.max_result_window = 100000  # REQUIRED FOR ACTIVEDATA NESTED QUERIES
        schema.settings.index.max_inner_result_window = 100000  # REQUIRED FOR ACTIVEDATA NESTED QUERIES

        self.settings = kwargs
        self.locker = Lock("lock for rollover_index")
        self.rollover_field = jx.get(rollover_field)
        self.rollover_interval = self.settings.rollover_interval = Duration(
            rollover_interval)
        self.rollover_max = self.settings.rollover_max = Duration(rollover_max)
        self.known_queues = {}  # MAP DATE TO INDEX
        self.cluster = elasticsearch.Cluster(self.settings)
Example #6
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #7
    def __init__(self,
                 rollover_field,
                 rollover_interval,
                 rollover_max,
                 queue_size=10000,
                 batch_size=5000,
                 kwargs=None):
        """
        :param rollover_field: the FIELD with a timestamp to use for determining which index to push to
        :param rollover_interval: duration between roll-over to new index
        :param rollover_max: remove old indexes, do not add old records
        :param queue_size: number of documents to queue in memory
        :param batch_size: number of documents to push at once
        :param kwargs: plus additional ES settings
        :return:
        """
        self.settings = kwargs
        self.locker = Lock("lock for rollover_index")
        self.rollover_field = jx.get(rollover_field)
        self.rollover_interval = self.settings.rollover_interval = Duration(
            kwargs.rollover_interval)
        self.rollover_max = self.settings.rollover_max = Duration(
            kwargs.rollover_max)
        self.known_queues = {}  # MAP DATE TO INDEX
        self.cluster = elasticsearch.Cluster(self.settings)
Example #8
def setup(settings=None):
    global config

    try:
        config = startup.read_settings(defs={
            "name": ["--process_num", "--process"],
            "help": "Additional port offset (for multiple Flask processes",
            "type": int,
            "dest": "process_num",
            "default": 0,
            "required": False
        },
                                       filename=settings)
        constants.set(config.constants)
        Log.start(config.debug)

        if config.args.process_num and config.flask.port:
            config.flask.port += config.args.process_num

        # PIPE REQUEST LOGS TO ES DEBUG
        if config.request_logs:
            request_logger = elasticsearch.Cluster(
                config.request_logs).get_or_create_index(config.request_logs)
            active_data.request_log_queue = request_logger.threaded_queue(
                max_size=2000)

        # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
        containers.config.default = {
            "type": "elasticsearch",
            "settings": config.elasticsearch.copy()
        }

        # TURN ON /exit FOR WINDOWS DEBUGGING
        if config.flask.debug or config.flask.allow_exit:
            config.flask.allow_exit = None
            Log.warning("ActiveData is in debug mode")
            app.add_url_rule('/exit', 'exit', _exit)

        # TRIGGER FIRST INSTANCE
        FromESMetadata(config.elasticsearch)
        if config.saved_queries:
            setattr(save_query, "query_finder",
                    SaveQueries(config.saved_queries))
        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        if config.flask.ssl_context:
            if config.args.process_num:
                Log.error(
                    "can not serve ssl and multiple Flask instances at once")
            setup_ssl()

        return app
    except Exception as e:
        Log.error(
            "Serious problem with ActiveData service construction!  Shutdown!",
            cause=e)
Example #9
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 settings=None):
        self.settings = settings
        self.name = coalesce(name, alias, index)
        self._es = elasticsearch.Cluster(settings=settings)
        self.metadata = self._es.get_metadata()
        self.columns = None
Example #10
    def __init__(self, settings, queue_size=10000):
        self.settings = settings
        self.queue_size = queue_size
        self.indicies = {}  # MAP DATE (AS UNIX TIMESTAMP) TO INDEX

        es = elasticsearch.Cluster(
            self.settings).get_or_create_index(settings=self.settings)
        es.add_alias(self.settings.index)
        es.set_refresh_interval(seconds=60 * 60)
        self.queue = es.threaded_queue(max_size=self.queue_size,
                                       batch_size=5000,
                                       silent=False)
        self.es = elasticsearch.Alias(alias=settings.index, settings=settings)
Example #11
def main():
    try:
        config = startup.read_settings()
        with startup.SingleInstance(flavor_id=config.args.filename):
            constants.set(config.constants)
            Log.start(config.debug)

            please_stop = Signal("main stop signal")
            coverage_index = elasticsearch.Cluster(config.source).get_index(settings=config.source)
            config.destination.schema = coverage_index.get_schema()
            coverage_summary_index = elasticsearch.Cluster(config.destination).get_or_create_index(read_only=False, settings=config.destination)
            coverage_summary_index.add_alias(config.destination.index)
            Thread.run(
                "processing loop",
                loop,
                config.source,
                coverage_summary_index,
                config,
                please_stop=please_stop
            )
            Thread.wait_for_shutdown_signal(please_stop)
    except Exception as e:
        Log.error("Problem with code coverage score calculation", cause=e)
Example #12
def backfill(settings):

    source = aws.s3.Bucket(settings=settings.source)
    destination = elasticsearch.Cluster(
        settings=settings.destination).get_or_create_index(
            settings=settings.destination)

    keep_trying = True
    while keep_trying:
        try:
            all_keys = source.keys()
            keep_trying = False
        except Exception as e:
            Log.warning("problem", e)
Example #13
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #14
def get_branches(hg, branches, use_cache=True, settings=None):
    if not settings.branches or not use_cache:
        found_branches = _get_branches_from_hg(hg)

        es = elasticsearch.Cluster(settings=branches).get_or_create_index(
            settings=branches)
        es.add_alias()
        es.extend({
            "id": b.name + " " + b.locale,
            "value": b
        } for b in found_branches)
        es.flush()
        return found_branches

    # TRY ES
    try:
        es = elasticsearch.Cluster(settings=branches).get_index(
            settings=branches)
        query = {"query": {"match_all": {}}, "size": 20000}

        docs = es.search(query).hits.hits._source
        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(Math.MAX(docs.etl.timestamp))
        if Date.now() - oldest > OLD_BRANCH:
            return get_branches(use_cache=False, settings=settings)

        try:
            return UniqueIndex(["name", "locale"],
                               data=docs,
                               fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        if "Can not find index " in e:
            return get_branches(use_cache=False, settings=settings)
        Log.error("problem getting branches", cause=e)
Example #15
    def __init__(
            self,
            host,
            index,
            type=None,
            alias=None,
            name=None,
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, alias, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=coalesce(alias, index),
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self._namespace.get_snowflake(
            self.es.settings.alias).columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed
Example #16
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(kwargs=settings.hg.branches).get_or_create_index(kwargs=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
    finally:
        Log.stop()
Example #17
    def __init__(
            self,
            hg=None,  # CONNECT TO hg
            repo=None,  # CONNECTION INFO FOR ES CACHE
            branches=None,  # CONNECTION INFO FOR ES CACHE
            use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
            timeout=30 * SECOND,
            kwargs=None):
        if not _hg_branches:
            _late_imports()

        self.es_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon",
                                     max=DAEMON_QUEUE_SIZE)

        self.settings = kwargs
        self.timeout = Duration(timeout)

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            response = http.head(self.settings.hg.url)

        if branches == None:
            self.branches = _hg_branches.get_branches(kwargs=kwargs)
            self.es = None
            return

        self.last_cache_miss = Date.now()

        set_default(repo, {"schema": revision_schema})
        self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(
            kwargs=repo)

        def setup_es(please_stop):
            with suppress_exception:
                self.es.add_alias()

            with suppress_exception:
                self.es.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.timeout = timeout
        Thread.run("hg daemon", self._daemon)
Example #18
    def __init__(
            self,
            host,
            index,
            type=None,
            alias=None,
            name=None,
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self, None)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index),
                                           kwargs=kwargs)
        else:
            self._es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self.meta = FromESMetadata(kwargs=kwargs)
        self.settings.type = self._es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.meta.get_columns(
            table_name=coalesce(name, alias, index))
        self._schema = Schema(coalesce(name, alias, index), columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = any(
                c.es_column.find("." + TYPE_PREFIX) != -1 for c in columns)
        else:
            self.typed = typed
Example #19
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.settings = kwargs
        self.too_old = TOO_OLD
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Data()
        self.meta.columns = ColumnList(self.es_cluster)
        self.meta.columns.extend(META_TABLES_DESC.columns)
        self.meta.tables = ListContainer(META_TABLES_NAME, [], jx_base.Schema(".", META_TABLES_DESC.columns))
        self.meta.tables.extend([META_COLUMNS_DESC, META_TABLES_DESC])
        self.alias_to_query_paths = {}
        for i, settings in self.es_cluster.get_metadata().indices.items():
            if len(settings.aliases) == 0:
                alias = i
            elif len(settings.aliases) == 1:
                alias = first(settings.aliases)
            else:
                Log.error("expecting only one alias per index")

            desc = TableDesc(
                name=alias,
                url=None,
                query_path=ROOT_PATH,
                last_updated=Date.MIN,
                columns=[]
            )
            self.meta.tables.add(desc)
            self.alias_to_query_paths[alias] = [desc.query_path]
            self.alias_to_query_paths[self._find_alias(alias)] = [desc.query_path]

        # WE MUST PAUSE?

        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return
Example #20
def setup():
    global config

    config = startup.read_settings(
        filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[
            {
                "name": ["--process_num", "--process"],
                "help": "Additional port offset (for multiple Flask processes",
                "type": int,
                "dest": "process_num",
                "default": 0,
                "required": False
            }
        ]
    )

    constants.set(config.constants)
    Log.start(config.debug)

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        request_logger = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = request_logger.threaded_queue(max_size=2000)

    if config.dockerflow:
        def backend_check():
            http.get_json(config.elasticsearch.host + ":" + text_type(config.elasticsearch.port))
        dockerflow(flask_app, backend_check)


    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

    HeaderRewriterFix(flask_app, remove_headers=['Date', 'Server'])
Example #21
    def __init__(self,
                 hg,
                 rate_limit,
                 use_cache=True,
                 cache=None,
                 settings=None):
        self.settings = settings
        self.failure_classification = {
            c.id: c.name
            for c in http.get_json(FAILURE_CLASSIFICATION_URL)
        }
        self.repo = {c.id: c.name for c in http.get_json(REPO_URL)}
        self.hg = hg
        self.cache = elasticsearch.Cluster(cache).get_or_create_index(cache)
        self.locker = Lock()
        self.pending = {}

        self.rate_locker = Lock()
        self.request_times = [0] * rate_limit
        self.request_pointer = 0
Example #22
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #23
    def __init__(
        self,
        host,
        index,
        type=None,
        alias=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        consistency="one",  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self, None)
        if not containers.config.default:
            containers.config.default.settings = kwargs
        self.settings = kwargs
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index), kwargs=kwargs)
        else:
            self._es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self.meta = FromESMetadata(kwargs=kwargs)
        self.settings.type = self._es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.get_columns(table_name=name)
        self._schema = Schema(columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = any(c.name in ("$value", "$object") for c in columns)
        else:
            self.typed = typed
Example #24
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if not v:
                    return []
                else:
                    return [v] + nested_path_of(all_paths[v])

            all = sort_using_key(set(step for path in self.snowflake.query_paths for step in path), key=lambda p: len(split_field(p)))
            for step in sorted(all):
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                nested_path = nested_path_of(all_paths[p])
                if not nested_path:
                    nested_path = ['.']
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=EXISTS,
                    nested_path=nested_path,
                    last_updated=Date.now()
                ))
Example #25
def loop(source, coverage_summary_index, settings, please_stop):
    try:
        cluster = elasticsearch.Cluster(source)
        aliases = cluster.get_aliases()
        candidates = []
        for pairs in aliases:
            if pairs.alias == source.index:
                candidates.append(pairs.index)
        candidates = jx.sort(candidates, {".": "desc"})

        for index_name in candidates:
            coverage_index = elasticsearch.Index(index=index_name, read_only=False, settings=source)
            push_date_filter = unicode2Date(coverage_index.settings.index[-15::], elasticsearch.INDEX_DATE_FORMAT)

            while not please_stop:
                # IDENTIFY NEW WORK
                Log.note("Working on index {{index}}", index=index_name)
                coverage_index.refresh()

                todo = http.post_json(settings.url, json={
                    "from": "coverage",
                    "groupby": ["source.file.name", "build.revision12"],
                    "where": {"and": [
                        {"missing": "source.method.name"},
                        {"missing": "source.file.min_line_siblings"},
                        {"gte": {"repo.push.date": push_date_filter}}
                    ]},
                    "format": "list",
                    "limit": coalesce(settings.batch_size, 100)
                })

                if not todo.data:
                    break

                queue = Queue("pending source files to review")
                queue.extend(todo.data[0:coalesce(settings.batch_size, 100):])

                threads = [
                    Thread.run(
                        "processor" + unicode(i),
                        process_batch,
                        queue,
                        coverage_index,
                        coverage_summary_index,
                        settings,
                        please_stop=please_stop
                    )
                    for i in range(NUM_THREAD)
                ]

                # ADD STOP MESSAGE
                queue.add(Thread.STOP)

                # WAIT FOR THEM TO COMPLETE
                for t in threads:
                    t.join()

        please_stop.go()
        return

    except Exception as e:
        Log.warning("Problem processing", cause=e)
Example #26
                    fuzzytestcase.assertAlmostEqual(e[0], settings)
                    return e[1]
                except Exception:
                    pass
            output = S3Bucket(settings)
            sinks.append((settings, output))
            return output
    else:
        with sinks_locker:
            for e in sinks:
                try:
                    fuzzytestcase.assertAlmostEqual(e[0], settings)
                    return e[1]
                except Exception:
                    pass
            output = elasticsearch.Cluster(settings).get_or_create_index(
                settings)
            if settings.use_daily is not False:
                output = MultiDayIndex(settings)
            else:
                output = output.threaded_queue(max_size=2000, batch_size=1000)
                setattr(output, "keys", lambda prefix: set())

            sinks.append((settings, output))
            return output


def main():

    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],