Ejemplo n.º 1
0
class BiothingESWebSettings(BiothingWebSettings):
    '''
    `BiothingWebSettings`_ subclass with functions specific to an elasticsearch backend.

    * Use the known live ES connection if more than one is specified.
    * Cache source metadata stored under the _meta field in es indices.

    '''

    ES_VERSION = elasticsearch.__version__[0]

    def __init__(self, config='biothings.web.settings.default'):
        '''
        The ``config`` init parameter specifies a module that configures
        this biothing.  For more information see `config module`_ documentation.
        '''
        super(BiothingESWebSettings, self).__init__(config)

        # temp. move import there to hide async dep unless used
        from elasticsearch_async.transport import AsyncTransport
        from elasticsearch_dsl.connections import Connections

        # elasticsearch connections
        self._connections = Connections()

        connection_settings = {
            "hosts": self.ES_HOST,
            "timeout": self.ES_CLIENT_TIMEOUT,
            "max_retries":
            1,  # maximum number of retries before an exception is propagated
            "timeout_cutoff":
            1,  # number of consecutive failures after which the timeout doesn’t increase
            "selector_class": KnownLiveSelecter
        }
        self._connections.create_connection(alias='sync',
                                            **connection_settings)
        connection_settings.update(transport_class=AsyncTransport)
        self._connections.create_connection(alias='async',
                                            **connection_settings)

        # project metadata under index mappings
        self._source_metadata = {}

        # populate field notes if exist
        try:
            inf = open(self.AVAILABLE_FIELDS_NOTES_PATH, 'r')
            self._fields_notes = json.load(inf)
            inf.close()
        except Exception:
            self._fields_notes = {}

        # initialize payload for standalone tracking batch
        self.tracking_payload = []

    def validate(self):
        '''
        Additional ES settings to validate.
        '''
        super().validate()

        assert isinstance(self.ES_INDEX, str)
        assert isinstance(self.ES_DOC_TYPE, str)
        assert isinstance(self.ES_INDICES, dict)
        assert '*' not in self.ES_DOC_TYPE

        self.ES_INDICES[self.ES_DOC_TYPE] = self.ES_INDEX

    def get_es_client(self):
        '''
        Return the default blocking elasticsearch client.
        The connection is created upon first call.
        '''
        return self._connections.get_connection('sync')

    def get_async_es_client(self):
        '''
        Return the async elasitcsearch client. API calls return awaitable objects.
        The connection is created upon first call.
        '''
        return self._connections.get_connection('async')

    def get_source_metadata(self, biothing_type=None, latest=True):
        '''
        Get metadata defined in the ES index.

        :param biothing_type: If multiple biothings are defined, specify which here.
        :param latest: If set to `false`, return the cached copy. Otherwise retrieve latest.

        '''
        biothing_type = biothing_type or self.ES_DOC_TYPE
        cached = biothing_type in self._source_metadata

        if latest or not cached:
            kwargs = {
                'index': self.ES_INDICES[biothing_type],
                'allow_no_indices': True,
                'ignore_unavailable': True,
                'local': not latest
            }
            if self.ES_VERSION < 7:
                kwargs['doc_type'] = biothing_type

            mappings = self.get_es_client().indices.get_mapping(**kwargs)
            metadata = {}

            for index in mappings:

                if self.ES_VERSION < 7:
                    _meta = mappings[index]['mappings'][biothing_type].get(
                        '_meta', {})
                else:
                    _meta = mappings[index]['mappings'].get('_meta', {})

                metadata.update(_meta)

            self._source_metadata[biothing_type] = metadata

        return self._source_metadata[biothing_type]

    def get_field_notes(self):
        '''
        Return the cached field notes associated with this instance.
        '''
        return self._fields_notes

    ##### COMPATIBILITY METHODS #####

    @property
    def es_client(self):
        return self.get_es_client()

    @property
    def async_es_client(self):
        return self.get_async_es_client()

    def source_metadata(self):
        return self.get_source_metadata()

    def doc_url(self, bid):
        return os.path.join(self.URL_BASE, self.API_VERSION, self.ES_DOC_TYPE,
                            bid)

    def available_fields_notes(self):
        return self._fields_notes
Ejemplo n.º 2
0
class DataConnections:
    """
        Elasticsearch Database Connection
        Connections are created upon first call
    """
    _should_log_es_py_ver = run_once()  # evaluate to true once
    _should_log_es_host_ver = run_once()  # evaluate to true once per host

    def __init__(self, settings):

        self.settings = settings
        self._connections = Connections()

        connection_settings = {
            "hosts": self.settings.ES_HOST,
            "timeout": self.settings.ES_CLIENT_TIMEOUT
        }
        connection_settings.update(transport_class=BiothingsTransport)
        self._connections.create_connection(alias='sync',
                                            **connection_settings)
        connection_settings.update(transport_class=BiothingsAsyncTransport)
        if self.settings.ES_SNIFF:
            connection_settings.update(sniffer_timeout=60)
            connection_settings.update(sniff_on_start=True)
            connection_settings.update(sniff_on_connection_fail=True)
        self._connections.create_connection(alias='async',
                                            **connection_settings)

    async def log_versions(self):

        if DataConnections._should_log_es_py_ver():

            self.settings.logger.info(
                "Python Elasticsearch Version: %s",
                '.'.join(map(str, elasticsearch.__version__)))
            self.settings.logger.info(
                "Python Elasticsearch DSL Version: %s",
                '.'.join(map(str, elasticsearch_dsl.__version__)))

            if elasticsearch.__version__[0] != elasticsearch_dsl.__version__[0]:
                self.settings.logger.error(
                    "ES Pacakge Version Mismatch with ES-DSL.")

        if DataConnections._should_log_es_host_ver(self.settings.ES_HOST):

            versions = await get_es_versions(self.async_client)
            self.settings.logger.info('[%s] %s: %s', self.settings.ES_HOST,
                                      versions["elasticsearch_cluster"],
                                      versions["elasticsearch_version"])

            major_version = versions["elasticsearch_version"].split('.')[0]
            if major_version.isdigit(
            ) and int(major_version) != elasticsearch.__version__[0]:
                self.settings.logger.error("ES Python Version Mismatch.")

    def get_connection(self, connection):
        return self._connections.get_connection(connection)

    @property
    def client(self):
        '''
        Return the blocking elasticsearch client.
        '''
        return self.get_connection('sync')

    @property
    def async_client(self):
        '''
        Return the async elasitcsearch client.
        API calls return awaitable objects.
        '''
        return self.get_connection('async')
Ejemplo n.º 3
0
class BiothingESWebSettings(BiothingWebSettings):
    '''
    `BiothingWebSettings`_ subclass with functions specific to an elasticsearch backend.
    '''
    def __init__(self, config=None, parent=None, **kwargs):
        '''
        The ``config`` init parameter specifies a module that configures
        this biothing.  For more information see `config module`_ documentation.
        '''
        super(BiothingESWebSettings, self).__init__(config, parent, **kwargs)

        # elasticsearch connections
        self._connections = Connections()

        connection_settings = {
            "hosts": self.ES_HOST,
            "timeout": self.ES_CLIENT_TIMEOUT
        }
        connection_settings.update(transport_class=BiothingsTransport)
        self._connections.create_connection(alias='sync',
                                            **connection_settings)
        connection_settings.update(transport_class=BiothingsAsyncTransport)
        if self.ES_SNIFF:
            connection_settings.update(sniffer_timeout=60)
            connection_settings.update(sniff_on_start=True)
            connection_settings.update(sniff_on_connection_fail=True)
        self._connections.create_connection(alias='async',
                                            **connection_settings)

        # cached index mappings
        self.source_metadata = defaultdict(dict)
        self.source_properties = defaultdict(dict)

        # populate field notes if exist
        try:
            inf = open(self.AVAILABLE_FIELDS_NOTES_PATH, 'r')
            self._fields_notes = json.load(inf)
            inf.close()
        except Exception:
            self._fields_notes = {}

        # user query data
        self.userquery = ESUserQuery(self.USERQUERY_DIR)

        # query pipelines
        self.query_builder = self.load_class(self.ES_QUERY_BUILDER)(self)
        self.query_backend = self.load_class(self.ES_QUERY_BACKEND)(self)
        self.result_transform = self.load_class(self.ES_RESULT_TRANSFORM)(self)

        # initialize payload for standalone tracking batch
        self.tracking_payload = []

        self.ES_INDICES = dict(self.ES_INDICES)  # TODO
        self.ES_INDICES[self.ES_DOC_TYPE] = self.ES_INDEX
        self.BIOTHING_TYPES = list(self.ES_INDICES.keys())

        IOLoop.current().add_callback(self._initialize)

    async def _initialize(self):

        # failures will be logged concisely
        logging.getLogger('elasticsearch.trace').propagate = False

        if should_log_es_py_ver():
            self.logger.info("Python Elasticsearch Version: %s",
                             elasticsearch.__version__)
            self.logger.info("Python Elasticsearch DSL Version: %s",
                             elasticsearch_dsl.__version__)
            if elasticsearch.__version__[0] != elasticsearch_dsl.__version__[0]:
                self.logger.error("ES Pacakge Version Mismatch with ES-DSL.")
        if should_log_es_host_ver(self.ES_HOST):
            versions = await get_es_versions(self.async_es_client)
            self.logger.info('Elasticsearch Version: %s',
                             versions["elasticsearch_version"])
            self.logger.info('Elasticsearch Cluster: %s',
                             versions["elasticsearch_cluster"])
            major_version = versions["elasticsearch_version"].split('.')[0]
            if major_version.isdigit(
            ) and int(major_version) != elasticsearch.__version__[0]:
                self.logger.error("ES Python Version Mismatch.")

        # populate source mappings
        for biothing_type in self.ES_INDICES:
            await self.read_index_mappings(biothing_type)

        # resume normal log flow
        logging.getLogger('elasticsearch.trace').propagate = True

    def validate(self):
        '''
        Additional ES settings to validate.
        '''
        super().validate()

        assert isinstance(self.ES_INDEX, str)
        assert isinstance(self.ES_DOC_TYPE, str)
        assert isinstance(self.ES_INDICES, dict)
        assert '*' not in self.ES_DOC_TYPE

    def get_es_client(self):
        '''
        Return the default blocking elasticsearch client.
        The connection is created upon first call.
        '''
        return self._connections.get_connection('sync')

    def get_async_es_client(self):
        '''
        Return the async elasitcsearch client.
        The connection is created upon first call.
        API calls return awaitable objects.
        '''
        return self._connections.get_connection('async')

    async def read_index_mappings(self, biothing_type=None):
        """
        Read ES index mappings for the corresponding biothing_type,
        Populate datasource info and field properties from mappings.
        Return ES raw response. This implementation combines indices.

        The ES response would look like: (for es7+)
        {
            'index_1': {
                'properties': { ... },  ---> source_properties
                '_meta': {
                    "src" : { ... }     ---> source_licenses
                    ...
                },              -----------> source_metadata
                ...
            },
            'index_2': {
                ...     ---------> Combine with results above
            }
        }
        """

        biothing_type = biothing_type or self.ES_DOC_TYPE
        try:
            mappings = await self.async_es_client.indices.get_mapping(
                index=self.ES_INDICES[biothing_type],
                allow_no_indices=True,
                ignore_unavailable=True,
                local=False)
        except elasticsearch.TransportError as exc:
            self.logger.error('Error loading index mapping for [%s].',
                              biothing_type)
            self.logger.debug(str(exc))
            return None

        metadata = self.source_metadata[biothing_type]
        properties = self.source_properties[biothing_type]
        licenses = self.result_transform.source_licenses[biothing_type]

        metadata.clear()
        properties.clear()
        licenses.clear()

        metadata['_biothing'] = biothing_type
        metadata['_indices'] = list(mappings.keys())

        for index in mappings:

            mapping = mappings[index]['mappings']

            if mapping and elasticsearch.__version__[0] < 7:
                # remove doc_type, support 1 type per index
                mapping = next(iter(mapping.values()))

            if '_meta' in mapping:
                for key, val in mapping['_meta'].items():
                    # combine dict from multiple index
                    if key in metadata and isinstance(val, dict) \
                            and isinstance(metadata[key], dict):
                        metadata[key].update(val)
                    else:  # otherwise set/replace
                        metadata[key] = val

                # metadata.update(mapping['_meta'])  # alternative, no combine

                if 'src' in mapping['_meta']:
                    for src, info in mapping['_meta']['src'].items():
                        if 'license_url_short' in info:
                            licenses[src] = info['license_url_short']
                        elif 'license_url' in info:
                            licenses[src] = info['license_url']

            if 'properties' in mapping:
                properties.update(mapping['properties'])

        return mappings

    def get_field_notes(self):
        '''
        Return the cached field notes associated with this instance.
        '''
        return self._fields_notes

    @property
    def es_client(self):
        return self.get_es_client()

    @property
    def async_es_client(self):
        return self.get_async_es_client()