Beispiel #1
0
async def related_word_extractor(parent_docid, doc_datetime, term, debug=False):
    """Collect the words found close to ``term`` inside one indexed document.

    A highlight query is run against the index named
    ``INDEX_DOCUMENTS + "-" + <date>``, where ``<date>`` is the part of
    ``doc_datetime`` before the first ``"T"`` with ``"-"`` replaced by ``"."``.
    The query is restricted to the document whose ``_id`` equals
    ``parent_docid``. Every highlighted ``doc_title`` / ``doc_content``
    fragment is handed to ``get_close_word`` and the results are merged.

    :param parent_docid: ``_id`` of the document to inspect.
    :param doc_datetime: timestamp string of the document; only the part
        before the first ``"T"`` is used (the whole string if there is none).
    :param term: query string, matched with ``AND`` semantics against
        ``doc_title`` and ``doc_content``.
    :param debug: forwarded unchanged to ``get_close_word``.
    :return: list of collected words longer than one character, without
        duplicates, in the order in which they were first seen.
    """
    es = Elasticsearch(['%s:%d' % (es_ip, es_port)])
    highlight_req = {
        "_source": [""],
        "query": {
            "bool": {
                "filter": [
                    {"term": {"_id": parent_docid}},
                    {
                        "query_string": {
                            "query": term,
                            "fields": ["doc_title", "doc_content"],
                            "default_operator": "AND",
                        }
                    },
                ]
            }
        },
        "highlight": {
            "fields": {
                "_all": {},
                "doc_title": {
                    "fragment_size": 30,
                    "number_of_fragments": 1,
                    "fragmenter": "simple",
                },
                "doc_content": {
                    "fragment_size": 30,
                    "number_of_fragments": 3,
                    "fragmenter": "simple",
                },
            }
        },
    }

    # partition() leaves the string intact when it contains no "T";
    # slicing with find() would silently drop the last character (-1 index).
    doc_date = doc_datetime.partition("T")[0]
    index_name = INDEX_DOCUMENTS + "-" + doc_date.replace("-", ".")

    related = []
    try:
        result = await es.search(index=index_name,
                                 doc_type=TYPE_DOC,
                                 body=highlight_req)

        if result['hits']['total'] > 0:
            title_fragments = []
            content_fragments = []
            for hit in result['hits']['hits']:
                # A hit is not guaranteed to carry a "highlight" section.
                highlight = hit.get('highlight', {})
                if 'doc_title' in highlight:
                    title_fragments = list(highlight['doc_title'])
                if 'doc_content' in highlight:
                    content_fragments = list(highlight['doc_content'])

            for fragment in title_fragments + content_fragments:
                related += await get_close_word(fragment, debug)
    finally:
        # Release the client even when the search or the word lookup fails.
        es.close()

    # Order-preserving de-duplication in a single pass.
    seen = set()
    unique_words = []
    for word in related:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)

    return [word for word in unique_words if len(word) > 1]
Beispiel #2
0
def client(es_params, index, loop):
    """Yield an ``Elasticsearch`` client bound to ``loop``.

    Before the client is handed out, a delete request for ``index`` is run
    on ``loop``; a ``NotFoundError`` raised by it is ignored. The client is
    closed when the generator is resumed after the ``yield``.

    :param es_params: mapping providing the ``'host'`` entry to connect to.
    :param index: first argument of the initial delete request.
    :param loop: event loop used for the client and for the delete request.
    """
    hosts = [{'host': es_params['host']}]
    es_client = Elasticsearch(hosts, loop=loop)

    try:
        pending_delete = es_client.delete(index, '', '')
        loop.run_until_complete(pending_delete)
    except NotFoundError:
        # Nothing to remove.
        pass

    yield es_client

    es_client.close()
Beispiel #3
0
def client(es_params, loop, repo_name, snapshot_name):
    """Yield an ``Elasticsearch`` client bound to ``loop`` for snapshot use.

    Before the client is handed out, a delete request for ``INDEX`` is run
    on ``loop``. After the generator is resumed, the snapshot
    ``snapshot_name`` and then the repository ``repo_name`` are deleted and
    the client is closed. ``NotFoundError`` is ignored for each of the three
    delete requests.

    :param es_params: mapping providing the ``'host'`` entry to connect to.
    :param loop: event loop used for the client and every request.
    :param repo_name: snapshot repository removed during clean-up.
    :param snapshot_name: snapshot removed during clean-up.
    """
    hosts = [{'host': es_params['host']}]
    es_client = Elasticsearch(hosts, loop=loop)

    try:
        loop.run_until_complete(es_client.delete(INDEX, '', ''))
    except NotFoundError:
        pass

    yield es_client

    # Clean up just in case: snapshot first, then its repository.
    cleanup_steps = (
        (es_client.snapshot.delete, (repo_name, snapshot_name)),
        (es_client.snapshot.delete_repository, (repo_name,)),
    )
    for remove, args in cleanup_steps:
        try:
            loop.run_until_complete(remove(*args))
        except NotFoundError:
            pass

    es_client.close()
class ElasticsearchController:
    """Asynchronous convenience wrapper around one ``Elasticsearch`` client.

    Usage order: supply the event loop with ``set_loop``, create the client
    with ``add_elasticsearch_connection``, then use the record coroutines,
    which delegate to that client. ``cleanup`` closes the client again.
    """

    def __init__(self):
        # Legacy attribute kept for backward compatibility; nothing in this
        # class reads it.
        self._elasticsearch = None
        # Every method reads ``_elasticsearch_client``, so it must exist even
        # before a connection has been opened.
        self._elasticsearch_client = None
        self.elasticsearch_conn = False
        self._loop = None

    def get_loop(self):
        """Return the event loop set with ``set_loop`` (``None`` if unset)."""
        return self._loop

    def get_elasticsearch_client(self):
        """Return the underlying client (``None`` before a connection exists)."""
        return self._elasticsearch_client

    @timestamped
    async def get_value(self, index: str, id: str, doc_type='_all'):
        """
        Get record from elasticsearch using id and index
        :param index: used to search the named index for records
        :param id: used to find record in the index
        :param doc_type: accepted for interface compatibility; it is not
            forwarded to the client
        :return: the record returned by the client (decoded as UTF-8 when the
            client returns bytes), or "" when the record does not exist
        """
        if not await self.exists(index, id):
            logger.error(
                "Elasticsearch record doesn't exist when it should. "
                "index=%s id=%s", index, id)
            return ""

        result = await self._elasticsearch_client.get(index, id)
        # Only raw bytes need decoding; anything else is returned untouched.
        if isinstance(result, bytes):
            result = result.decode('utf-8')
        return result

    @timestamped
    async def create(self, index, doc_type, body, id=None):
        """
        Used to create new record in the elasticsearch database
        :param index: used to create in that specific index
        :param doc_type: specify elasticsearch document type
        :param body: actual body of the record to be created in the database
        :param id: identifier for the new record, forwarded to the client
        :return: whatever the client's ``create`` call returns
        """
        return await self._elasticsearch_client.create(index,
                                                       doc_type,
                                                       body,
                                                       id=id)

    async def exists(self, index, id):
        """
        Used to check if record exist in the elasticsearch database using id and index
        :param index: search index for record
        :param id: find record for the id
        :return: whatever the client's ``exists`` call returns
        """
        return await self._elasticsearch_client.exists(index, id)

    @timestamped
    async def update(self, index, doc_type, id, body=None):
        """
        Used to update record in the elasticsearch database
        :param index: select index to update record in that index
        :param doc_type: specify elasticsearch document type
        :param id: Identifier for the record to be updated
        :param body: the actual body for the record
        :return: whatever the client's ``update`` call returns
        """
        return await self._elasticsearch_client.update(index,
                                                       doc_type,
                                                       id,
                                                       body=body)

    @timestamped
    async def search(self, index=None, doc_type=None, body=None):
        """
        Used to search for the record in the elasticsearch database
        :param index: used to search the index
        :param doc_type: document type forwarded to the client
        :param body: query to be executed to match the result
        :return: whatever the client's ``search`` call returns
        """
        return await self._elasticsearch_client.search(index=index,
                                                       doc_type=doc_type,
                                                       body=body)

    @timestamped
    async def delete(self, index, doc_type, id):
        """
        Used to delete record from elasticsearch database
        :param index: specify the index for the record to be deleted
        :param doc_type: document type forwarded to the client
        :param id: specify the id for the record to be deleted
        :return: whatever the client's ``delete`` call returns
        """
        return await self._elasticsearch_client.delete(index, doc_type, id)

    def set_loop(self, loop):
        """Store the event loop later passed to the client constructor."""
        self._loop = loop

    @timestamped
    async def add_elasticsearch_connection(self):
        """
        Create the elasticsearch client from environment configuration.

        Host and port are read from ``ELASTICSEARCH_HOST`` and
        ``ELASTICSEARCH_PORT``. When no loop has been set, an error is logged
        and no client is created.
        :raises Exception: when constructing the client fails
        """
        elasticsearch_host = None
        elasticsearch_port = None
        try:
            elasticsearch_host = os.getenv("ELASTICSEARCH_HOST",
                                           "redis.dev.muchneededllc.com")
            elasticsearch_port = str(os.getenv("ELASTICSEARCH_PORT", 9200))

        except OSError as e:
            logger.error(
                "Couldn't get environmental variables for elasticearch. " +
                str(e))
            exit(1)

        try:
            if self._loop is not None:
                address = ':'.join([elasticsearch_host, elasticsearch_port])
                self._elasticsearch_client = Elasticsearch([address],
                                                           loop=self._loop)

                logger.info(self._elasticsearch_client)
                self.elasticsearch_conn = True
                logger.debug("Created Elasticsearch Client.")
            else:
                logger.error(
                    "Couldn't create elasticsearch client because loop hasn't been set."
                )

        except Exception as e:
            logger.error("couldn't open elasticsearch.")
            # Same exception type as before, now chained to the real cause.
            raise Exception(e) from e

    def cleanup(self):
        """Close the client if a connection was successfully created."""
        if self.elasticsearch_conn:
            self._elasticsearch_client.close()
Beispiel #5
0
class ElasticSearchManager(DefaultSearchUtility):
    """Manage the elasticsearch connection and per-container index metadata.

    Index name, index version and "next index" version are stored in the
    container registry (keys ``el_index_name``, ``el_index_version`` and
    ``el_next_index_version``). Configuration is read from the
    ``'elasticsearch'`` section of ``app_settings``.
    """

    def __init__(self, settings=None, loop=None):
        """
        :param settings: accepted for interface compatibility only; the
            effective configuration comes from the ``settings`` property.
        :param loop: event loop handed to the ``Elasticsearch`` client.
        """
        self.loop = loop
        self._conn = None
        self._migration_lock = None

    @property
    def bulk_size(self):
        """Configured ``bulk_size`` (default 50)."""
        return self.settings.get('bulk_size', 50)

    @property
    def settings(self):
        """The ``'elasticsearch'`` section of ``app_settings`` (``{}`` if absent)."""
        return app_settings.get('elasticsearch', {})

    @property
    def conn(self):
        """Lazily created ``Elasticsearch`` client built from ``connection_settings``."""
        if self._conn is None:
            self._conn = Elasticsearch(loop=self.loop,
                                       **self.settings['connection_settings'])
        return self._conn

    @property
    def enabled(self):
        """True when at least one endpoint is configured."""
        connection_settings = self.settings.get('connection_settings', {})
        return len(connection_settings.get('endpoints', [])) > 0

    async def initialize(self, app):
        """Remember ``app`` and create the migration lock."""
        self.app = app
        self._migration_lock = asyncio.Lock()

    async def finalize(self, app):
        """Close the client if one was created."""
        if self._conn is not None:
            self._conn.close()

    async def get_registry(self, container, request):
        """Return the container registry, caching it on the request.

        When ``request`` is ``None`` the current request is used. The
        registry is loaded from the container annotations only if the
        request has no ``container_settings`` attribute yet.
        """
        if request is None:
            request = get_current_request()
        if hasattr(request, 'container_settings'):
            return request.container_settings
        annotations_container = IAnnotations(container)
        request.container_settings = await annotations_container.async_get(
            REGISTRY_DATA_KEY)
        return request.container_settings

    async def get_real_index_name(self, container, request=None):
        """Return ``<index name>_<version>`` for the container."""
        index_name = await self.get_index_name(container, request)
        version = await self.get_version(container, request)
        return index_name + '_' + str(version)

    async def get_index_name(self, container, request=None):
        """Return the registry's index name, or ``<prefix><container.id>``.

        The prefix is the ``index_name_prefix`` setting and defaults to
        ``'guillotina-'``.
        """
        registry = await self.get_registry(container, request)

        try:
            result = registry['el_index_name']
        except KeyError:
            # Go through the ``settings`` property so that a missing
            # 'elasticsearch' section falls back to the default prefix
            # instead of raising KeyError.
            prefix = self.settings.get('index_name_prefix', 'guillotina-')
            result = prefix + container.id
        return result

    async def get_next_index_name(self, container, request=None):
        """Return ``<index name>_<next version>``, or ``None`` if no next index is set."""
        registry = await self.get_registry(container, request)
        if ('el_next_index_version' not in registry
                or registry['el_next_index_version'] is None):
            return None
        index_name = await self.get_index_name(container, request)
        version = registry['el_next_index_version']
        return index_name + '_' + str(version)

    async def set_index_name(self, container, name, request=None):
        """Store ``name`` as the container's index name in the registry."""
        registry = await self.get_registry(container, request)
        registry['el_index_name'] = name
        registry._p_register()

    async def initialize_catalog(self, container):
        """Recreate the container's index and alias and install the mappings.

        Does nothing when the utility is not enabled.
        """
        if not self.enabled:
            return
        await self.remove_catalog(container)
        index_name = await self.get_index_name(container)
        real_index_name = await self.get_real_index_name(container)

        await safe_es_call(self.conn.indices.create, real_index_name)
        await safe_es_call(self.conn.indices.put_alias, index_name,
                           real_index_name)
        await safe_es_call(self.conn.indices.close, index_name)
        await safe_es_call(self.install_mappings_on_index, index_name)

        await self.conn.indices.open(index_name)
        await self.conn.cluster.health(wait_for_status='yellow')
        await self.set_index_name(container, index_name)

    async def remove_catalog(self, container):
        """Close and delete the container's index and alias.

        Does nothing when the utility is not enabled.
        """
        if not self.enabled:
            return
        index_name = await self.get_index_name(container)
        real_index_name = await self.get_real_index_name(container)
        await safe_es_call(self.conn.indices.close, real_index_name)
        await safe_es_call(self.conn.indices.delete_alias, real_index_name,
                           index_name)
        await safe_es_call(self.conn.indices.delete, real_index_name)
        await safe_es_call(self.conn.indices.delete, index_name)

    async def get_version(self, container, request=None):
        """Return the registry's index version (1 when none is stored)."""
        registry = await self.get_registry(container, request)
        try:
            version = registry['el_index_version']
        except KeyError:
            version = 1
        return version

    async def set_version(self, container, version, request=None, force=False):
        """Store ``version`` as the container's index version.

        :raises Exception: when a next index version is set in the registry
            and ``force`` is false.
        """
        registry = await self.get_registry(container, request)
        if (not force and 'el_next_index_version' in registry
                and registry['el_next_index_version'] is not None):
            raise Exception(
                'Cannot change index while migration is in progress')
        registry['el_index_version'] = version
        registry._p_register()

    async def stats(self, container):
        """Return the index statistics for the container's index."""
        index_name = await self.get_index_name(container)
        return await self.conn.indices.stats(index_name)

    async def install_mappings_on_index(self, index_name):
        """Apply index settings and all mappings to ``index_name``.

        The index is closed first and reopened after the settings and
        mappings have been written.
        """
        mappings = get_mappings()
        index_settings = DEFAULT_SETTINGS.copy()
        index_settings.update(app_settings.get('index', {}))
        await self.conn.indices.close(index_name)
        await self.conn.indices.put_settings(index_settings, index_name)
        for key, value in mappings.items():
            await self.conn.indices.put_mapping(index_name, key, value)
        await self.conn.indices.open(index_name)

    async def activate_next_index(self,
                                  container,
                                  version,
                                  request=None,
                                  force=False):
        '''
        Next index support designates an index to also push
        delete and index calls to

        Unless ``force`` is true, an already set next index version makes
        the assertion below fail.
        '''
        registry = await self.get_registry(container, request)
        if not force:
            try:
                # NOTE(review): assert is stripped under ``python -O``; kept
                # so callers keep seeing AssertionError.
                assert registry['el_next_index_version'] is None
            except KeyError:
                # No next index has ever been registered.
                pass
        registry['el_next_index_version'] = version
        registry._p_register()

    async def disable_next_index(self, container, request=None):
        '''
        Clear the next index version stored in the registry.
        '''
        registry = await self.get_registry(container, request)
        registry['el_next_index_version'] = None
        registry._p_register()

    async def apply_next_index(self, container, request=None):
        '''
        Promote the next index version to the current index version and
        clear the next index version.
        '''
        # make sure to reload the registry to make sure we have the latest
        # to write to
        if (request is not None and hasattr(request, 'container_settings')
                and REGISTRY_DATA_KEY in container.__annotations__):
            await request._txn.refresh(request.container_settings)
        registry = await self.get_registry(container, request)
        # NOTE(review): assert is stripped under ``python -O``; kept so
        # callers keep seeing AssertionError.
        assert registry['el_next_index_version'] is not None
        await self.set_version(container,
                               registry['el_next_index_version'],
                               request,
                               force=True)
        registry['el_next_index_version'] = None
        registry._p_register()