def update_entity_crf_data(self, entity_name, sentences, **kwargs):
        """
        Store CRF training sentences for one entity in the configured datastore.

        A connection is opened first if none exists yet. Only the Elasticsearch engine is
        handled here; for any other engine the method returns without doing anything.

        Args:
            entity_name (str): Name of the entity whose training data is being populated
            sentences (Dict[str, List[Dict[str, str]]]): sentences keyed by their language
                E.g. {"en": [{"sentence": "hello abc", "entities": ["abc"],}, ...], ...}
            **kwargs: passed through unchanged to
                elastic_search.populate.update_entity_crf_data_populate. For Elasticsearch:
                Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk

        Raises:
            IndexNotFoundException: if no value is configured for
                ELASTICSEARCH_CRF_DATA_INDEX_NAME in the connection settings
        """
        if self._client_or_connection is None:
            self._connect()

        # Nothing to do for engines other than Elasticsearch.
        if self._engine != ELASTICSEARCH:
            return

        # The doc type check deliberately runs before the index name lookup.
        self._check_doc_type_for_crf_data_elasticsearch()

        training_index = self._connection_settings.get(ELASTICSEARCH_CRF_DATA_INDEX_NAME)
        if training_index is None:
            raise IndexNotFoundException('Index for ELASTICSEARCH_CRF_DATA_INDEX_NAME not found. '
                                         'Please configure the same')

        populate_crf_data = elastic_search.populate.update_entity_crf_data_populate
        crf_doc_type = self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE]
        populate_crf_data(connection=self._client_or_connection,
                          index_name=training_index,
                          doc_type=crf_doc_type,
                          logger=ner_logger,
                          sentences=sentences,
                          entity_name=entity_name,
                          **kwargs)
Example #2
0
    def get_crf_data_for_entity_name(self, entity_name, **kwargs):
        """
        Fetch the stored CRF training sentences and their entities for a given entity name.

        A connection is opened first if none exists yet. Only the Elasticsearch engine is
        handled here; for any other engine an empty dictionary is returned.

        Args:
            entity_name (str): Entity name for which training data needs to be obtained
            kwargs: passed through unchanged to elastic_search.query.get_crf_data_for_entity_name.
                For Elasticsearch:
                    Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search

        Returns:
            results_dictionary(dict): Dictionary consisting of the training data for the given
                entity, as returned by elastic_search.query.get_crf_data_for_entity_name.

        Raises:
            IndexNotFoundException: if no value is configured for
                ELASTICSEARCH_CRF_DATA_INDEX_NAME in the connection settings

        Example:
            db = Datastore()
            db.get_crf_data_for_entity_name(entity_name, **kwargs)
            >> {
                'sentence_list': [
                    'My name is hardik',
                    'This is my friend Ajay'
                ],
                'entity_list': [
                    ['hardik'],
                    ['Ajay']
                ]
            }
        """
        ner_logger.debug(
            'Datastore, get_entity_training_data, entity_name %s' %
            entity_name)
        if self._client_or_connection is None:
            self._connect()
        results_dictionary = {}
        if self._engine == ELASTICSEARCH:
            es_training_index = self._connection_settings.get(
                ELASTICSEARCH_CRF_DATA_INDEX_NAME)
            if es_training_index is None:
                raise IndexNotFoundException(
                    'Index for ELASTICSEARCH_CRF_DATA_INDEX_NAME not found. '
                    'Please configure the same')
            self._check_doc_type_for_crf_data_elasticsearch()
            # Falls back to 20 when the settings carry no 'request_timeout'
            # (presumably seconds -- confirm against the query helper).
            request_timeout = self._connection_settings.get(
                'request_timeout', 20)
            results_dictionary = elastic_search.query.get_crf_data_for_entity_name(
                connection=self._client_or_connection,
                index_name=es_training_index,
                doc_type=self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE],
                entity_name=entity_name,
                request_timeout=request_timeout,
                **kwargs)
            # Log the fetched results; this line previously interpolated entity_name
            # under the 'results_dictionary' label.
            ner_logger.debug(
                'Datastore, get_entity_training_data, results_dictionary %s' %
                str(results_dictionary))
        return results_dictionary
Example #3
0
    def transfer_data_internal(es_url, index_to_backup, backup_index):
        """
        Transfer data from index_to_backup to backup_index in ES

        The configuration of index_to_backup is fetched and stripped of the per-index
        keys (creation_date, uuid, provided_name, version) and of its aliases. Any
        existing backup_index is then deleted and recreated from that configuration,
        and the documents are copied over with the _reindex API.

        Args:
            es_url (str): The elasticsearch URL
            index_to_backup (str): The name of the index to take the backup for
            backup_index (str): The name of the backup index

        Raises:
            IndexNotFoundException: if the configuration of index_to_backup could not be fetched
            InternalBackupException: if backup_index could not be created or the reindex
                request did not return HTTP 200
        """
        index_to_backup_url = '{es_url}/{index_to_backup}'.format(
            es_url=es_url, index_to_backup=index_to_backup)
        backup_index_url = '{es_url}/{backup_index}'.format(
            es_url=es_url, backup_index=backup_index)

        # Fetch index to backup config
        index_to_backup_url_response = requests.get(index_to_backup_url)
        if index_to_backup_url_response.status_code != 200:
            message = "index to backup details could not be fetched"
            raise IndexNotFoundException(message)

        index_to_backup_config = json.loads(
            index_to_backup_url_response.content)[index_to_backup]

        # Drop the keys that are specific to the source index so the remaining
        # configuration can be used to create a new index.
        index_settings = index_to_backup_config["settings"]["index"]
        for source_only_key in ("creation_date", "uuid", "provided_name", "version"):
            index_settings.pop(source_only_key, None)
        index_to_backup_config.pop("aliases", None)

        # Best effort: the response is intentionally ignored (the backup index may
        # not exist yet).
        requests.delete(backup_index_url)

        # Stop here if the backup index cannot be created; otherwise the reindex
        # below would run against an index that lacks the copied configuration.
        create_response = requests.put(backup_index_url, json=index_to_backup_config)
        if not create_response.ok:
            message = "backup index " + backup_index + " could not be created"
            raise InternalBackupException(message)

        final_request_dict = {
            "source": {
                "index": index_to_backup,
                "size": 10000
            },
            "dest": {
                "index": backup_index
            }
        }
        reindex_response = requests.post(
            '{es_url}/_reindex'.format(es_url=es_url),
            json=final_request_dict,
            params={
                "refresh": "true",
                "wait_for_completion": "true"
            })
        if reindex_response.status_code != 200:
            message = "transfer from " + index_to_backup + " to " + backup_index + " failed"
            raise InternalBackupException(message)
Example #4
0
    def check_if_index_exits(es_url, index_name):
        """
        This function checks if index exists in es_url

        The check looks for index_name, surrounded by single spaces, in the body
        returned by the `_cat/indices?v` endpoint. Returns None when it is found.

        Args:
            es_url (string): The elasticsearch URL
            index_name (string): Name of the index to check the existence for

        Raises:
            IndexNotFoundException: if index_name is not found in the listing
        """
        index_response = requests.get('{es_url}/_cat/indices?v'.format(**{"es_url": es_url}))
        # check if index is present in source
        # Use the decoded body (.text): .content is bytes, and testing a str for
        # membership in bytes raises TypeError on Python 3.
        if " " + index_name + " " not in index_response.text:
            message = index_name + " does not exist in " + es_url
            ner_logger.debug("check_if_index_exits - " + str(message))
            raise IndexNotFoundException(message)
Example #5
0
    def update_entity_crf_data(self, entity_name, entity_list, language_script,
                               sentence_list, **kwargs):
        """
        Store CRF training sentences and their entities for one entity in the datastore.

        A connection is opened first if none exists yet. Only the Elasticsearch engine is
        handled here; for any other engine the method returns without doing anything.

        Args:
            entity_name (str): Name of the entity whose training data is being populated
            entity_list (list): List consisting of the entities corresponding to the sentence_list
            language_script (str): Language code for the language script used.
            sentence_list (list): List of sentences for training
            **kwargs: passed through unchanged to
                elastic_search.populate.update_entity_crf_data_populate.
                For Elasticsearch:
                Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk

        Raises:
            IndexNotFoundException: if no value is configured for
                ELASTICSEARCH_CRF_DATA_INDEX_NAME in the connection settings
        """
        if self._client_or_connection is None:
            self._connect()

        # Nothing to do for engines other than Elasticsearch.
        if self._engine != ELASTICSEARCH:
            return

        # The doc type check deliberately runs before the index name lookup.
        self._check_doc_type_for_crf_data_elasticsearch()

        training_index = self._connection_settings.get(ELASTICSEARCH_CRF_DATA_INDEX_NAME)
        if training_index is None:
            raise IndexNotFoundException(
                'Index for ELASTICSEARCH_CRF_DATA_INDEX_NAME not found. '
                'Please configure the same')

        populate_crf_data = elastic_search.populate.update_entity_crf_data_populate
        crf_doc_type = self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE]
        populate_crf_data(
            connection=self._client_or_connection,
            index_name=training_index,
            doc_type=crf_doc_type,
            logger=ner_logger,
            entity_list=entity_list,
            sentence_list=sentence_list,
            entity_name=entity_name,
            language_script=language_script,
            **kwargs)