Example #1
    def flush(self):
        """
        Flushes the buffer into ES

        :return: None
        """
        if self._timer is not None and self._timer.is_alive():
            self._timer.cancel()
        self._timer = None

        if self._buffer:
            try:
                with self._buffer_lock:
                    logs_buffer = self._buffer
                    self._buffer = []
                actions = (
                    {
                        '_index': self._index_name_func.__func__(self.es_index_name),
                        '_source': log_record
                    }
                    for log_record in logs_buffer
                )
                eshelpers.bulk(client=self.__get_es_client(),
                               actions=actions,
                               stats_only=True)
            except Exception as exception:
                if self.raise_on_indexing_exceptions:
                    raise exception
Example #2
def load_file(test_executor, filename="accounts.json"):
    opensearch = test_executor.client

    filepath = "./test_data/" + filename

    # generate iterable data
    def load_json():
        with open(filepath, "r") as f:
            for line in f:
                yield json.loads(line)

    helpers.bulk(opensearch, load_json(), index=TEST_INDEX_NAME)
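For larger test files, the same load could be done with opensearch-py's helpers.streaming_bulk, which processes the generator in chunks and yields a per-document result. A minimal sketch with a placeholder client, index name, and data (none of these come from the test helper above):

# Minimal sketch using helpers.streaming_bulk; the client, index name, and data
# below are placeholders, not part of the original test helper.
from opensearchpy import OpenSearch, helpers

client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])  # placeholder client
docs = ({"account_number": i, "balance": 100 * i} for i in range(10))  # placeholder data

# streaming_bulk yields an (ok, item) tuple for each document as it is indexed
for ok, item in helpers.streaming_bulk(client, docs, index="accounts", chunk_size=500):
    if not ok:
        print("failed to index document:", item)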
Example #3
    def add_data_bulk(
        self,
        data: List[IndexableData],
        index_base_name: Optional[str] = None,
    ) -> None:
        index_name = self._get_wip_alias(index_base_name
                                         or self.index_base_names[0])

        body = [{
            "_index": index_name,
            "_source": asdict(d) if is_dataclass(d) else d
        } for d in data]
        try:
            bulk(self.es, body)
        except ConnectionError as e:
            logger.error(e)
Example #4
def save_case_list(event, context=None):
    """
  End-point: Saves list of case instances
  Creates index for the casebase if one does not exist
  """
    # try:
    doc_list = json.loads(event['body'])  # parameters in request body
    es = getESConn()
    pid = event['pathParameters']['id']
    proj = utility.getByUniqueField(es, projects_db, "_id", pid)  # project
    # create index with mapping if it does not exist already
    project.indexMapping(es, proj)

    # Add documents to created index
    # print("Adding a hash field to each case for duplicate-checking")
    for x in doc_list:  # generate a hash after ordering dict by key
        # add vectors to Semantic USE fields
        x = retrieve.add_vector_fields(proj['attributes'], x)
        # use lowercase values for EqualIgnoreCase fields
        x = retrieve.add_lowercase_fields(proj['attributes'], x)
        # case hash for easy detection of duplicates
        x['hash__'] = str(
            hashlib.md5(
                json.dumps(OrderedDict(sorted(x.items()))).encode('utf-8')
            ).digest())
    # print("Attempting to index the list of docs using helpers.bulk()")
    resp = helpers.bulk(es, doc_list, index=proj['casebase'], doc_type="_doc")

    # Indicate that the project has a casebase
    # print("Casebase added. Attempting to update project detail. Set hasCasebase => True")
    proj['hasCasebase'] = True
    source_to_update = {'doc': proj}
    # print(source_to_update)
    res = es.update(index=projects_db, id=pid, body=source_to_update)
    # print(res)

    # create the ontology similarity if specified as part of project attributes (can be a lengthy operation for mid to large ontologies!)
    for attrib in proj['attributes']:
        if attrib['type'] == "Ontology Concept" and attrib.get(
                'similarityType') is not None and attrib.get(
                    'options') is not None and retrieve.checkOntoSimilarity(
                        attrib['options'].get('id'))['statusCode'] != 200:
            sim_method = 'san' if attrib[
                'similarityType'] == 'Feature-based' else 'wup'
            retrieve.setOntoSimilarity(
                attrib['options'].get('id'),
                attrib['options'].get('sources'),
                relation_type=attrib['options'].get('relation_type'),
                root_node=attrib['options'].get('root'),
                similarity_method=sim_method)

    response = {
        "statusCode": 201,
        "headers": headers,
        "body": json.dumps(resp)
    }
    return response
Example #5
def bulk_insert_data_to_es(elasticsearch_connection, data, index, bulk_size=100):
    try:
        batch_data = get_list_by_chunk_size(data, bulk_size)
        for batch in batch_data:
            actions = [{"_index": index, "_source": document} for document in batch]
            helpers.bulk(elasticsearch_connection, actions)
        return True
    except Exception:
        e = sys.exc_info()
        print("Bulk insertion job failed")
        print(e)
        return False
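The get_list_by_chunk_size helper used above is not shown; a hypothetical sketch of such a chunking function (the name and slicing behaviour are assumptions) could be:

# Hypothetical sketch of a get_list_by_chunk_size()-style helper: split a list
# into consecutive chunks of at most chunk_size items (name and behaviour assumed).
def get_list_by_chunk_size(data, chunk_size):
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]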
Example #6
def load_file(filename, index_name):
    # Create index with the mapping if mapping file exists
    mapping_file_path = './test_mapping/' + filename
    if os.path.isfile(mapping_file_path):
        with open(mapping_file_path, 'r') as f:
            test_data_client.indices.create(index=index_name, body=f.read())

    # generate iterable data
    data_file_path = './test_data/' + filename

    def load_json():
        with open(data_file_path, 'r') as f:
            for line in f:
                yield json.loads(line)

    # Need to enable refresh, because the load won't be visible to search immediately
    # https://stackoverflow.com/questions/57840161/elasticsearch-python-bulk-helper-api-with-refresh
    helpers.bulk(test_data_client,
                 load_json(),
                 stats_only=True,
                 index=index_name,
                 refresh='wait_for')
Example #7
    def bulk_index(self,
                   indexPrefix,
                   data=None,
                   mapping=None,
                   period="day",
                   withTimeStamp=True):
        """
        :param str indexPrefix: index name.
        :param list data: list of dictionaries to index
        :param dict mapping: the mapping used by Elasticsearch
        :param str period: accepts 'day' or 'month'; determines which kind of index will be created
        :param bool withTimeStamp: add a timestamp to the data if it is not there already

        :returns: S_OK/S_ERROR
        """
        sLog.verbose("Bulk indexing",
                     "%d records will be inserted" % len(data))
        if mapping is None:
            mapping = {}

        if period is not None:
            indexName = self.generateFullIndexName(indexPrefix, period)
        else:
            indexName = indexPrefix
        sLog.debug("Bulk indexing into %s of %s" % (indexName, data))

        res = self.existingIndex(indexName)
        if not res["OK"]:
            return res
        if not res["Value"]:
            retVal = self.createIndex(indexPrefix, mapping, period)
            if not retVal["OK"]:
                return retVal

        try:
            res = bulk(client=self.client,
                       index=indexName,
                       actions=generateDocs(data, withTimeStamp))
        except (BulkIndexError, RequestError) as e:
            sLog.exception()
            return S_ERROR(e)

        if res[0] == len(data):
            # we have inserted all documents...
            return S_OK(len(data))
        else:
            return S_ERROR(res)
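The generateDocs action generator called above is defined elsewhere; a hypothetical sketch of what such a generator might do (the timestamp field name and format are assumptions, not DIRAC's actual implementation):

# Hypothetical sketch of a generateDocs()-style action generator; the timestamp
# field name and format are assumptions, not DIRAC's actual implementation.
import datetime


def generateDocs(data, withTimeStamp=True):
    for record in data:
        doc = dict(record)
        if withTimeStamp and "timestamp" not in doc:
            doc["timestamp"] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        yield doc  # bulk() targets the index passed via its index= argument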
Example #8
def preload(event, context=None):
    """
  End-point: Preload Ontology index
  """
    statusCode = 200
    params = json.loads(event['body'])
    ontologyId = params.get('ontologyId', None)
    sources = params.get('sources', None)

    root_node = params.get('root_node', None)
    relation_type = params.get('relation_type', "rdfs:subClassOf")
    sim_method = params.get('similarity_method', "wup")

    graph = load_graphs(sources)
    #2. extract unique list of concepts in the graph
    concepts = all_nodes(
        graph, relation_type=relation_type,
        root=root_node)  # can specify a root node or a relation type to use
    #3. compute pairwise similarity values of concepts (there can be different similarity metrics to choose from)
    similarity_grid = []
    for c1 in concepts:
        res = {}
        for c2 in concepts:
            if str(c1) == str(c2):  # no need to compute for same values
                res.update({str(c2): 1.0})
            elif sim_method == 'san':
                res.update({str(c2): san(graph, c1, c2)})
            else:
                res.update({str(c2): wup(graph, c1, c2)})
        similarity_grid.append({"key": str(c1), "map": res})

    es = getESConn()

    if es.indices.exists(index=ontologyId):
        es.indices.delete(index=ontologyId)

    ont_mapping = getOntologyMapping()
    es.indices.create(index=ontologyId, body=ont_mapping)

    resp = helpers.bulk(es, similarity_grid, index=ontologyId, doc_type="_doc")

    response = {
        "statusCode": statusCode,
        "headers": headers,
        "body": json.dumps({"message": "Succesfully Added"})
    }
    return response
Example #9
    def bulk(self, index: str):
        """バルクインサート
        """

        try:
            # Any iterable object works, so either of the following is possible:
            # - pass a generator
            success, failed = helpers.bulk(self.es, gendata3(index))
            # - pass a list
            # success, failed = helpers.bulk(self.es, bulklist())
        # except opensearchpy.ElasticsearchException as e:
        #     pprint.pprint(e)
        except Exception as e:
            pprint.pprint(e)
            return

        print('--[bulk  ]-------------------------------------------')
        pprint.pprint(success)
        pprint.pprint(failed)
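gendata3 and bulklist are defined elsewhere in that module; a hypothetical sketch of a gendata3-style generator (field names and document contents are made up) could look like:

# Hypothetical sketch of a gendata3()-style generator; field names and values
# are made up. Each yielded dict is one bulk action targeting the given index.
def gendata3(index: str):
    words = ["alpha", "beta", "gamma"]
    for i, word in enumerate(words):
        yield {
            "_index": index,
            "_id": i,
            "_source": {"word": word},
        }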
Example #10
def index_documents(
    client: OpenSearch,
    documents: Iterable[Mapping[str, Any]],
    index: str,
    doc_type: Optional[str] = None,
    keys_to_write: Optional[List[str]] = None,
    id_keys: Optional[List[str]] = None,
    ignore_status: Optional[Union[List[Any], Tuple[Any]]] = None,
    bulk_size: int = 1000,
    chunk_size: Optional[int] = 500,
    max_chunk_bytes: Optional[int] = 100 * 1024 * 1024,
    max_retries: Optional[int] = 5,
    initial_backoff: Optional[int] = 2,
    max_backoff: Optional[int] = 600,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Index all documents to OpenSearch index.

    Note
    ----
    Some of the args are referenced from opensearch-py client library (bulk helpers)
    https://opensearch-py.readthedocs.io/en/latest/helpers.html#opensearchpy.helpers.bulk
    https://opensearch-py.readthedocs.io/en/latest/helpers.html#opensearchpy.helpers.streaming_bulk

    If you receive `Error 429 (Too Many Requests) /_bulk`, please try to decrease the `bulk_size` value.
    Please also consider modifying the cluster size and instance type -
    Read more here: https://aws.amazon.com/premiumsupport/knowledge-center/resolve-429-error-es/

    Parameters
    ----------
    client : OpenSearch
        instance of opensearchpy.OpenSearch to use.
    documents : Iterable[Mapping[str, Any]]
        List which contains the documents that will be inserted.
    index : str
        Name of the index.
    doc_type : str, optional
        Name of the document type (for Elasticsearch versions 5.x and earlier).
    keys_to_write : List[str], optional
        list of keys to index. If not provided, all keys will be indexed.
    id_keys : List[str], optional
        list of keys that compose the document's unique id. If not provided, the `_id` key is used
        when present; otherwise a unique identifier is generated for each document.
    ignore_status:  Union[List[Any], Tuple[Any]], optional
        list of HTTP status codes that you want to ignore (not raising an exception)
    bulk_size: int,
        number of docs in each _bulk request (default: 1000)
    chunk_size : int, optional
        number of docs in one chunk sent to es (default: 500)
    max_chunk_bytes: int, optional
        the maximum size of the request in bytes (default: 100MB)
    max_retries : int, optional
        maximum number of times a document will be retried when
        ``429`` is received; set to 0 for no retries on ``429`` (default: 5)
    initial_backoff : int, optional
        number of seconds to wait before the first retry.
        Any subsequent retry waits ``initial_backoff * 2**retry_number`` seconds (default: 2)
    max_backoff: int, optional
        maximum number of seconds a retry will wait (default: 600)
    **kwargs :
        KEYWORD arguments forwarded to bulk operation
        elasticsearch >= 7.10.2 / opensearch: \
https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters
        elasticsearch < 7.10.2: \
https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters

    Returns
    -------
    Dict[str, Any]
        Response payload
        https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response.

    Examples
    --------
    Writing documents

    >>> import awswrangler as wr
    >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')
    >>> wr.opensearch.index_documents(
    ...     documents=[{'_id': '1', 'value': 'foo'}, {'_id': '2', 'value': 'bar'}],
    ...     index='sample-index1'
    ... )
    """
    if not isinstance(documents, list):
        documents = list(documents)
    total_documents = len(documents)
    _logger.debug("indexing %s documents into %s", total_documents, index)

    actions = _actions_generator(documents,
                                 index,
                                 doc_type,
                                 keys_to_write=keys_to_write,
                                 id_keys=id_keys,
                                 bulk_size=bulk_size)

    success = 0
    errors: List[Any] = []
    refresh_interval = None
    try:
        widgets = [
            progressbar.Percentage(),
            progressbar.SimpleProgress(
                format=" (%(value_s)s/%(max_value_s)s)"),
            progressbar.Bar(),
            progressbar.Timer(),
        ]
        progress_bar = progressbar.ProgressBar(widgets=widgets,
                                               max_value=total_documents,
                                               prefix="Indexing: ").start()
        for i, bulk_chunk_documents in enumerate(actions):
            if i == 1:  # second bulk iteration, in case the index didn't exist before
                refresh_interval = _get_refresh_interval(client, index)
                _disable_refresh_interval(client, index)
            _logger.debug("running bulk index of %s documents",
                          len(bulk_chunk_documents))
            _success, _errors = bulk(
                client=client,
                actions=bulk_chunk_documents,
                ignore_status=ignore_status,
                chunk_size=chunk_size,
                max_chunk_bytes=max_chunk_bytes,
                max_retries=max_retries,
                initial_backoff=initial_backoff,
                max_backoff=max_backoff,
                request_timeout=30,
                **kwargs,
            )
            success += _success
            errors += _errors  # type: ignore
            _logger.debug("indexed %s documents (%s/%s)", _success, success,
                          total_documents)
            progress_bar.update(success, force=True)
    except TransportError as e:
        if str(e.status_code) == "429":  # Too Many Requests
            _logger.error(
                "Error 429 (Too Many Requests):"
                "Try to tune bulk_size parameter."
                "Read more here: https://aws.amazon.com/premiumsupport/knowledge-center/resolve-429-error-es"
            )
            raise e

    finally:
        _set_refresh_interval(client, index, refresh_interval)

    return {"success": success, "errors": errors}
Example #11
def query_cache(event, context=None):
    """
  End-point: Cache Ontology based on Query
  """
    statusCode = 200
    params = json.loads(event['body'])
    ontologyId = params.get('ontologyId', None)
    sources = params.get('sources', None)
    key = str(params.get('key', None))
    root_node = params.get('root_node', None)
    relation_type = params.get('relation_type', "rdfs:subClassOf")
    sim_method = params.get('similarity_method', "wup")

    es = getESConn()

    if not es.indices.exists(index=ontologyId):
        ont_mapping = getOntologyMapping()
        es.indices.create(index=ontologyId, body=ont_mapping)

    # retrieve the cached entry if it already exists in the index
    result = getByUniqueField(es, ontologyId, "key", key)

    # If it is already in the index, return the cached value
    if result:
        response = {
            "statusCode": 200,
            "headers": headers,
            "body": json.dumps(result)
        }
        return response

    graph = load_graphs(sources)
    #2. extract unique list of concepts in the graph
    concepts = all_nodes(
        graph, relation_type=relation_type,
        root=root_node)  # can specify a root node or a relation type to use

    #3. compute pairwise similarity values of concepts (there can be different similarity metrics to choose from)
    similarity_grid = []
    for c1 in concepts:
        if str(c1) != key:
            continue
        res = {}
        for c2 in concepts:
            if str(c1) == str(c2):  # no need to compute for same values
                res.update({str(c2): 1.0})
            elif sim_method == 'san':
                res.update({str(c2): san(graph, c1, c2)})
            else:
                res.update({str(c2): wup(graph, c1, c2)})
        similarity_grid.append({"key": str(c1), "map": res})

    resp = helpers.bulk(es, similarity_grid, index=ontologyId, doc_type="_doc")

    if len(similarity_grid) > 0:
        response = {
            "statusCode": statusCode,
            "headers": headers,
            "body": json.dumps(similarity_grid[0])
        }
    else:
        response = {
            "statusCode": statusCode,
            "headers": headers,
            "body": json.dumps({})
        }
    return response