def flush(self):
    """
    Flushes the buffer into ES
    :return: None
    """
    if self._timer is not None and self._timer.is_alive():
        self._timer.cancel()
    self._timer = None

    if self._buffer:
        try:
            with self._buffer_lock:
                logs_buffer = self._buffer
                self._buffer = []
            actions = (
                {
                    '_index': self._index_name_func.__func__(self.es_index_name),
                    '_source': log_record
                }
                for log_record in logs_buffer
            )
            eshelpers.bulk(
                client=self.__get_es_client(),
                actions=actions,
                stats_only=True
            )
        except Exception as exception:
            if self.raise_on_indexing_exceptions:
                raise exception

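# Hedged sketch (not taken from the handler above): a minimal emit() counterpart
# showing how records typically reach the buffer that flush() drains. The
# buffer_size and flush_frequency_in_sec attributes are assumptions; only
# _buffer, _buffer_lock and _timer come from flush() itself.
from threading import Timer

def emit(self, record):
    # Turn the LogRecord into a plain dict document (field choice is illustrative)
    # and append it under the same lock flush() uses.
    doc = {"message": self.format(record), "level": record.levelname}
    with self._buffer_lock:
        self._buffer.append(doc)
        should_flush = len(self._buffer) >= self.buffer_size  # assumed attribute
    if should_flush:
        self.flush()
    elif self._timer is None or not self._timer.is_alive():
        # Schedule a delayed flush so sparse logging still gets shipped.
        self._timer = Timer(self.flush_frequency_in_sec, self.flush)  # assumed attribute
        self._timer.daemon = True
        self._timer.start()
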
def load_file(test_executor, filename="accounts.json"):
    opensearch = test_executor.client
    filepath = "./test_data/" + filename

    # generate iterable data
    def load_json():
        with open(filepath, "r") as f:
            for line in f:
                yield json.loads(line)

    helpers.bulk(opensearch, load_json(), index=TEST_INDEX_NAME)

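# Hedged illustration of the newline-delimited JSON layout that load_json()
# expects in ./test_data/accounts.json: one document per line. The field names
# below are illustrative, not taken from the actual test fixture.
import json

sample_jsonl = "\n".join([
    '{"account_number": 1, "firstname": "Amber", "balance": 39225}',
    '{"account_number": 6, "firstname": "Hattie", "balance": 5686}',
])
docs = [json.loads(line) for line in sample_jsonl.splitlines()]
assert docs[0]["account_number"] == 1
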
def add_data_bulk(
    self,
    data: List[IndexableData],
    index_base_name: Optional[str] = None,
) -> None:
    index_name = self._get_wip_alias(index_base_name or self.index_base_names[0])
    body = [
        {"_index": index_name, "_source": asdict(d) if is_dataclass(d) else d}
        for d in data
    ]
    try:
        bulk(self.es, body)
    except ConnectionError as e:
        logger.error(e)

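# Hedged illustration of the dataclass handling inside add_data_bulk():
# dataclass instances are serialised with asdict(), plain dicts pass through
# unchanged. The Venue dataclass and the "venues_wip" index name are made up.
from dataclasses import asdict, dataclass, is_dataclass

@dataclass
class Venue:
    id: str
    name: str

data = [Venue(id="1", name="Town Hall"), {"id": "2", "name": "Library"}]
body = [
    {"_index": "venues_wip", "_source": asdict(d) if is_dataclass(d) else d}
    for d in data
]
assert body[0]["_source"] == {"id": "1", "name": "Town Hall"}
assert body[1]["_source"] == {"id": "2", "name": "Library"}
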
def save_case_list(event, context=None):
    """
    End-point: Saves list of case instances
    Creates index for the casebase if one does not exist
    """
    doc_list = json.loads(event['body'])  # parameters in request body
    es = getESConn()
    pid = event['pathParameters']['id']
    proj = utility.getByUniqueField(es, projects_db, "_id", pid)  # project

    # create index with mapping if it does not exist already
    project.indexMapping(es, proj)

    # Add a hash field to each case for duplicate-checking before indexing
    for i, x in enumerate(doc_list):
        x = retrieve.add_vector_fields(proj['attributes'], x)  # add vectors to Semantic USE fields
        x = retrieve.add_lowercase_fields(proj['attributes'], x)  # use lowercase values for EqualIgnoreCase fields
        # generate a hash after ordering the dict by key, for easy detection of duplicates
        x['hash__'] = str(
            hashlib.md5(json.dumps(OrderedDict(sorted(x.items()))).encode('utf-8')).digest())
        doc_list[i] = x  # write the processed case back so the added fields are indexed

    # Index the list of docs using helpers.bulk()
    resp = helpers.bulk(es, doc_list, index=proj['casebase'], doc_type="_doc")

    # Indicate that the project has a casebase
    proj['hasCasebase'] = True
    source_to_update = {'doc': proj}
    res = es.update(index=projects_db, id=pid, body=source_to_update)

    # create the ontology similarity if specified as part of project attributes
    # (can be a lengthy operation for mid to large ontologies!)
    for attrib in proj['attributes']:
        if attrib['type'] == "Ontology Concept" and attrib.get('similarityType') is not None \
                and attrib.get('options') is not None \
                and retrieve.checkOntoSimilarity(attrib['options'].get('id'))['statusCode'] != 200:
            sim_method = 'san' if attrib['similarityType'] == 'Feature-based' else 'wup'
            retrieve.setOntoSimilarity(
                attrib['options'].get('id'),
                attrib['options'].get('sources'),
                relation_type=attrib['options'].get('relation_type'),
                root_node=attrib['options'].get('root'),
                similarity_method=sim_method,
            )

    response = {
        "statusCode": 201,
        "headers": headers,
        "body": json.dumps(resp)
    }
    return response

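# Hedged illustration of the hash__ computation above: sorting the items before
# serialising means two cases with the same content but different key order get
# the same hash, which is what the duplicate check relies on. The sample dicts
# are made up.
import hashlib
import json
from collections import OrderedDict

case_a = {"name": "Ada", "age": 36}
case_b = {"age": 36, "name": "Ada"}

def case_hash(case):
    return str(hashlib.md5(json.dumps(OrderedDict(sorted(case.items()))).encode('utf-8')).digest())

assert case_hash(case_a) == case_hash(case_b)
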
def bulk_insert_data_to_es(elasticsearch_connection, data, index, bulk_size=100):
    try:
        batch_data = get_list_by_chunk_size(data, bulk_size)
        for batch in batch_data:
            actions = [{"_index": index, "_source": doc} for doc in batch]
            helpers.bulk(elasticsearch_connection, actions)
        return True
    except Exception as e:
        print("Bulk insertion job failed")
        print(e)
        return False

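# Hedged sketch of the get_list_by_chunk_size() helper assumed above (the real
# implementation is not shown): all the caller relies on is that it splits the
# input list into slices of at most chunk_size items.
def get_list_by_chunk_size(data, chunk_size):
    # Slice the list into consecutive chunks of chunk_size items.
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

assert get_list_by_chunk_size(list(range(5)), 2) == [[0, 1], [2, 3], [4]]
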
def load_file(filename, index_name):
    # Create index with the mapping if mapping file exists
    mapping_file_path = './test_mapping/' + filename
    if os.path.isfile(mapping_file_path):
        with open(mapping_file_path, 'r') as f:
            test_data_client.indices.create(index=index_name, body=f.read())

    # generate iterable data
    data_file_path = './test_data/' + filename

    def load_json():
        with open(data_file_path, 'r') as f:
            for line in f:
                yield json.loads(line)

    # Need to enable refresh, because the load won't be visible to search immediately
    # https://stackoverflow.com/questions/57840161/elasticsearch-python-bulk-helper-api-with-refresh
    helpers.bulk(test_data_client, load_json(), stats_only=True, index=index_name, refresh='wait_for')

def bulk_index(self, indexPrefix, data=None, mapping=None, period="day", withTimeStamp=True):
    """
    :param str indexPrefix: index name.
    :param list data: a list of dictionaries to insert.
    :param dict mapping: the mapping used by elasticsearch.
    :param str period: Accepts 'day' and 'month'. We can specify which kind of indexes will be created.
    :param bool withTimeStamp: add a timestamp to the data, if not there already.

    :returns: S_OK/S_ERROR
    """
    sLog.verbose("Bulk indexing", "%d records will be inserted" % len(data))
    if mapping is None:
        mapping = {}

    if period is not None:
        indexName = self.generateFullIndexName(indexPrefix, period)
    else:
        indexName = indexPrefix
    sLog.debug("Bulk indexing into %s of %s" % (indexName, data))

    res = self.existingIndex(indexName)
    if not res["OK"]:
        return res
    if not res["Value"]:
        retVal = self.createIndex(indexPrefix, mapping, period)
        if not retVal["OK"]:
            return retVal

    try:
        res = bulk(client=self.client, index=indexName, actions=generateDocs(data, withTimeStamp))
    except (BulkIndexError, RequestError) as e:
        sLog.exception()
        return S_ERROR(e)

    if res[0] == len(data):
        # we have inserted all documents...
        return S_OK(len(data))
    return S_ERROR(res)

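# Hedged sketch of the generateDocs() helper used above; the actual DIRAC
# implementation may differ. helpers.bulk() accepts plain dicts as "index"
# actions and applies the index name passed by the caller, so yielding one
# document per record is enough. The millisecond-epoch timestamp format is an
# assumption.
import datetime

def generateDocs(data, withTimeStamp=True):
    for record in data:
        doc = dict(record)
        if withTimeStamp and "timestamp" not in doc:
            doc["timestamp"] = int(datetime.datetime.utcnow().timestamp() * 1000)
        yield doc
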
def preload(event, context=None):
    """
    End-point: Preload Ontology index
    """
    statusCode = 200
    params = json.loads(event['body'])

    ontologyId = params.get('ontologyId', None)
    sources = params.get('sources', None)
    root_node = params.get('root_node', None)
    relation_type = params.get('relation_type', "rdfs:subClassOf")
    sim_method = params.get('similarity_method', "wup")

    # 1. load the ontology graph(s) from the supplied sources
    graph = load_graphs(sources)

    # 2. extract unique list of concepts in the graph
    concepts = all_nodes(graph, relation_type=relation_type, root=root_node)  # can specify a root node or a relation type to use

    # 3. compute pairwise similarity values of concepts (there can be different similarity metrics to choose from)
    similarity_grid = []
    for c1 in concepts:
        res = {}
        for c2 in concepts:
            if str(c1) == str(c2):  # no need to compute for same values
                res.update({str(c2): 1.0})
            elif sim_method == 'san':
                res.update({str(c2): san(graph, c1, c2)})
            else:
                res.update({str(c2): wup(graph, c1, c2)})
        similarity_grid.append({"key": str(c1), "map": res})

    es = getESConn()
    if es.indices.exists(index=ontologyId):
        es.indices.delete(index=ontologyId)
    ont_mapping = getOntologyMapping()
    es.indices.create(index=ontologyId, body=ont_mapping)
    resp = helpers.bulk(es, similarity_grid, index=ontologyId, doc_type="_doc")

    response = {
        "statusCode": statusCode,
        "headers": headers,
        "body": json.dumps({"message": "Successfully added"})
    }
    return response

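# Hedged example of a request body accepted by the preload end-point above.
# The field names mirror the params.get() calls; the values are illustrative.
example_preload_body = {
    "ontologyId": "onto_pizza",
    "sources": ["https://example.org/ontologies/pizza.owl"],
    "root_node": "http://example.org/ontologies/pizza#Food",
    "relation_type": "rdfs:subClassOf",
    "similarity_method": "wup",
}
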
def bulk(self, index: str):
    """Bulk insert."""
    try:
        # Any iterable of actions works, so either of the following is possible:
        # - pass a generator
        success, failed = helpers.bulk(self.es, gendata3(index))
        # - pass a list
        # success, failed = helpers.bulk(self.es, bulklist())
    # except opensearchpy.ElasticsearchException as e:
    #     pprint.pprint(e)
    except Exception as e:
        pprint.pprint(e)
        return

    print('--[bulk ]-------------------------------------------')
    pprint.pprint(success)
    pprint.pprint(failed)

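# Hedged sketch of the gendata3() generator referenced above (the real one is
# not shown here): any iterable yielding one bulk action per document works.
# The sample documents are made up.
def gendata3(index):
    docs = [
        {"title": "doc-1", "value": 1},
        {"title": "doc-2", "value": 2},
    ]
    for doc in docs:
        yield {"_index": index, "_source": doc}
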
def index_documents(
    client: OpenSearch,
    documents: Iterable[Mapping[str, Any]],
    index: str,
    doc_type: Optional[str] = None,
    keys_to_write: Optional[List[str]] = None,
    id_keys: Optional[List[str]] = None,
    ignore_status: Optional[Union[List[Any], Tuple[Any]]] = None,
    bulk_size: int = 1000,
    chunk_size: Optional[int] = 500,
    max_chunk_bytes: Optional[int] = 100 * 1024 * 1024,
    max_retries: Optional[int] = 5,
    initial_backoff: Optional[int] = 2,
    max_backoff: Optional[int] = 600,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Index all documents into an OpenSearch index.

    Note
    ----
    Some of the args are referenced from the opensearch-py client library (bulk helpers)
    https://opensearch-py.readthedocs.io/en/latest/helpers.html#opensearchpy.helpers.bulk
    https://opensearch-py.readthedocs.io/en/latest/helpers.html#opensearchpy.helpers.streaming_bulk

    If you receive `Error 429 (Too Many Requests) /_bulk`, please try to decrease the `bulk_size` value.
    Please also consider modifying the cluster size and instance type -
    Read more here: https://aws.amazon.com/premiumsupport/knowledge-center/resolve-429-error-es/

    Parameters
    ----------
    client : OpenSearch
        instance of opensearchpy.OpenSearch to use.
    documents : Iterable[Mapping[str, Any]]
        List which contains the documents that will be inserted.
    index : str
        Name of the index.
    doc_type : str, optional
        Name of the document type (for Elasticsearch versions 5.x and earlier).
    keys_to_write : List[str], optional
        list of keys to index. If not provided all keys will be indexed.
    id_keys : List[str], optional
        list of keys that compound the document unique id. If not provided will use the `_id` key
        if it exists, otherwise will generate a unique identifier for each document.
    ignore_status : Union[List[Any], Tuple[Any]], optional
        list of HTTP status codes that you want to ignore (not raising an exception).
    bulk_size : int
        number of docs in each _bulk request (default: 1000).
    chunk_size : int, optional
        number of docs in one chunk sent to es (default: 500).
    max_chunk_bytes : int, optional
        the maximum size of the request in bytes (default: 100MB).
    max_retries : int, optional
        maximum number of times a document will be retried when ``429`` is received,
        set to 0 for no retries on ``429`` (default: 5).
    initial_backoff : int, optional
        number of seconds we should wait before the first retry.
        Any subsequent retries will be powers of ``initial_backoff * 2**retry_number`` (default: 2).
    max_backoff : int, optional
        maximum number of seconds a retry will wait (default: 600).
    **kwargs :
        KEYWORD arguments forwarded to the bulk operation.
        elasticsearch >= 7.10.2 / opensearch:
        https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters
        elasticsearch < 7.10.2:
        https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters

    Returns
    -------
    Dict[str, Any]
        Response payload
        https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response.

    Examples
    --------
    Writing documents

    >>> import awswrangler as wr
    >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')
    >>> wr.opensearch.index_documents(
    ...     documents=[{'_id': '1', 'value': 'foo'}, {'_id': '2', 'value': 'bar'}],
    ...     index='sample-index1'
    ... )
    """
    if not isinstance(documents, list):
        documents = list(documents)
    total_documents = len(documents)
    _logger.debug("indexing %s documents into %s", total_documents, index)

    actions = _actions_generator(documents, index, doc_type, keys_to_write=keys_to_write, id_keys=id_keys, bulk_size=bulk_size)

    success = 0
    errors: List[Any] = []
    refresh_interval = None
    try:
        widgets = [
            progressbar.Percentage(),
            progressbar.SimpleProgress(format=" (%(value_s)s/%(max_value_s)s)"),
            progressbar.Bar(),
            progressbar.Timer(),
        ]
        progress_bar = progressbar.ProgressBar(widgets=widgets, max_value=total_documents, prefix="Indexing: ").start()
        for i, bulk_chunk_documents in enumerate(actions):
            if i == 1:  # second bulk iteration, in case the index didn't exist before
                refresh_interval = _get_refresh_interval(client, index)
                _disable_refresh_interval(client, index)
            _logger.debug("running bulk index of %s documents", len(bulk_chunk_documents))
            _success, _errors = bulk(
                client=client,
                actions=bulk_chunk_documents,
                ignore_status=ignore_status,
                chunk_size=chunk_size,
                max_chunk_bytes=max_chunk_bytes,
                max_retries=max_retries,
                initial_backoff=initial_backoff,
                max_backoff=max_backoff,
                request_timeout=30,
                **kwargs,
            )
            success += _success
            errors += _errors  # type: ignore
            _logger.debug("indexed %s documents (%s/%s)", _success, success, total_documents)
            progress_bar.update(success, force=True)
    except TransportError as e:
        if str(e.status_code) == "429":  # Too Many Requests
            _logger.error(
                "Error 429 (Too Many Requests): "
                "Try to tune the bulk_size parameter. "
                "Read more here: https://aws.amazon.com/premiumsupport/knowledge-center/resolve-429-error-es"
            )
        raise e
    finally:
        _set_refresh_interval(client, index, refresh_interval)

    return {"success": success, "errors": errors}

def query_cache(event, context=None):
    """
    End-point: Cache Ontology based on Query
    """
    statusCode = 200
    params = json.loads(event['body'])

    ontologyId = params.get('ontologyId', None)
    sources = params.get('sources', None)
    key = str(params.get('key', None))
    root_node = params.get('root_node', None)
    relation_type = params.get('relation_type', "rdfs:subClassOf")
    sim_method = params.get('similarity_method', "wup")

    es = getESConn()
    if not es.indices.exists(index=ontologyId):
        ont_mapping = getOntologyMapping()
        es.indices.create(index=ontologyId, body=ont_mapping)

    # retrieve if ES index does exist
    result = getByUniqueField(es, ontologyId, "key", key)
    # If existing in the index already, return the value
    if result:
        response = {
            "statusCode": 200,
            "headers": headers,
            "body": json.dumps(result)
        }
        return response

    graph = load_graphs(sources)

    # 2. extract unique list of concepts in the graph
    concepts = all_nodes(graph, relation_type=relation_type, root=root_node)  # can specify a root node or a relation type to use

    # 3. compute pairwise similarity values of concepts (there can be different similarity metrics to choose from)
    similarity_grid = []
    for c1 in concepts:
        if str(c1) != key:
            continue
        res = {}
        for c2 in concepts:
            if str(c1) == str(c2):  # no need to compute for same values
                res.update({str(c2): 1.0})
            elif sim_method == 'san':
                res.update({str(c2): san(graph, c1, c2)})
            else:
                res.update({str(c2): wup(graph, c1, c2)})
        similarity_grid.append({"key": str(c1), "map": res})

    resp = helpers.bulk(es, similarity_grid, index=ontologyId, doc_type="_doc")

    if len(similarity_grid) > 0:
        response = {
            "statusCode": statusCode,
            "headers": headers,
            "body": json.dumps(similarity_grid[0])
        }
    else:
        response = {
            "statusCode": statusCode,
            "headers": headers,
            "body": json.dumps({})
        }
    return response