def __del__(self):
    try:
        # helpers.bulk() is synchronous and returns a summary we can log below.
        # (The original also created a parallel_bulk() generator here but never
        # iterated it, so it sent nothing; that dead call has been dropped.)
        self.bulk_result = helpers.bulk(self.eS, self.actions, stats_only=False)
    except Exception:
        logging.error("ERROR in final elasticsearch BULK...")
        raise SubmitterError()
    if hasattr(self, 'bulk_result'):
        logging.info("result of final elasticsearch bulk: " + str(self.bulk_result))
    return
def index(self, queryset: QuerySet):
    self._init_index()
    for success, info in parallel_bulk(self.es_connection, generate_es_query(queryset)):
        if not success:
            raise RuntimeError(f"Error indexing query to ES: {info}")
def build_custom_dict(self):
    df = pd.read_excel(os.path.join(MEDIA_ROOT, 'dict.xlsx'))
    number_of_words = len(df)
    index = es.Index(ES_INDEX_CUSTOM_DICTIONARY_WORD, using=ES_CLIENT)
    index.delete(ignore=404)
    print("Creating index")
    CustomDictionaryWord.init()
    failed, success = 0, 0
    batch_size = 1000
    for ok, result in parallel_bulk(ES_CLIENT, self.word_generator(df),
                                    index=ES_INDEX_CUSTOM_DICTIONARY_WORD,
                                    chunk_size=batch_size, raise_on_error=False,
                                    thread_count=6):
        if ok:
            success += 1
        else:
            failed += 1
            action, result = result.popitem()
            print("!!!", action, result)
        if failed > 3:
            raise Exception("Too many failed!!")
        if (success + failed) % batch_size == 0:
            print(f'{success + failed}/{number_of_words} processed')
def es_index(self, articles):
    def annotatearticle(r):
        if r['_id'] in self.variantreferences:
            r['variants'] = list(self.variantreferences[r['_id']])
        if r['_id'] in self.genereferences:
            r['genes'] = list(self.genereferences[r['_id']])
            r['geneids'] = list(self.gids[r['_id']])
            r_ = [self.uniprot[gid] for gid in self.gids[r['_id']] if gid in self.uniprot]
            r__ = list()
            for i in r_:
                for j in i:
                    r__.append(j)
            r['uniprotids'] = r__
        _id = str(r['_id'])

    def preparearticles():
        for r in articles:
            if 'references' in r:
                del r['references']
            annotatearticle(r)
            yield r

    for ok, result in parallel_bulk(self.es, preparearticles(), thread_count=14,
                                    queue_size=1400, index=self.index, chunk_size=140):
        if not ok:
            action, result = result.popitem()
            doc_id = '/%s/commits/%s' % (self.index, result['_id'])
            # log the actual result dict (the original printed the literal string 'result')
            print('Failed to %s document %s: %r' % (action, doc_id, result))
def send_elastic(self):
    success = 0
    failed = 0
    qs = Document.objects.filter(id__gt=self.from_id)
    if self.to_id:
        qs = qs.filter(id__lte=self.to_id)
    qs = qs.order_by('id')
    # import datetime
    # qs = qs.filter(datetime__gte=datetime.date(2019, 1, 1), datetime__lte=datetime.date(2019, 3, 1)).order_by('id')
    # qs = qs.filter(source__corpus__name="gos").order_by('id')
    print("Start build")
    number_of_documents = qs.count()
    for ok, result in parallel_bulk(self.client, self.document_generator(qs),
                                    index=ES_INDEX_DOCUMENT, chunk_size=self.batch_size,
                                    raise_on_error=False, thread_count=6):
        if ok:
            success += 1
        else:
            failed += 1
            action, result = result.popitem()
            print("!!!", action, result)
        if failed > self.batch_size / 100:
            raise Exception("Too many failed!!")
        if (success + failed) % self.batch_size == 0:
            print(f'{success + failed}/{number_of_documents} processed')
def push2es_parallel(config):
    es = recreate_index(config)
    source_path = config.get('main', 'source_path')
    _index = config.get('main', 'index')
    _type = config.get('main', 'type')
    parser = config.get('xml', 'parser')
    id_field = config.get('main', 'id_field', fallback=None)

    # assumes data is a list of dictionaries
    def generate_actions(data):
        for item in data:
            action = {
                '_op_type': 'index',
                '_index': _index,
                '_type': _type,
                '_source': item
            }
            if '_id' in item:
                action['_id'] = item['_id']
                del item['_id']
            yield action

    data = parseXML(source_path, parser=parser, id_field=id_field)

    # parallel bulk
    for success, info in parallel_bulk(es, generate_actions(data), thread_count=4):
        if not success:
            print('Doc failed', info)
def insert(self, file_name, index_name):
    total = 0
    actions = []
    not_inserted = 0
    chunk_size = 5000
    self.es.indices.delete(index=index_name, ignore=[400, 404])
    self.logger.info("deleted es index " + index_name)
    self.es.indices.create(index=index_name, body=self.index_settings)
    self.logger.info("created es index " + index_name)
    for chunk in pd.read_table(file_name, sep='\t', header=None, na_filter=False,
                               chunksize=chunk_size, low_memory=False,
                               names=['fact_id', 'subject', 'predicate', 'object', 'number']):
        if chunk.shape[1] != 5:
            raise Exception(file_name + ' does not contain 5 columns')
        actions = np.vectorize(self.add_action)(chunk['subject'], chunk['predicate'],
                                                chunk['object'], index_name)
        for success, info in helpers.parallel_bulk(client=self.es, actions=actions,
                                                   thread_count=5):
            if not success:
                print("Insert failed: ", info)
        total = total + chunk_size
        if total % 1000000 == 0:
            self.logger.info("sent a total of " + str(total) + " messages to es")
    # insert the last rows
    self.logger.info("total inserted : " + str(total) + " total not inserted : " + str(not_inserted))
def get_locations(**kwargs):
    from elasticsearch.helpers import parallel_bulk
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT_LOCATION
    from .util import locations_generator
    import datetime

    failed = 0
    success = 0
    for ok, result in parallel_bulk(ES_CLIENT,
                                    (doc.to_dict() for doc in locations_generator(**kwargs)),
                                    index=ES_INDEX_DOCUMENT_LOCATION, chunk_size=10000,
                                    raise_on_error=True, thread_count=4):
        if (failed + success) % 10000 == 0:
            print(f"!!!{failed + success} processed", datetime.datetime.now())
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        if not ok:
            failed += 1
        else:
            success += 1
    return 'Done'
def bulk(
    self,
    index,
    docs,
    chunk_size=None,
    max_chunk_bytes=None,
    queue_size=None,
    thread_count=None,
    refresh=False,
):
    """Bulk index, update, delete docs to Elasticsearch."""
    chunk_size = chunk_size or ELASTICSEARCH_CHUNK_SIZE
    max_chunk_bytes = max_chunk_bytes or ELASTICSEARCH_MAX_CHUNK_BYTES
    thread_count = thread_count or ELASTICSEARCH_THREAD_COUNT
    queue_size = queue_size or ELASTICSEARCH_QUEUE_SIZE
    for _ in parallel_bulk(
        self.__es,
        docs,
        index=index,
        thread_count=thread_count,
        chunk_size=chunk_size,
        max_chunk_bytes=max_chunk_bytes,
        queue_size=queue_size,
        refresh=refresh,
    ):
        pass
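# Standalone sketch (not part of the wrapper above): parallel_bulk() returns a
# lazy generator, so it must be iterated for any indexing to happen, which is
# why the method above drains it with a bare `for _ in ...: pass` loop. The
# index name and document generator below are illustrative assumptions.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

es = Elasticsearch()  # assumes a reachable local cluster
docs = ({"_id": i, "value": i} for i in range(10000))
for _ in parallel_bulk(es, docs, index="example-index", thread_count=4,
                       chunk_size=500, queue_size=4):
    pass  # draining the generator is what drives the worker threads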
def remove_duplicates(index):
    has_duplicates = True
    while has_duplicates:
        # query for duplicates
        buckets = es_client.search(
            index=index, body=query,
            request_timeout=120)['aggregations']['duplicated_hash']['buckets']
        ids_to_delete = []
        if len(buckets) > 0:
            for bucket in buckets:
                documents = bucket["documents"]["hits"]["hits"]
                # skip the first document, delete the rest
                for doc in documents[1:]:
                    remove_doc = {
                        '_op_type': 'delete',
                        '_index': index,
                        '_type': '_doc',
                        '_id': doc["_id"]
                    }
                    ids_to_delete.append(remove_doc)
            deletes = helpers.parallel_bulk(es_client, ids_to_delete)
            # drain the generator; delete errors (e.g. already-removed docs) are ignored
            try:
                for item in deletes:
                    pass
            except Exception:
                pass
            print(len(ids_to_delete), "duplicates were removed.")
        else:
            has_duplicates = False
            print("No duplicates found.")
def bulk_index(self, index_name='restaurants', data=[]):
    print('updating index ...')
    docs = []
    for elem in data:
        docs.append({
            '_op_type': 'index',
            '_index': index_name,
            '_id': elem['id'],
            # the document body must be passed as '_source'; a plain 'body' key
            # would just be indexed as a field named "body"
            '_source': {
                'cuisine': elem['cuisine']
            }
        })
    # print(docs[:2])
    # bulk(self.es, iter(docs), request_timeout=30)
    for success, info in parallel_bulk(self.es, iter(docs), request_timeout=30):
        if not success:
            print('A document failed:', info)
        else:
            print('success')
    if data == []:
        print('no data provided')
    else:
        print('done')
def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
    es_doc = es_doc_cls()
    elasticsearch_conn = connections.get_connection()
    sync_timestamp = current_server_timestamp()
    pending_insertions = self._compute_dirty_documents(
        sql_table_cls, es_doc.doc_type)
    bulk_op = self._synchronisation_op(es_doc, pending_insertions)
    self._logging(logging.INFO, 'Performing synchronization.')
    for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
        obj_id = info['index']['_id'] \
            if 'index' in info else info['update']['_id']
        if ok:
            # Mark the task as handled so we don't retry it next time
            self._logging(logging.INFO,
                          'Document %s has been synced successfully.' % obj_id)
            sql_table_cls.update_last_sync(obj_id, sync_timestamp)
        else:
            id_logger(obj_id, logging.ERROR,
                      'Error while syncing document %s index.' % obj_id)
    # Refresh indices to increase search speed
    elasticsearch_dsl.Index(es_doc.index).refresh()
def load(self, client, index_name, chunk_size, threads, timeout):
    # Open and store mapping
    with open(self.get_mapping_file(index_name), 'r') as mapping:
        # Create index with mapping. If it already exists, ignore this
        client.indices.create(index=index_name, ignore=400, body=mapping.read())

    # Check whether documents from this file already exist in the index
    es_query = Search(using=client, index=index_name).filter('term', path=self.basename)[:0]
    result = es_query.execute()
    if result.hits.total != 0:
        raise IndexNotEmptyError(
            'There are {0} documents from this file in the index'.format(
                result.hits.total))

    # Send docs to elasticsearch
    for success, info in parallel_bulk(client, self.make_docs(),
                                       thread_count=threads,
                                       chunk_size=chunk_size,
                                       request_timeout=timeout,
                                       index=index_name,
                                       doc_type='doc',
                                       raise_on_exception=False):
        if not success:
            print('Doc failed', info)
def bulk(self, docs, index="", doc_type="", op_type='index'):
    '''
    bulk sample:
    {"_op_type": "index", "_index": "test", "_type": "type1", "_id": "1",
     "_source": {"field1": "value1", "field2": "value2"}}
    {"_op_type": "delete", "_index": "test", "_type": "type1", "_id": "2"}
    '''
    index_ = self.index if index == "" else index
    doc_type_ = self.doc_type if doc_type == "" else doc_type
    allow_op = ['index', 'delete']
    if op_type not in allow_op:
        raise exceptions.RequestError(
            400, '{"msg":"op_type is not allowed, you can use index or delete"}')
    actions = []
    for doc in docs:
        action = {}
        action["_index"] = index_
        action["_type"] = doc_type_
        action["_id"] = doc["_id"]
        if op_type == 'index':
            del doc["_id"]
            action["_source"] = doc
        action["_op_type"] = op_type
        actions.append(action)
    return helpers.parallel_bulk(self.es, actions)
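# Hedged usage sketch for the bulk() wrapper above (the EsClient wrapper name is
# an assumption). Because the method returns the parallel_bulk() generator
# un-iterated, the caller has to drain it before anything is actually sent.
from collections import deque

docs = [
    {"_id": "1", "field1": "value1", "field2": "value2"},
    {"_id": "2", "field1": "value3"},
]
# wrapper = EsClient(...)                                   # assumption
# deque(wrapper.bulk(docs, index="test", doc_type="type1"), maxlen=0)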
async def load_bulk(self, items_chunk):
    payload = self.create_bulk(items_chunk)
    try:
        # parallel_bulk() returns a lazy generator of (ok, info) tuples; it cannot
        # be unpacked like helpers.bulk()'s (successes, errors) return value and
        # has to be iterated for anything to be sent.
        for ok, info in parallel_bulk(self.client, payload, chunk_size=1000):
            if not ok:
                print("Bulk item failed:", info)
    except Exception:
        # swallow bulk errors, as the original code did
        pass
def main(self) -> None:
    try:
        instance.config = ConfigParser()
        instance.config.read_dict(defaultconfig())
        logger().info('Started eisp with pid %s', getpid())
        for i in [i for i in argv if i.startswith('--')]:
            try:
                mod('eisp.param.{}'.format(i[2:])).__dict__[i[2:]](argv)
            except:
                exit('Invalid parameter or argument to {}'.format(i[2:]))
        conf = dotdict(instance.config['data'])
        connections.create_connection(hosts=[conf.host])
        delete_index(conf.index_name)
        create_index(conf.elastic_mapping, conf.index_name)
        for ok, info in helpers.parallel_bulk(connections.get_connection(),
                                              actions=index_pdfs(conf.index_name, conf.root),
                                              request_timeout=60, chunk_size=100,
                                              thread_count=8, queue_size=8):
            if not ok:
                print(info)
    except KeyboardInterrupt:
        print('\N{bomb}')
    except Exception as exception:
        logger().exception(exception)
    except SystemExit as exception:
        logger().critical(str(exception))
def insert_multiple_docs(client, data_generator, para=False):
    # Note: the original had the branches swapped (it called plain bulk() when
    # para=True); parallel_bulk is used here for the parallel path.
    if para:
        for success, info in parallel_bulk(client, data_generator):
            if not success:
                print("A document failed:", info)
    else:
        bulk(client, data_generator)
def populate(self, index_name, corpus, thread=4, chunk=500):
    start_time = time.time()
    connection = sqlite3.connect(corpus, check_same_thread=False)
    cursor = connection.cursor()
    number_of_docs = 0
    for row in cursor.execute('SELECT * FROM documents'):
        number_of_docs += 1
    cursor.close()
    if not self.client.indices.exists(index_name):
        print("This index does not exist")
    else:
        print("Creating an index...")
        # create_index(self.client, index_name)
        print("Indexing documents...")
        progress = tqdm.tqdm(unit="docs", total=number_of_docs)
        successes = 0
        for ok, action in parallel_bulk(client=self.client, index=index_name,
                                        actions=self.generate_docs(corpus),
                                        thread_count=thread, chunk_size=chunk):
            progress.update(1)
            successes += ok
        print("Indexed %d/%d documents" % (successes, number_of_docs))
    print("--- %s seconds ---" % (time.time() - start_time))
def bulk_data(self, index, doc_type, is_parallel=True, batch_chunk_size=5000, threads_counts=8):
    '''
    Bulk-insert data.
    :param index: target index
    :param doc_type: document type of the index
    :param batch_chunk_size: chunk size, only used for non-parallel insertion
    :param is_parallel: whether to insert in parallel (default: True)
    :param threads_counts: number of threads, only used for parallel insertion
    :return:
    '''
    if is_parallel is None or is_parallel == True:
        gen_action = self._gen_parallel_data(index, doc_type)
        print("Inserting data in parallel...")
        start = time.time()
        for success, info in helpers.parallel_bulk(client=self.es, actions=gen_action,
                                                   thread_count=threads_counts,
                                                   chunk_size=1000):
            if not success:
                print("Insert failed: ", info)
        print("Data inserted successfully... ", time.time() - start)
    elif is_parallel == False:
        gen_action = self._gen_data(index, doc_type, batch_chunk_size)
        try:
            print("Inserting data...")
            t3 = time.time()
            helpers.bulk(client=self.es, actions=gen_action, chunk_size=500)
            print("Insert succeeded....", time.time() - t3)
        except Exception as e:
            print(e, "Insert failed!")
    else:
        raise ValueError("is_parallel should be True or False")
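# A minimal sketch of what a _gen_parallel_data()-style action generator could
# look like; the field names and data source are assumptions, not taken from the
# class above.
def gen_actions(index, doc_type, rows):
    for i, row in enumerate(rows):
        yield {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": i,
            "_source": row,
        }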
def run(self, distribution):
    fields = distribution.field_set.all()
    fields = {field.title: field.identifier for field in fields}
    df = self.init_df(distribution, fields)

    # Apply the processing and indexing operation to each column
    result = [
        process_column(df[col], self.index_name) for col in df.columns
    ]

    if not result:  # Distribution with no series loaded
        return

    # List flatten: if the result is multiple lists, merge them into one
    actions = reduce(lambda x, y: x + y, result) if isinstance(
        result[0], list) else result

    self.add_catalog_keyword(actions, distribution)
    for success, info in parallel_bulk(self.elastic, actions):
        if not success:
            logger.warning(strings.BULK_REQUEST_ERROR, info)

    remove_duplicated_fields(distribution)
    for field in distribution.field_set.exclude(title='indice_tiempo'):
        field.enhanced_meta.update_or_create(key=meta_keys.AVAILABLE, value='true')

    # Compute additional metadata for each series
    df.apply(update_enhanced_meta,
             args=(distribution.dataset.catalog.identifier, distribution.identifier))
def parallel_insert(client, executors, filename):
    start = time.time()
    reader = jsonlines.open(filename, 'r')
    futures = [executors.submit(json_to_doc, obj) for obj in reader]
    kwargs = {
        'total': len(futures),
        'unit': 'parsed',
        'unit_scale': True,
        'leave': True
    }
    for f in tqdm(as_completed(futures), **kwargs):
        pass
    results = [f.result() for f in futures]
    print("Json preprocessing done in {:.5f}s".format(time.time() - start))
    t = time.time()
    slicing = 5000
    for i in range(slicing, len(results) + slicing, slicing):
        docs = results[i - slicing:i]
        for r in parallel_bulk(client, Post.bulk_dicts(docs), thread_count=4, chunk_size=400):
            pass
        print('{} insertion done in {:.5f}s'.format(i, time.time() - t))
        t = time.time()
    print('Total elapsed time: {:.5f}s'.format(time.time() - start))
def _buildES(self, cat, feat, reprList):
    """ Build the category's elasticsearch model using corpus """
    if cat not in self.reprDict:
        self.reprDict[cat] = {}
    self.reprDict[cat][feat] = reprList
    lowerCat = cat.lower() + feat
    os.system(ut.rp('elastic/init_entity_search.sh ') + lowerCat + ' ' + lowerCat)
    actionList = []
    uniqReprList = list(set(reprList))
    for each in uniqReprList:
        action = {
            "_index": lowerCat,
            "_type": lowerCat,
            "_source": {
                "name": each
            }
        }
        actionList.append(action)
    for success, info in helpers.parallel_bulk(es_client, actionList,
                                               chunk_size=200, thread_count=12):
        print(success, info)
    self.save()
def _upload_to_es(payload_file, my_uuid, timestamp, es, my_node, my_pod, index_retries):
    documents = {
        "total": 0,
        "existent": 0,
        "failed": 0
    }

    def doc_stream():
        for scribed in transcribe(payload_file, 'stockpile'):
            doc = json.loads(scribed)
            es_index = "%s-metadata" % doc["module"]
            doc["uuid"] = my_uuid
            _id = hashlib.sha256(str(doc).encode()).hexdigest()
            # This information changes depending on the node and pod where
            # stockpile-wrapper is executed. Don't include it in the _id
            # calculation, to avoid indexing documents that are not
            # node-specific several times.
            doc["node_name"] = my_node
            doc["pod_name"] = my_pod
            doc["timestamp"] = timestamp
            documents["total"] += 1
            yield {"_index": es_index, "_source": doc, "_id": _id, "_op_type": "create"}

    failed_docs = []
    for r in range(index_retries):
        documents["failed"] = 0
        documents["existent"] = 0
        try:
            for ok, resp in parallel_bulk(es, doc_stream()):
                pass
        # Catch indexing exception
        except BulkIndexError as err:
            exception = err
            # An exception can refer to multiple documents
            for failed_doc in err.errors:
                # Document already exists in ES
                if failed_doc["create"]["status"] == 409:
                    documents["existent"] += 1
                    continue
                documents["failed"] += 1
                es_index = "%s-metadata" % failed_doc["create"]["data"]["module"]
                doc = {"_index": es_index,
                       "_source": failed_doc["create"]["data"],
                       "_id": failed_doc["create"]["_id"],
                       "_op_type": "create"}
                failed_docs.append(doc)
        except Exception as err:
            print("Unknown indexing error: %s" % err)
            return
        if not documents["failed"]:
            break
    if documents["total"] > documents["failed"] + documents["existent"]:
        print("%d documents successfully indexed"
              % (documents["total"] - documents["failed"] - documents["existent"]))
    if documents["failed"] > 0:
        print("%d documents couldn't be indexed" % documents["failed"])
        print("Indexing exception found %s" % exception)
    if documents["existent"] > 0:
        print("%d documents already exist in ES" % documents["existent"])
def json2es(json_data, es: Elasticsearch, index_name):
    """
    Write JSON data to ES.
    :param json_data: a single JSON document or an iterator of JSON documents
    :param es: ES client
    :param index_name: name of the index to write to
    """
    actions = []
    count = 0
    if helper.is_json(json_data):
        action = es_helper.get_action(json_data, index_name)
        actions.append(action)
    elif isinstance(json_data, Iterator):
        for line in json_data:
            if helper.is_json(line):
                action = es_helper.get_action(line, index_name)
                actions.append(action)
    else:
        print(str(json_data) + " is not json data")
    for success, info in helpers.parallel_bulk(es, actions, thread_count=1,
                                               chunk_size=4000,
                                               max_chunk_bytes=100 * 1024 * 1024 * 2):
        if not success:
            print('Doc failed', info)
        else:
            count = count + 1
    print("inserted a total of " + str(count) + " events into " + index_name)
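# A minimal sketch of what an es_helper.get_action()-style helper could look like;
# the real helper/es_helper modules belong to the original project and are not
# shown here, so this is an assumption for illustration only.
import json

def get_action(json_line, index_name):
    # wrap one JSON document into a bulk "index" action
    return {
        "_op_type": "index",
        "_index": index_name,
        "_source": json.loads(json_line) if isinstance(json_line, str) else json_line,
    }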
def storeTweetsWithTag(self, tweets, query, event=""):
    tweets_not_created = []
    to_update = (
        {
            '_op_type': 'update',
            '_type': 'tweets',
            '_index': self.index,
            '_id': tweet["id"],
            'script': {
                'lang': "painless",
                "inline": "ctx._source.tags.contains(params.query) ? (ctx.op = \"none\") : ctx._source.tags.add(params.query)",
                "params": {
                    "query": query,
                    "event": event
                }
            },
            'upsert': tweet
        }
        for tweet in self.format_tweets(tweets, query, event) if "entities" in tweet)
    errors = []
    for res, item in helpers.parallel_bulk(self.es, to_update, chunk_size=chunk_size,
                                           thread_count=thread_count, raise_on_error=False):
        if not res:
            errors.append(item)
    return errors
def run(self, corpus, index_name="fact_corpus", document_class=Fact, **kwargs):
    connections.create_connection(hosts=["localhost"])
    document_class.init()
    documents = (
        document_class(meta={"id": id}, fact=doc["fact"]).to_dict(True)
        for id, doc in corpus.items()
    )
    logger.info(f"Building corpus index for {index_name}")
    # RayExecutor().run(documents, self.save_data, {})
    for success, info in tqdm(
        parallel_bulk(
            connections.get_connection(),
            documents,
            thread_count=kwargs.pop("batch_size", multiprocessing.cpu_count()),
            chunk_size=100000,
            max_chunk_bytes=2 * 1024 ** 3,
        )
    ):
        if not success:
            logger.error(f"A document failed: {info} ")
    logger.success("Elastic index successfully built")
    return index_name
def to_es(self, df, index, doc_type=None, use_index=False, thread_count=2,
          chunk_size=1000, request_timeout=60, success_threshold=0.9):
    '''
    :param df: pandas DataFrame data
    :param index: full name of the es index
    :param doc_type: full name of the es doc type (forced to '_doc' on ES 7)
    :param use_index: use the DataFrame index as the records' _id
    :param thread_count: number of threads sending data to es
    :param chunk_size: number of docs in one chunk sent to es
    :param request_timeout:
    :param success_threshold:
    :return: number of records written into es successfully
    '''
    if self.es7:
        doc_type = '_doc'
    if not doc_type:
        doc_type = index + '_type'
    gen = helpers.parallel_bulk(self.es,
                                self.rec_to_actions(df, index, doc_type=doc_type,
                                                    use_index=use_index,
                                                    chunk_size=chunk_size),
                                thread_count=thread_count,
                                chunk_size=chunk_size,
                                raise_on_error=True,
                                request_timeout=request_timeout)
    success_num = np.sum([res[0] for res in gen])
    rec_num = len(df)
    fail_num = rec_num - success_num
    if (success_num / rec_num) < success_threshold:
        raise Exception('%d records write failed' % fail_num)
    return success_num
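# Hypothetical call of to_es() above; the EsPandas name for the class holding
# self.es / self.es7 / rec_to_actions() is an assumption for illustration.
import pandas as pd

df = pd.DataFrame({"city": ["Berlin", "Paris"], "population": [3700000, 2100000]})
# writer = EsPandas(...)                                    # assumption
# written = writer.to_es(df, "cities", use_index=True, thread_count=2)
# print("%d documents written" % written)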
def WriteES(self, Index_name, Tag_name, IndexData, PreResult):
    """
    Wrapper around the ES connection for writing data. Index_name is the index
    (table) name in ES, Tag_name is the name of the tag to write, and IndexData
    holds the IDs of the documents to update, e.g.
    ['J3213225318122300014', 'J3205075218122300001'].
    PreResult is a 2-D array of predictions used as the update payload (the
    field is created if missing), e.g. [[0.93480456], [0.9358239], [0.8241926], [0.9171963]].
    """
    from elasticsearch import helpers
    actions = []
    num = len(IndexData)
    for line in range(num):
        # res = str(PreResult[line][0])
        res = round(PreResult[line][0], 3)
        action = {
            '_op_type': 'update',
            "_index": Index_name,
            "_type": "_doc",
            "_id": IndexData[line],
            "doc": {
                Tag_name: res,
            }
        }
        actions.append(action)
    ess = helpers.parallel_bulk(self.connes, actions,
                                thread_count=self.thread, chunk_size=self.chunk_size)
    for ok, response in ess:
        if not ok:
            print(response)
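# Hypothetical call of WriteES() above; the writer instance is an assumption,
# while the IDs and score shapes follow the docstring's examples.
ids = ["J3213225318122300014", "J3205075218122300001"]
scores = [[0.93480456], [0.9358239]]
# writer.WriteES("case_index", "risk_score", ids, scores)   # writer instance assumed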
def build_index(CLIENT, VCLAIMS, INDEX_FILE, INDEX_NAME, KEYS):
    vclaims_count = VCLAIMS.shape[0]
    clear_index(CLIENT, INDEX_NAME)
    with open(INDEX_FILE) as index_file:
        source = index_file.read()
        CLIENT.indices.create(index=INDEX_NAME, body=source)
    lib.logger.info(f"Embedding vclaims.")
    actions = []
    for i, vclaim in tqdm(VCLAIMS.iterrows(), total=vclaims_count):
        if not CLIENT.exists(index=INDEX_NAME, id=i):
            body = vclaim.loc[KEYS[:-1]].replace(np.nan, "").to_dict()
            body["vector"] = lib.embedd(vclaim['vclaim'])
            actions.append({
                '_op_type': 'create',
                '_index': INDEX_NAME,
                '_id': i + 1,
                '_source': body
            })
    lib.logger.info(
        f"Adding {vclaims_count} entries to '{INDEX_NAME}' with fieldnames: {KEYS}"
    )
    for entry in tqdm(helpers.parallel_bulk(client=CLIENT, actions=actions),
                      total=vclaims_count):
        pass
def start_bulk_indexing(es_instance, dataset, dataset_name, doc_type):
    for success, info in helpers.parallel_bulk(
        es_instance,
        generate_actions(dataset, dataset_name, doc_type),
        thread_count=4,
        chunk_size=1000
    ):
        if not success:
            print("A document failed to index: {}".format(info))
            logger.info("A document failed to index: {}".format(info))
def run_and_index(directory, metadata={}):
    if not metadata:
        with open(f"{directory}/metadata.json") as reader:
            metadata = json.load(reader)

    # setup patches index
    data_host = environment["elasticsearch_url"]
    data_index = "fyp-patches"
    data_es = Elasticsearch([
        {
            "host": data_host,
            "port": 443,
            "use_ssl": True,
            "timeout": 60,
            "max_retries": 10,
            "retry_on_timeout": True
        },
    ])

    # create index if it doesn't exist
    mapping = {"mappings": {"properties": {"location": {"type": "geo_shape"}}}}
    data_es.indices.create(index=data_index, ignore=400, body=mapping)

    # deque(..., maxlen=0) drains the parallel_bulk generator without keeping results
    deque(helpers.parallel_bulk(client=data_es,
                                actions=get_data(directory, metadata, data_index),
                                chunk_size=500), maxlen=0)
def bulk(self, docs, index="", doc_type="", op_type='index'): ''' bulk sample: {"_op_type":"index", _index" : "test", "_type" : "type1", "_id" : "1" , "_source":{"field1":"value1", "field2":"value2"}} { "_op_type":"delete" , "_index" : "test", "_type" : "type1", "_id" : "2" } ''' index_ = self.index if index == "" else index doc_type_ = self.doc_type if doc_type == "" else doc_type allow_op = ['index', 'delete'] if op_type not in allow_op: raise exceptions.RequestError( 400, '{"msg":"op_type is not allowed, you can use index or delete"}' ) actions = [] for doc in docs: action = {} action["_index"] = index_ action["_type"] = doc_type_ action["_id"] = doc["_id"] if op_type == 'index': del doc["_id"] action["_source"] = doc action["_op_type"] = op_type actions.append(action) return helpers.parallel_bulk(self.es, actions)
def run(self):
    logger.debug("Starting thread '{}'".format(self.name))
    while True:
        cursor = None
        actions = []
        for entry in self.jrnl:
            action, cursor = self._journal_entry_to_action(entry)
            actions.append(action)
        if len(actions) > 0:
            for success, info in helpers.parallel_bulk(self.es, actions,
                                                       thread_count=2, index=self.index):
                if not success:
                    logger.error('Failed: {}'.format(info[0]))
                    return False
            self.es.indices.refresh()
            count = self.es.count(index=self.index)
            logger.debug("Items counted on index '{}': {}".format(self.index, count['count']))
            cursorpath = Path("/var/cache/geeft_systemd.cursor")
            with open(cursorpath, 'w') as cursorfile:
                cursorfile.write(cursor)
        time.sleep(1)
    logger.debug("Exiting thread '{}'".format(self.name))
def test_chunk_sent_from_different_threads(self, _process_bulk_chunk):
    actions = ({"x": i} for i in range(100))
    results = list(
        helpers.parallel_bulk(
            Elasticsearch(), actions, thread_count=10, chunk_size=2
        )
    )
    self.assertTrue(len(set([r[1] for r in results])) > 1)
def reindex(config, **kwargs):
    """ Recreate the Search Index. """
    client = config.registry["elasticsearch.client"]
    db = Session(bind=config.registry["sqlalchemy.engine"])

    # We use a randomly named index so that we can do a zero downtime reindex.
    # Essentially we'll use a randomly named index which we will use until all
    # of the data has been reindexed, at which point we'll point an alias at
    # our randomly named index, and then delete the old randomly named index.

    # Create the new index and associate all of our doc types with it.
    index_base = config.registry["elasticsearch.index"]
    random_token = binascii.hexlify(os.urandom(5)).decode("ascii")
    new_index_name = "{}-{}".format(index_base, random_token)
    doc_types = config.registry.get("search.doc_types", set())
    new_index = get_index(
        new_index_name,
        doc_types,
        using=client,
        shards=config.registry.get("elasticsearch.shards", 1),
        replicas=config.registry.get("elasticsearch.replicas", 1),
    )
    new_index.create()

    # From this point on, if any error occurs, we want to be able to delete our
    # in progress index.
    try:
        db.execute("SET statement_timeout = '600s'")
        for _ in parallel_bulk(client, _project_docs(db)):
            pass
    except:
        new_index.delete()
        raise
    finally:
        db.rollback()
        db.close()

    # Now that we've finished indexing all of our data, we'll point the alias
    # at our new randomly named index and delete the old index.
    if client.indices.exists_alias(name=index_base):
        to_delete = set()
        actions = []
        for name in client.indices.get_alias(name=index_base):
            to_delete.add(name)
            actions.append({"remove": {"index": name, "alias": index_base}})
        actions.append({"add": {"index": new_index_name, "alias": index_base}})
        client.indices.update_aliases({"actions": actions})
        client.indices.delete(",".join(to_delete))
    else:
        client.indices.put_alias(name=index_base, index=new_index_name)
def create_es_indices_bulk_parallel(es, data_list, thread_cnt):
    '''
    For creating elasticsearch indices in bulk (works with parallel threading,
    thus faster than es_indices_bulk)
    :param es:
    :param data_list:
    :param thread_cnt:
    :return:
    '''
    for success, info in helpers.parallel_bulk(es, data_list, thread_count=thread_cnt,
                                               chunk_size=1000000, request_timeout=30):
        if not success:
            print('A document failed:', info)
def flush_cache(self):
    if len(self.cache) == 0:
        return True
    retry = 2
    for i in range(retry):
        try:
            to_upload = helpers.parallel_bulk(
                self.es, self.cache_insertable_iterable())
            counter = 0
            num_items = len(self.cache)
            for item in to_upload:
                self.logger.debug(
                    "{} of {} Elastic objects uploaded".format(num_items, counter))
                counter = counter + 1
            output = "Pushed {} items to Elasticsearch to index {}".format(
                num_items, self.index)
            output += " and browbeat UUID {}".format(str(browbeat_uuid))
            self.logger.info(output)
            self.cache = deque()
            self.last_upload = datetime.datetime.utcnow()
            return True
        except Exception as Err:
            self.logger.error(
                "Error pushing data to Elasticsearch, going to retry"
                " in 10 seconds")
            self.logger.error("Exception: {}".format(Err))
            time.sleep(10)
            if i == (retry - 1):
                self.logger.error(
                    "Pushing Data to Elasticsearch failed in spite of retry,"
                    " dumping JSON for {} cached items".format(len(self.cache)))
                for item in self.cache:
                    filename = item['test_name'] + '-' + item['identifier']
                    filename += '-elastic' + '.' + 'json'
                    elastic_file = os.path.join(item['result_dir'], filename)
                    with open(elastic_file, 'w') as result_file:
                        json.dump(item['result'], result_file,
                                  indent=4, sort_keys=True)
                    self.logger.info(
                        "Saved Elasticsearch consumable result JSON to {}".format(
                            elastic_file))
                self.cache = deque()
                self.last_upload = datetime.datetime.utcnow()
                return False
def bulk(self, actions):
    if len(actions) > 0:
        self.tempData.extend(actions)
        # If more than ~1 MB, store on elasticsearch and clear the list
        if sys.getsizeof(self.tempData) > 1000000:
            print('UPLOADING ########### The SIZE IS: ' + str(sys.getsizeof(self.tempData)))
            # try:
            for success, info in helpers.parallel_bulk(self.es, self.tempData, thread_count=4):
                if not success:
                    print('Doc failed', info)
            # except:
            #     print('Error while uploading : ', actions)
            print('uploaded!')
            self.tempData = []
def PushMessage(self, es):
    try:
        # r = requests.post('%s/_bulk?' % args.elasticserver, data=data, timeout=args.timeout)
        # helpers.parallel_bulk(es, data, chunk_size=5)
        for success, info in helpers.parallel_bulk(es, self.send_data, chunk_size=1500):
            print(info, success)
            if not success:
                print('A document failed:', info)
        self.data = {}
        self.send_data = []
        loggerIndex.info('Bulk API request to Elasticsearch returned with code ')
    except Exception as e:
        loggerIndex.error('Failed to send to Elasticsearch: %s' % e)
def _create_class_docs(self, routing, binary_id, classes):
    bulk_actions = []
    for objc_class in classes:
        bulk_actions.append({
            "_index": self.index,
            "_type": "class",
            "_parent": binary_id,
            "_routing": routing,
            "_source": dict(objc_class)
        })
    for success, info in helpers.parallel_bulk(self.es, bulk_actions):
        if not success:
            raise Exception("A class document failed: %s" % info)
def _create_request_docs(self, routing, analysis_id, requests):
    bulk_actions = []
    for request in requests:
        bulk_actions.append({
            "_index": self.index,
            "_type": "network_request",
            "_parent": analysis_id,
            "_routing": routing,
            "_source": dict(request)
        })
    for success, info in helpers.parallel_bulk(self.es, bulk_actions):
        if not success:
            raise Exception("A network_request document failed: %s" % info)
def _create_file_docs(self, routing, analysis_id, files):
    bulk_actions = []
    for a_file in files:
        bulk_actions.append({
            "_index": self.index,
            "_type": "file_access",
            "_parent": analysis_id,
            "_routing": routing,
            "_source": dict(a_file)
        })
    for success, info in helpers.parallel_bulk(self.es, bulk_actions):
        if not success:
            raise Exception("A file_access document failed: %s" % info)
def _geocomplete_index_batch(self, elasticsearch_conn, to_index):
    log_msg = 'Indexing documents.'
    self._logging(logging.INFO, log_msg)
    for ok, info in parallel_bulk(elasticsearch_conn, to_index):
        if not ok:
            doc_id = info['create']['_id']
            doc_type = info['create']['_type']
            doc_index = info['create']['_index']
            logging_level = logging.ERROR
            err_msg = "Couldn't index document: '%s', of type: %s, " \
                      "under index: %s." % (doc_id, doc_type, doc_index)
            self._logging(logging_level, err_msg)
def do_commit(rrr):
    print('COMMITTING BATCH...', vectors_model_name, len(rrr))
    if mc_config.LOW_LEVEL:
        ii = parallel_bulk(es,
                           rrr,
                           thread_count=1,
                           chunk_size=500,
                           max_chunk_bytes=100 * 1024 * 1024,  # 100MB
                           )
    else:
        ii = nes.parallel_bulk(rrr)
    for is_success, res in ii:
        # print('COMMITTED_VECTORS', vectors_model_name, is_success, res)
        pass
    rrr[:] = []
    print('COMMITTED')
def do_commit(rrr):
    print('COMMITTING BATCH...', len(rrr))
    # print('SERVER_RESULTS_SAMPLE', rrr[5:])
    # raw_input_enter()
    from elasticsearch.helpers import parallel_bulk, scan
    ii = parallel_bulk(es,
                       rrr,
                       thread_count=1,
                       chunk_size=500,
                       max_chunk_bytes=100 * 1024 * 1024,  # 100MB
                       )
    for is_success, res in ii:
        # print('COMMITTED', is_success, res)
        pass
    rrr[:] = []
    print('COMMITTED')
def reindex_project(self, request, project_name):
    r = redis.StrictRedis.from_url(request.registry.settings["celery.scheduler_url"])
    try:
        with SearchLock(r, timeout=15, blocking_timeout=1):
            client = request.registry["elasticsearch.client"]
            doc_types = request.registry.get("search.doc_types", set())
            index_name = request.registry["elasticsearch.index"]
            get_index(
                index_name,
                doc_types,
                using=client,
                shards=request.registry.get("elasticsearch.shards", 1),
                replicas=request.registry.get("elasticsearch.replicas", 0),
            )
            for _ in parallel_bulk(
                client, _project_docs(request.db, project_name), index=index_name
            ):
                pass
    except redis.exceptions.LockError as exc:
        raise self.retry(countdown=60, exc=exc)
def update_thread(thread, es=None):
    """Put new child posts into the index"""
    if not es:
        es = Elasticsearch()
    currents_posts = fetch_hn_data(thread)['kids']
    query = {
        "_source": False,
        'query': {
            'term': {"parent": thread}
        }
    }
    older_posts_gen = helpers.scan(es, query)
    old_posts_ids = {int(item['_id']) for item in older_posts_gen}
    new_posts_ids = set(currents_posts) - old_posts_ids
    if new_posts_ids:
        print("There are {} new posts!".format(len(new_posts_ids)))
        actions = [format_data_for_action(r) for r in new_posts_ids if format_data_for_action(r)]
        # list() drains the parallel_bulk generator so the actions are actually sent
        list(helpers.parallel_bulk(es, actions))
import csv
from collections import deque

import elasticsearch
from elasticsearch import helpers


def readMovies():
    csvfile = open('ml-latest-small/movies.csv', 'r')
    reader = csv.DictReader(csvfile)
    # The original recursively called readMovies() here to build a title lookup,
    # which cannot work; for movies.csv the fields can be read directly.
    for line in reader:
        movie = {}
        movie['movie_id'] = int(line['movieId'])
        movie['title'] = line['title']
        movie['genres'] = line['genres']
        yield movie


es = elasticsearch.Elasticsearch()

es.indices.delete(index="movies", ignore=404)
# deque(..., maxlen=0) drains the parallel_bulk generator without storing results
deque(helpers.parallel_bulk(es, readMovies(), index="movies", doc_type="movie"), maxlen=0)
es.indices.refresh()
def parallel_bulk(self, thread_count=4):
    if self.client and (len(self.actions) > 0):
        # parallel_bulk() is lazy; iterate the generator so the actions are sent
        for _ in helpers.parallel_bulk(self.client, self.actions, thread_count=thread_count):
            pass
    else:
        raise _EsError('commit:_Es have no redis or actions = 0')
def reindex(self, request): """ Recreate the Search Index. """ r = redis.StrictRedis.from_url(request.registry.settings["celery.scheduler_url"]) try: with SearchLock(r, timeout=30 * 60, blocking_timeout=30): p = urllib.parse.urlparse(request.registry.settings["elasticsearch.url"]) client = elasticsearch.Elasticsearch( [urllib.parse.urlunparse(p[:2] + ("",) * 4)], verify_certs=True, ca_certs=certifi.where(), timeout=30, retry_on_timeout=True, serializer=serializer.serializer, ) number_of_replicas = request.registry.get("elasticsearch.replicas", 0) refresh_interval = request.registry.get("elasticsearch.interval", "1s") # We use a randomly named index so that we can do a zero downtime reindex. # Essentially we'll use a randomly named index which we will use until all # of the data has been reindexed, at which point we'll point an alias at # our randomly named index, and then delete the old randomly named index. # Create the new index and associate all of our doc types with it. index_base = request.registry["elasticsearch.index"] random_token = binascii.hexlify(os.urandom(5)).decode("ascii") new_index_name = "{}-{}".format(index_base, random_token) doc_types = request.registry.get("search.doc_types", set()) shards = request.registry.get("elasticsearch.shards", 1) # Create the new index with zero replicas and index refreshes disabled # while we are bulk indexing. new_index = get_index( new_index_name, doc_types, using=client, shards=shards, replicas=0, interval="-1", ) new_index.create(wait_for_active_shards=shards) # From this point on, if any error occurs, we want to be able to delete our # in progress index. try: request.db.execute("SET statement_timeout = '600s'") for _ in parallel_bulk( client, _project_docs(request.db), index=new_index_name ): pass except: # noqa new_index.delete() raise finally: request.db.rollback() request.db.close() # Now that we've finished indexing all of our data we can update the # replicas and refresh intervals. client.indices.put_settings( index=new_index_name, body={ "index": { "number_of_replicas": number_of_replicas, "refresh_interval": refresh_interval, } }, ) # Point the alias at our new randomly named index and delete the old index. if client.indices.exists_alias(name=index_base): to_delete = set() actions = [] for name in client.indices.get_alias(name=index_base): to_delete.add(name) actions.append({"remove": {"index": name, "alias": index_base}}) actions.append({"add": {"index": new_index_name, "alias": index_base}}) client.indices.update_aliases({"actions": actions}) client.indices.delete(",".join(to_delete)) else: client.indices.put_alias(name=index_base, index=new_index_name) except redis.exceptions.LockError as exc: raise self.retry(countdown=60, exc=exc)
def es_bulk_indexing_of_model(self, model, force_reindexing=False): """Perform a bulk action on documents of a given model. Use the ``objects_per_batch`` property to index. See http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.bulk and http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.parallel_bulk .. attention:: + Currently only implemented with "index" and "update" ! + Currently only working with ``AbstractESDjangoIndexable``. :param model: and model :type model: class :param force_reindexing: force all document to be returned :type force_reindexing: bool :return: the number of documents indexed :rtype: int """ if not self.connected_to_es: return if not self.index_exists: raise NeedIndex() # better safe than sorry if model.__name__ == 'FakeChapter': self.logger.warn('Cannot index FakeChapter model. Please index its parent model.') return 0 documents_formatter = partial(es_document_mapper, force_reindexing, self.index) objects_per_batch = getattr(model, 'objects_per_batch', 100) indexed_counter = 0 if model.__name__ == 'PublishedContent': generate = model.get_es_indexable(force_reindexing) while True: with transaction.atomic(): try: # fetch a batch objects = next(generate) except StopIteration: break if not objects: break if hasattr(objects[0], 'parent_model'): model_to_update = objects[0].parent_model pks = [o.parent_id for o in objects] else: model_to_update = model pks = [o.pk for o in objects] formatted_documents = list(map(documents_formatter, objects)) for _, hit in parallel_bulk( self.es, formatted_documents, chunk_size=objects_per_batch, request_timeout=30 ): action = list(hit.keys())[0] self.logger.info('{} {} with id {}'.format(action, hit[action]['_type'], hit[action]['_id'])) # mark all these objects as indexed at once model_to_update.objects.filter(pk__in=pks) \ .update(es_already_indexed=True, es_flagged=False) indexed_counter += len(objects) return indexed_counter else: then = time.time() prev_obj_per_sec = False last_pk = 0 object_source = model.get_es_indexable(force_reindexing) while True: with transaction.atomic(): # fetch a batch objects = list(object_source.filter(pk__gt=last_pk)[:objects_per_batch]) if not objects: break formatted_documents = list(map(documents_formatter, objects)) for _, hit in parallel_bulk( self.es, formatted_documents, chunk_size=objects_per_batch, request_timeout=30 ): if self.logger.getEffectiveLevel() <= logging.INFO: action = list(hit.keys())[0] self.logger.info('{} {} with id {}'.format( action, hit[action]['_type'], hit[action]['_id'])) # mark all these objects as indexed at once model.objects.filter(pk__in=[o.pk for o in objects]) \ .update(es_already_indexed=True, es_flagged=False) indexed_counter += len(objects) # basic estimation of indexed objects per second now = time.time() last_batch_duration = int(now - then) or 1 then = now obj_per_sec = round(float(objects_per_batch) / last_batch_duration, 2) if force_reindexing: print(' {} so far ({} obj/s, batch size: {})'.format( indexed_counter, obj_per_sec, objects_per_batch)) if prev_obj_per_sec is False: prev_obj_per_sec = obj_per_sec else: ratio = obj_per_sec / prev_obj_per_sec # if we processed this batch 10% slower/faster than the previous one, # shrink/increase batch size if abs(1 - ratio) > 0.1: objects_per_batch = int(objects_per_batch * ratio) if force_reindexing: print(' {}x, new batch size: {}'.format(round(ratio, 2), objects_per_batch)) prev_obj_per_sec = obj_per_sec # fetch next batch last_pk = objects[-1].pk 
return indexed_counter
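# Hypothetical call of es_bulk_indexing_of_model() above; the index-manager
# instance is an assumption, while PublishedContent is the model the method
# itself special-cases.
# manager = ESIndexManager(...)                             # assumption
# count = manager.es_bulk_indexing_of_model(PublishedContent, force_reindexing=True)
# print('{} documents indexed'.format(count))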
def forceBulk(self):
    if len(self.tempData) > 0:
        print('UPLOADING ############# The SIZE IS: ' + str(sys.getsizeof(self.tempData)))
        for success, info in helpers.parallel_bulk(self.es, self.tempData, thread_count=4):
            if not success:
                print('Doc failed', info)
        self.tempData = []
def lambda_handler(event, context): host = 'your_elasticsearchservice_endpoint' region = "your_elasticsearchservice_cluster_region" awsauth = AWS4Auth(os.environ['AWS_ACCESS_KEY_ID'], os.environ['AWS_SECRET_ACCESS_KEY'], region, 'es', session_token=os.environ['AWS_SESSION_TOKEN']) mappings={"mappings" : { "cloudtrail":{ "properties" : { "userIdentity" : { "type":"object", "properties":{ "arn" : { "type" : "string", "index" : "not_analyzed" }, "accountId": { "type" : "string", "index" : "not_analyzed" }, "invokedBy": { "type" : "string", "index" : "not_analyzed" }, "userName": { "type" : "string", "index" : "not_analyzed" } } }, "eventSource": { "type" : "string", "index" : "not_analyzed" }, "awsRegion": { "type" : "string", "index" : "not_analyzed" }, "userAgent": { "type" : "string", "index" : "not_analyzed" } }, "dynamic_templates":[ {"resourceIdentifiers":{ "match":"*Id", "match_mapping_type":"string", "mapping":{ "type":"string", "index":"not_analyzed" } } }, {"resourceIdentifiersLower":{ "match":"*id", "match_mapping_type":"string", "mapping":{ "type":"string", "index":"not_analyzed" } } }, {"resourceIdentifiersUpper":{ "match":"*ID", "match_mapping_type":"string", "mapping":{ "type":"string", "index":"not_analyzed" } } } ] } } } index = 'cloudtrail-' + datetime.strftime(datetime.utcnow(),'%Y-%m-%d') es = Elasticsearch( hosts=[{'host': host, 'port': 443}], http_auth=awsauth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection ) if not es.indices.exists(index): es.indices.create(index=index,body=mappings) s3 = boto3.client('s3') bucket_name=event['Records'][0]['s3']['bucket']['name'] key_name=event['Records'][0]['s3']['object']['key'] if not "CloudTrail-Digest" in key_name: obj = s3.get_object(Bucket=bucket_name,Key=key_name) compressed = StringIO(obj['Body'].read()) decompressed = gzip.GzipFile(fileobj=compressed,mode='rb') raw_event = json.loads(decompressed.read()) bulk_json = [] for record in raw_event['Records']: entry={"_op_type":"index","_index":index,"_type":"cloudtrail","_id":record['eventID'],"_source":record} bulk_json.append(entry) for success,info in helpers.parallel_bulk(es,bulk_json,thread_count=4): if not success: print('Failed to index document: ',info) else: print("File is a digest. Skipping ",key_name)
    sniff_on_connection_fail=True,
    sniffer_timeout=60
)


def dump_json(dir):
    for dir2 in os.listdir(dir):
        print(dir2)
        if dir2 != "_temporary":
            for file in os.listdir(os.path.join(dir, dir2)):
                file_name = os.path.join(dir, dir2, file)
                print(file_name)
                f = open(file_name, 'r')
                for line in f:
                    line = str(line.rstrip())
                    # if ("""[{"target": "a phone number"}]""" in line) or ("""[{"target": "an email"}]""" in line):
                    #     print(line)
                    # else:
                    yield {'_op_type': 'create', '_type': 'transaction', '_source': line}


# parallel_bulk() returns a generator of (ok, info) tuples; unlike helpers.bulk()
# it cannot be unpacked into (success, errors), so count successes by iterating.
success = 0
for ok, info in parallel_bulk(es, dump_json(args.directory), index='venmo2018',
                              thread_count=2, raise_on_error=False):
    if ok:
        success += 1
print('Performed %d actions' % success)

# count = 0
# for item in dump_json(args.directory):
#     print(count)
#     print(item)
#     count += 1
    reader = csv.DictReader(csvfile)
    titleLookup = {}
    for movie in reader:
        titleLookup[movie['movieId']] = movie['title']
    return titleLookup


def readTags():
    csvfile = open('ml-latest-small/tags.csv', 'r')
    titleLookup = readMovies()
    reader = csv.DictReader(csvfile)
    for line in reader:
        tag = {}
        tag['user_id'] = int(line['userId'])
        tag['movie_id'] = int(line['movieId'])
        tag['title'] = titleLookup[line['movieId']]
        tag['tag'] = line['tag']
        tag['timestamp'] = int(line['timestamp'])
        yield tag


es = elasticsearch.Elasticsearch()

es.indices.delete(index="tags", ignore=404)
# deque(..., maxlen=0) drains the parallel_bulk generator without storing results
deque(helpers.parallel_bulk(es, readTags(), index="tags", doc_type="tag"), maxlen=0)
es.indices.refresh()
                'title': {
                    'type': 'string',
                    'analyzer': 'spanish',
                },
                'narr': {
                    'type': 'string',
                    'analyzer': 'spanish',
                },
            }
        }
    }
})


def actions(docs):
    for doc in docs:
        body = {child.tag: child.text for child in doc.iterchildren()}
        yield {
            '_op_type': 'create',
            '_index': DB_INDEX,
            '_type': DOC_TYPE,
            '_id': body['docid'],
            '_source': body,
        }


for filename in sorted(os.listdir('efe95')):
    with open('efe95/{}'.format(filename), encoding='iso-8859-1') as data:
        docs = lxml.html.fragments_fromstring(data.read())
        for success, info in es_helpers.parallel_bulk(es, actions(docs)):
            pass
def test_all_chunks_sent(self, _process_bulk_chunk):
    actions = ({'x': i} for i in range(100))
    list(helpers.parallel_bulk(Elasticsearch(), actions, chunk_size=2))
    self.assertEqual(50, _process_bulk_chunk.call_count)
user["all_years"] = list(set(user["all_years"])) yield user num_users += 1 if num_users % 10000 == 0: print("Indexed %s users" % (num_users)) user.clear() user["userId"] = int(row["userId"]) user["liked"] = [] user["disliked"] = [] user["indifferent"] = [] user["all_rated"] = [] user["all_years"] = [] user["liked_years"] = [] user["all_rated"].append(title) if "year" in movies[row["movieId"]]: user["all_years"].append(movies[row["movieId"]]["year"]) if rating >= 4.0: user["liked_years"].append(movies[row["movieId"]]["year"]) user["liked"].append(title) if rating >= 4.0 else ( user["indifferent"].append(title) if rating > 2.0 else user["disliked"].append(title)) yield user index = "movie_lens_users" doc_type = "user" es.indices.delete(index=index, ignore=404) es.indices.create(index=index, body=open(mapping_file,"r").read(), ignore=404) print("Indexing users...") deque(helpers.parallel_bulk(es, read_users(ratings_file, read_movies(movies_file)), index = index, doc_type = doc_type), maxlen = 0) print("Indexing Complete") es.indices.refresh()
# print(json.dumps(mapping))
# if the put fails, see
# https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-put-mapping.html#merging-conflicts
try:
    client.indices.put_mapping(
        index=db,
        doc_type=table_name,
        body=json.dumps(mapping),
        update_all_types=True)
except Exception as e:
    print("put mapping failed, maybe error u'illegal_argument_exception', "
          "u'mapper [xxxx] cannot be changed from type [long] to [string]'")
try:
    # get bulk actions
    actions = get_actions(cursor=cursor, table_name=table_name, index=db)
    failed_count = 0
    responses = helpers.parallel_bulk(
        client=client,
        actions=actions,
        thread_count=thread_count,
        chunk_size=chunk_size)
    for success, msg in responses:
        status = msg['create']['status']
        if not success or status != 201:
            failed_count += 1
    if failed_count > 0:
        print("----------------> failed records %i" % failed_count)
    else:
        print("migrate success")
    migrate_fail_count += failed_count
except ConnectionError as e:
    print(e)
    sys.exit(1)
except Exception as e:
    print("migrate table: %s error" % table_name, type(e))
    # print(e)