def main():
    es = connections.get_connection()
    dry = '--dry' in sys.argv
    if not dry:
        utils.add_file_logger(logger, __file__)

    preprints = Preprint.objects.filter(
        primary_file__isnull=False
    ).select_related('primary_file', 'provider')
    total_preprints = preprints.count()
    logger.info('Collecting data on {} preprints...'.format(total_preprints))

    batch_to_update = []
    for i, preprint in enumerate(preprints, 1):
        preprint_id = preprint._id
        provider_id = preprint.provider._id
        file_id = preprint.primary_file._id
        page_counters = (
            PageCounter.objects
            .filter(
                _id__startswith='download:{preprint_id}:{file_id}:'.format(
                    preprint_id=preprint_id,
                    file_id=file_id
                )
            ).values_list('_id', 'date')
        )

        for page_counter in page_counters:
            page_counter__id, dates = page_counter
            version_num = page_counter__id.split(':')[-1]

            for date, totals in dates.items():
                timestamp = datetime.datetime.strptime(date, '%Y/%m/%d').replace(tzinfo=pytz.utc)
                batch_to_update.append({
                    '_index': 'osf_preprintdownload_{}'.format(
                        timestamp.strftime(settings.ELASTICSEARCH_METRICS_DATE_FORMAT)),
                    '_source': {
                        'count': totals['total'],
                        'path': '/{}'.format(file_id),
                        'preprint_id': preprint_id,
                        'provider_id': provider_id,
                        'timestamp': timestamp,
                        'user_id': None,  # PageCounter never tracked this
                        'version': int(version_num) + 1
                    },
                    '_type': 'doc'
                })

                if len(batch_to_update) >= MAX_BATCH_SIZE:
                    logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
                    if not dry:
                        bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE,
                             request_timeout=REQUEST_TIMEOUT)
                    batch_to_update = []
                    # Allow Elasticsearch to catch up
                    print('{}/{} preprints completed ({:.2f}%)'.format(
                        i, total_preprints, i / total_preprints * 100))
                    sleep(THROTTLE_PERIOD)

    # Index the final batch
    if len(batch_to_update):
        logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
        if not dry:
            bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE,
                 request_timeout=REQUEST_TIMEOUT)
        logger.info('This will migrate {} PageCounter entries to Elasticsearch'.format(len(batch_to_update)))
def insert(self):
    global config
    sys.stderr.flush()
    js_arr = []
    i = 0
    for entry in self.json:
        js = entry
        mid = js['mid']
        if self.dtype == 'mbox_source':
            del js['mid']
        js_arr.append({
            '_op_type': 'index',
            '_consistency': self.wc,
            '_index': dbname,
            '_type': self.dtype,
            '_id': mid,
            'doc': js,
            '_source': js
        })
    if not args.dry:
        try:
            helpers.bulk(self.xes, js_arr)
        except Exception as err:
            print("%s: Warning: Could not bulk insert: %s into %s" % (self.id, err, self.dtype))
def run(self):
    """
    Run task, namely:

    * purge existing index, if requested (`purge_existing_index`),
    * create the index, if missing,
    * apply mappings, if given,
    * set refresh interval to -1 (disable) for performance reasons,
    * bulk index in batches of size `chunk_size` (2000),
    * set refresh interval to 1s,
    * refresh Elasticsearch,
    * create entry in marker index.
    """
    if self.purge_existing_index:
        self.delete_index()
    self.create_index()
    es = self._init_connection()
    if self.mapping:
        es.indices.put_mapping(index=self.index, doc_type=self.doc_type, body=self.mapping)
    es.indices.put_settings({"index": {"refresh_interval": "-1"}}, index=self.index)
    bulk(es, self._docs(), chunk_size=self.chunk_size, raise_on_error=self.raise_on_error)
    es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index=self.index)
    es.indices.refresh()
    self.output().touch()
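# Hedged sketch (not from the original task): one plausible shape for the
# `self._docs()` generator consumed by `run()` above. The line-delimited JSON
# input and the `self.input()` Luigi-style target are assumptions made for
# illustration only.
import json

def _docs(self):
    """Yield one bulk action per line of the task's input."""
    with self.input().open() as handle:
        for line in handle:
            yield {
                '_index': self.index,
                '_type': self.doc_type,
                '_source': json.loads(line),
            }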
def sync_events_elasticsearch():
    # Sync updates and inserts
    index_count = redis_store.scard('event_index')
    index_event_data = ({'_type': 'event',
                         '_index': 'events',
                         '_id': event_[0],
                         'name': event_[1],
                         'description': event_[2] or None,
                         'searchable_location_name': event_[3] or None,
                         'organizer_name': event_[4] or None,
                         'organizer_description': event_[5] or None}
                        for event_ in EventIterator(index_count, 'event_index'))
    try:
        helpers.bulk(es_store, index_event_data)
    except Exception as e:
        print(e)

    # Sync both soft and hard deletes
    del_count = redis_store.scard('event_delete')
    del_event_data = ({'_type': 'event',
                       '_index': 'events',
                       '_id': event_[0],
                       'name': event_[1],
                       'description': event_[2] or None,
                       'searchable_location_name': event_[3] or None,
                       'organizer_name': event_[4] or None,
                       'organizer_description': event_[5] or None}
                      for event_ in EventIterator(del_count, 'event_delete'))
    try:
        helpers.bulk(es_store, del_event_data)
    except Exception as e:
        print(e)
def cli(host, index_name, doc_type, import_file, mapping_file, id_field_idx, delete_index, quiet):
    """
    Bulk import a delimited file into a target Elasticsearch instance.
    Common delimited files include things like CSV.

    Load a CSV file:
    data2es --index-name myindex --doc-type mydoc --import-file test.csv
    """
    echo('Using host: %s' % host, quiet)
    es = Elasticsearch(hosts=[host])
    if es.indices.exists(index_name):
        echo('Index %s already exists' % index_name, False)
        if delete_index:
            es.indices.delete(index=index_name)
            echo('Deleted: %s' % index_name, quiet)
            es.indices.create(index=index_name)
            echo('Created new index: %s' % index_name, quiet)
    else:
        es.indices.create(index=index_name)
        echo('Created new index: %s' % index_name, quiet)
    echo('Using document type: %s' % doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: %s' % mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.indices.put_mapping(doc_type, mapping, [index_name, ])
    action_g = docs_from_file(import_file, index_name, doc_type, id_field_idx, quiet)
    helpers.bulk(es, action_g())
def capture(pcap_files, node, chunk, trace):
    try:
        es = None
        if node is not None:
            es = Elasticsearch(node)
        print("Loading packet capture file(s)")
        for pcap_file in pcap_files:
            print(pcap_file)
            stats = os.stat(pcap_file)
            file_date_utc = datetime.utcfromtimestamp(stats.st_ctime)
            capture = pyshark.FileCapture(pcap_file)
            # If no Elasticsearch node specified, dump to stdout
            if node is None:
                dump_packets(capture, file_date_utc)
            else:
                helpers.bulk(
                    es,
                    index_packets(capture, pcap_file, file_date_utc),
                    chunk_size=chunk,
                    raise_on_error=False
                )
    except Exception as e:
        print("error: ", e)
        if trace:
            traceback.print_exc(file=sys.stdout)
def insert_headers(headers):
    elastic_host = "localhost"
    elastic_index = 'indextest'
    elastic_index_settings = {
        "settings": {
            "number_of_shards": 5,
            "number_of_replicas": 0,
        }
    }
    elastic_mapping = {
        "nntp": {
            "properties": {
                "Subject": {
                    "type": "string",
                    "index": "not_analyzed"
                }
            }
        }
    }
    print("inserting " + str(len(headers)) + " into ES")
    es = Elasticsearch([elastic_host], sniff_on_start=True)
    es.indices.create(index=elastic_index, body=elastic_index_settings, ignore=400)
    es.indices.put_mapping(index=elastic_index, doc_type="nntp", body=elastic_mapping)
    if len(headers) > 0:
        helpers.bulk(es, headers)
    return True
def commit(self):
    """Process the list of dicts and push them to the DB."""
    self.total_objs += len(self.nlist)
    count = 0
    full_body = ""
    items = []
    for evt_array in self.nlist:
        for entry in evt_array['events']:
            items.append({"index": {}})
            entry['whitelisted'] = "false"
            entry['comments'] = "import:" + str(datetime.datetime.now())
            # go utf-8 ?
            for x in entry.keys():
                if isinstance(entry[x], basestring):
                    entry[x] = unicode(entry[x], errors='replace')
            items.append(entry)
            count += 1
    mapfunc = partial(json.dumps, ensure_ascii=False)
    try:
        full_body = "\n".join(map(mapfunc, items)) + "\n"
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print "Unable to json.dumps : "
        pprint.pprint(items)
    # NOTE: helpers.bulk() builds its own action metadata; the bare
    # {"index": {}} entries and full_body look like leftovers from a raw
    # /_bulk payload and are passed through unchanged here.
    bulk(self.es, items, index=self.cfg["elastic"]["index"], doc_type="events",
         raise_on_error=True)
    self.total_commits += count
    logging.debug("Written " + str(self.total_commits) + " events")
    print "Written " + str(self.total_commits) + " events"
    del self.nlist[0:len(self.nlist)]
def bulk_insert(es, records=None):
    from elasticsearch import helpers
    import time

    if records is None or len(records) == 0:
        record = {
            "_index": "chunzhi",
            "_type": "study",
            "_id": "111",
            "_source": {
                "request": "request",
                "response": "response"}
        }
        records = []
        records.append(record)
    records.clear()
    record = []
    for page in search_data(es):
        for r in page:
            s = r["_source"]["started_at"]
            # NOTE: overwrites started_at with a fixed epoch (June 2016)
            s = time.strftime('%Y-%m-%dT%H:%M:%S.123Z', time.localtime(1466561471))
            r["_source"]["started_at"] = s
            r["_source"]["started_year"] = s[0:4]
            r["_source"]["started_month"] = s[5:7]
            r["_source"]["started_day"] = s[8:10]
            r["_source"]["started_hour"] = s[11:13]
            r["_source"]["started_minute"] = s[14:16]
            r["_source"]["started_second"] = s[17:19]
            record = {"_index": "kongtest-2016.06",
                      "_type": "log",
                      "_source": r["_source"]}
            records.append(record)
    helpers.bulk(es, records)
def restoreIndex(hosts, fileName):
    es = Elasticsearch(hosts)
    fd = open(fileName, 'r')
    data = (json.loads(line) for line in fd.readlines())
    helpers.bulk(es, data)
    return
def index_many_crashes(
    self, number, processed_crash=None, raw_crash=None, loop_field=None
):
    processed_crash = processed_crash or {}
    raw_crash = raw_crash or {}

    actions = []
    for i in range(number):
        crash_id = str(uuid.UUID(int=random.getrandbits(128)))

        if loop_field is not None:
            processed_copy = processed_crash.copy()
            processed_copy[loop_field] = processed_crash[loop_field] % i
        else:
            processed_copy = processed_crash

        doc = {
            'crash_id': crash_id,
            'processed_crash': processed_copy,
            'raw_crash': raw_crash,
        }
        action = {
            '_index': self.config.elasticsearch.elasticsearch_index,
            '_type': self.config.elasticsearch.elasticsearch_doctype,
            '_id': crash_id,
            '_source': doc,
        }
        actions.append(action)

    bulk(
        client=self.connection,
        actions=actions,
    )
    self.refresh_index()
def flush(self):
    """
    Flushes the buffer into ES
    :return: None
    """
    if self._timer is not None and self._timer.is_alive():
        self._timer.cancel()
    self._timer = None

    if self._buffer:
        try:
            with self._buffer_lock:
                logs_buffer = self._buffer
                self._buffer = []
            actions = (
                {
                    '_index': self._index_name_func.__func__(self.es_index_name),
                    '_type': self.es_doc_type,
                    '_source': log_record
                }
                for log_record in logs_buffer
            )
            eshelpers.bulk(
                client=self.__get_es_client(),
                actions=actions,
                stats_only=True
            )
        except Exception as exception:
            if self.raise_on_indexing_exceptions:
                raise exception
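# Hedged sketch (assumption, not part of the handler above): a typical
# `_index_name_func` appends the current date so each day gets its own index,
# e.g. "python-logs-2020.01.31". Name and format are illustrative only.
import datetime

def daily_index_name(base_name):
    """Return a per-day index name for the given base name."""
    return "{}-{}".format(base_name, datetime.datetime.utcnow().strftime("%Y.%m.%d"))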
def send_messages(self, message_batch):
    if not self._init_es():
        return
    start_time = time.monotonic()
    try:
        actions = []
        for msg in message_batch:
            message = json.loads(msg.decode("utf8"))
            timestamp = message.get("timestamp")
            if "__REALTIME_TIMESTAMP" in message:
                timestamp = datetime.datetime.utcfromtimestamp(message["__REALTIME_TIMESTAMP"])
            else:
                timestamp = datetime.datetime.utcnow()
            message["timestamp"] = timestamp
            index_name = "{}-{}".format(self.index_name, datetime.datetime.date(timestamp))
            if index_name not in self.indices:
                self.create_index_and_mappings(index_name)
            actions.append({
                "_index": index_name,
                "_type": "journal_msg",
                "_source": message,
            })
        if actions:
            helpers.bulk(self.es, actions)
            self.log.debug("Sent %d log events to ES, took: %.2fs",
                           len(message_batch), time.monotonic() - start_time)
    except Exception as ex:  # pylint: disable=broad-except
        self.log.warning("Problem sending logs to ES: %r", ex)
        return False
    return True
def add_bulk(self, obj_list):
    # Group all objects by their type
    type_set = {}
    for obj in obj_list:
        # Object must be a descendant of Indexed and be a Django model
        if not self.object_can_be_indexed(obj):
            continue

        # Get object type
        obj_type = obj.indexed_get_content_type()

        # If type is currently not in set, add it
        if obj_type not in type_set:
            type_set[obj_type] = []

        # Add object to set
        type_set[obj_type].append(obj.indexed_build_document())

    # Loop through each type and bulk add them
    for type_name, type_objects in type_set.items():
        # Get list of actions
        actions = []
        for obj in type_objects:
            action = {
                '_index': self.es_index,
                '_type': type_name,
                '_id': obj['id'],
            }
            action.update(obj)
            actions.append(action)
        bulk(self.es, actions)
def es_insert(cls, mock=False):
    """
    Insert documents.

    Args:
        mock (bool): If true, generate mock data.
    """
    if not mock:
        actions = cls.es_stream_docs()
    else:
        actions = cls.es_stream_mock_docs()

    # Clear the index.
    cls.es_reset()

    # Batch-insert the documents.
    bulk(
        client=config.es,
        actions=actions,
        raise_on_exception=False,
        raise_on_error=False,
        doc_type=cls.es_index,
        index=cls.es_index
    )

    # Commit the index.
    config.es.indices.flush(cls.es_index)
def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}, '_source': False}

    def deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception:
        log.debug("Failed to clear documents: %r", source_id)
def bulk_insert(es):
    fintput = open('/home/liangzx/parse_result')
    single_jsstr = ''
    actions = []
    # format = '%Y-%m-%d %X'
    format = '%Y.%m.%d'
    current_time = time.strftime(format, time.localtime(time.time()))
    while 1:
        line = fintput.readline()
        if not line:
            break
        if line == '\n':
            es_object = json.loads(single_jsstr)
            action = {
                "_index": "php_tracer",
                "_type": es_object['doc_type'],
                "_source": es_object['content']
            }
            actions.append(action)
            single_jsstr = ''
        else:
            single_jsstr += line
    helpers.bulk(es, actions)
    del actions[0:len(actions)]
    fintput.close()
    foutput = open('/home/liangzx/parse_result', 'w')
    foutput.write('')
    foutput.close()
def import_from_iterable(self, iterable, field_to_hash='text', batch_size=500): """Load data into Elasticsearch from iterable. iterable: generally a list of dicts, but possibly a list of strings This is your data. Your dictionary structure defines the schema of the elasticsearch index. field_to_hash: string identifier of field to hash for content ID. For list of dicts, a valid key value in the dictionary is required. For list of strings, a dictionary with one key, "text" is created and used. """ if field_to_hash: self.hash_field = field_to_hash batch = [] for item in iterable: if isinstance(item, basestring): item = {field_to_hash: item} id = hash(item[field_to_hash]) action = {'_op_type': 'update', '_index': self.index, '_type': self.doc_type, '_id': id, 'doc': item, 'doc_as_upsert': "true", } batch.append(action) if len(batch) >= batch_size: helpers.bulk(client=self.instance, actions=batch, index=self.index) batch = [] if batch: helpers.bulk(client=self.instance, actions=batch, index=self.index) self.instance.indices.refresh(self.index) else: raise ValueError("A field_to_hash is required for import_from_iterable")
def handle(self, **options):
    self._initialize(**options)

    if (options['rebuild'] and
            not options['dry_run'] and
            self.es.indices.exists(self.INDEX_NAME)):
        self.es.indices.delete(index=self.INDEX_NAME)

    if (not options['dry_run'] and
            not self.es.indices.exists(self.INDEX_NAME)):
        self.es.indices.create(index=self.INDEX_NAME)

    if self.is_local_tm:
        self._set_latest_indexed_revision(**options)

    if isinstance(self.parser, FileParser):
        helpers.bulk(self.es, self._parse_translations(**options))
        return

    # If we are parsing from DB.
    tp_qs = TranslationProject.objects.all()
    if options['disabled_projects']:
        tp_qs = tp_qs.exclude(project__disabled=True)
    for tp in tp_qs:
        self.parser.tp_pk = tp.pk
        helpers.bulk(self.es, self._parse_translations(**options))
def _index_alias_multiple_indexes_bulk(self, documents=None, actions=None, versions=None):
    """A bulk operation failed by trying to access an alias that has
    multiple indexes. To remedy this we will need to iterate on all
    indexes within the alias and retry the bulk operation individually.
    """
    indexes = self.engine.indices.get_alias(index=self.alias_name)
    for index_name in indexes:
        try:
            if documents:
                result = helpers.bulk(
                    client=self.engine,
                    index=index_name,
                    doc_type=self.document_type,
                    chunk_size=self.index_chunk_size,
                    actions=self._prepare_actions(documents, versions))
            if actions:
                result = helpers.bulk(
                    client=self.engine,
                    index=index_name,
                    doc_type=self.document_type,
                    chunk_size=self.index_chunk_size,
                    actions=actions)
            LOG.debug("Indexing result: %s", result)
        except Exception as e:
            # Log the error and continue to the next index.
            format_msg = {
                'doc': self.document_type,
                'msg': str(e)
            }
            LOG.error("Failed Indexing %(doc)s: %(msg)s" % format_msg)
def write_to_elasticsearch(conn, data, xen_index):
    conn.indices.create(index=xen_index, body=PS_REVIEWERS_MAPPING, ignore=400)

    columns = data.columns.values.tolist()
    uniq_id = 0
    bulk_doc = []
    for row in data.itertuples():
        uniq_id = uniq_id + 1
        doc = to_dict(row, columns)
        header = {
            "_index": xen_index,
            "_type": "patchserie",
            "_id": uniq_id,
            "_source": doc
        }
        bulk_doc.append(header)
        if uniq_id % 5000 == 0:
            helpers.bulk(conn, bulk_doc)
            bulk_doc = []
    helpers.bulk(conn, bulk_doc)
def clear(self, schema=None, source_id=None):
    filter_ = {'bool': {'must': []}}
    if schema is not None:
        filter_['bool']['must'].append({
            'term': {'$schema': schema}
        })
    if source_id is not None:
        filter_['bool']['must'].append({
            'term': {'$sources': source_id}
        })
    q = {'filtered': {'query': {'match_all': {}}, 'filter': filter_}}
    q = {'query': q, 'fields': []}
    log.info('Deleting existing entries matching index criteria')

    def gen_deletes():
        for res in scan(self.config.elastic_client, query=q,
                        index=self.config.elastic_index):
            yield {
                '_op_type': 'delete',
                '_index': self.config.elastic_index,
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    bulk(self.config.elastic_client, gen_deletes(), stats_only=True,
         chunk_size=self.chunk, request_timeout=60.0)
def execute(self):
    """
    Index data of specified queryset
    """
    start_time = time.time()

    for qs, progress in self.batch_qs():
        elapsed = time.time() - start_time
        total_left = (1 / (progress + 0.001)) * elapsed - elapsed

        progress_msg = 'PART: %s %.3f : duration: %.2f left: %.2f' % (
            self.part, progress, elapsed, total_left
        )
        log.info(progress_msg)

        helpers.bulk(
            self.client,
            (self.convert(obj).to_dict(include_meta=True) for obj in qs),
            raise_on_error=True,
        )

    if settings.TESTING and self.index:
        idx = es.Index(self.index)
        # refresh index, make sure it's ready for queries
        idx.refresh()
def do_bulk_index(self, bulk_reqs):
    try:
        helpers.bulk(self.es, bulk_reqs)
    except Exception as e:
        raise CuckooReportError(
            "Failed to save results in ElasticSearch for "
            "task #%d: %s" % (self.task["id"], e)
        )
def process_provider_into_es(fname, es, conn):
    status = False
    with open(fname, 'r') as infile:
        actions = []
        try:
            for doc in ijson.items(infile, "item"):
                if doc['type'] == 'INDIVIDUAL':
                    action = {
                        "_index": "data",
                        "_type": "provider",
                        "_source": doc
                    }
                else:
                    action = {
                        "_index": "data",
                        "_type": "facility",
                        "_source": doc
                    }
                actions.append(action)
                # NOTE: any trailing actions (fewer than 50) are never flushed here
                if len(actions) > 0 and len(actions) % 50 == 0:
                    helpers.bulk(es, actions)
                    status = True
                    actions = []
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, ijson.JSONError) as ex:
            print("{0}\n".format(str(ex)))
    return status
def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data, id=document.id)
    clear_children(document)

    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
def delete_source(source_id):
    q = {'query': {'term': {'source_id': source_id}}}

    def deletes():
        q['_source'] = ['id', 'document_id']
        for res in scan(es, query=q, index=es_index, doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': es_index,
                '_parent': res.get('_source', {}).get('document_id'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        q['_source'] = ['id']
        for res in scan(es, query=q, index=es_index, doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': es_index,
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(es, deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
def save_to_elasticsearch(pItemId, pItemName, ls_top_sim):
    bulk_data = []
    INDEX_NAME = "recsys_tfidf_result"
    es = elasticsearch.Elasticsearch([{"host": "10.220.83.22", "port": 9206}])
    # es.indices.delete(INDEX_NAME)
    # es.indices.create(INDEX_NAME)
    ls_rel_p = []
    for rel_prod in ls_top_sim:
        rel_prod_id = rel_prod[0]
        rel_prod_name = rel_prod[1]
        rel_score = rel_prod[2]
        rel_item = {}
        rel_item["similar_score"] = rel_score
        rel_item["similar_product_id"] = rel_prod_id
        rel_item["similar_product_name"] = rel_prod_name
        ls_rel_p.append(rel_item)
    op_dict = {
        "_type": "tfidf_result_type",
        "_index": INDEX_NAME,
        "_product_id": pItemId,
        "_product_name": pItemName,
        "_ls_similar_products": ls_rel_p,
        # "_rating": row[2]
    }
    bulk_data.append(op_dict)
    print(bulk_data)

    # bulk index the data
    print("bulk indexing...")
    # res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
    helpers.bulk(es, bulk_data)
def bulk_add_data(self, _index, _doc_type, dict_data=None):
    """
    Bulk-add data.
    :param _index: index name
    :param _doc_type: document type name
    :param dict_data: list of documents
    :return:
    """
    if not dict_data:
        dict_data = []
    # j = 0
    actions = []
    for value in dict_data:
        action = {
            "_index": _index,
            "_type": _doc_type,
            # "_id": j + 1,
            "_source": {"timestamp": datetime.now()}
        }
        action["_source"].update(value)
        actions.append(action)
        # j += 1
        if len(actions) == 500000:
            helpers.bulk(self.es, actions)
            del actions[0:len(actions)]
    if len(actions) > 0:
        helpers.bulk(self.es, actions)
def show_details():
    banking_list = driver.find_elements_by_xpath(
        '//table[@class="data yen_nyushutsukin_001"]/tbody/tr')
    actions = []
    for banking in banking_list:
        detail = banking.find_elements_by_tag_name('td')
        date = datetime.datetime.strptime(detail[0].text.replace('\n', ''),
                                          '%Y年%m月%d日')
        # Adjust for the JST (UTC+9) offset
        date = date + datetime.timedelta(hours=-9)
        payment = in_or_out_payment(detail[1].text, detail[2].text)
        remark = detail[3].text
        if is_salary(remark, date.day):
            # Move salary entries to the 1st of the following month
            date = (date + datetime.timedelta(days=30)).replace(day=1)
        doc = {'@timestamp': date, 'payment': payment, 'remark': remark}
        actions.append({'_index': 'mufg', '_type': 'mufg', '_source': doc})
    if len(actions) > 0:
        helpers.bulk(es, actions)
    try:
        driver.find_element_by_xpath('//a/img[@alt="新しい明細"]').click()
    except NoSuchElementException:
        return
    else:
        # No exception means there is a next page, so recurse into it
        show_details()
        actions.append({
            "_index": "pttpokemongo",
            "_op_type": "index",
            "_source": article
        })
    f.close()
    return actions


if __name__ == "__main__":
    es = Elasticsearch(hosts='localhost', port=9200)
    actions = load_data_and_convert_to_actions()
    # print(actions[0])
    print("original cases:", len(actions))
    success = 0
    for index in range(len(actions)):
        try:
            helpers.bulk(es, actions[index:index + 1])
            success += 1
        except Exception as e:
            print(str(e)[0:500])
            continue
    print("success cases:", success)
"zipcode": str(line['zipcode']), "geo": i18n['geo'] }) except: bulk_batch.append({ "id": n, "district_code": int(line['district_code']), "amphoe_code": int(line['amphoe_code']), "province_code": int(line['province_code']), "district": { "th": line['district'] }, "amphoe": { "th": line['amphoe'] }, "province": { "th": line['province'] }, "zipcode": str(line['zipcode']) }) if (n + 1) % size == 0: print('Batch:', n + 1) try: helpers.bulk(es, genBulk(bulk_batch)) bulk_batch = [] except Exception as e: print(str(e)) print('End:', n + 1) helpers.bulk(es, genBulk(bulk_batch)) print('Total', len(data))
def index(): try: ES_CLIENT.indices.delete("news") except Exception as e: print(e) ES_CLIENT.indices.create("news", ES_MAPPINGS) items = [] total_indexed, total_failed = 0, 0 for folder in CLEANDIR.iterdir(): if folder.name == '.gitignore': continue for i, file in enumerate(sorted(folder.iterdir())): if file.name == ".gitignore": continue print("Processing:", file.name) with open(file, "r") as _file: items.extend(json.loads(_file.read())) if i > 0 and i % CHUNKS[folder.name] == 0: print("Indexing", len(items)) for item in items: if 'sentiment' not in item['_source']: print("Fault", file.name) indexed, failed = helpers.bulk(ES_CLIENT, items, stats_only=True, raise_on_error=False) print("Indexed:", indexed) print("Failed:", failed) total_indexed += indexed total_failed += failed items = [] print("Final Indexing", len(items)) if len(items) != 0: indexed, failed = helpers.bulk(ES_CLIENT, items, stats_only=True, raise_on_error=False) print("Final Indexed:", indexed) print("Final Failed:", failed) total_indexed += indexed total_failed += failed print("Total Indexed:", total_indexed) print("Total Failed:", total_failed)
        embeddings = model.encode(questions[start_idx:end_idx], show_progress_bar=False)
        bulk_data = []
        for qid, question, embedding in zip(qids[start_idx:end_idx],
                                            questions[start_idx:end_idx],
                                            embeddings):
            bulk_data.append({
                "_index": 'quora',
                "_id": qid,
                "_source": {
                    "question": question,
                    "question_vector": embedding
                }
            })
        helpers.bulk(es, bulk_data)
        pbar.update(chunk_size)
except:
    print("During indexing an exception occurred. Continuing\n\n")

# Interactive search queries
while True:
    inp_question = input("Please enter a question: ")

    encode_start_time = time.time()
    question_embedding = model.encode(inp_question)
    encode_end_time = time.time()

    # Lexical search
    bm25 = es.search(index="quora",
# print(fprecs[fp]) docs = toESDocs(fprecs, fp_index) # for doc in docs: # print(doc) if UPDATE_ELASTICSEARCH: es.update(index=docindex, doc_type="document", id=docid, body=encodedData, _source=False, refresh="false") bulk(es, docs) except Exception as e: timestampPrint("ERROR: {}".format(e)) else: bads[badcount] = j_content badcount += 1 overallcount += 1 # update average ipend = time.time() thistime = ipend - ipstart peripaverage = ( (overallcount * peripaverage) + thistime) / (overallcount + 1) if overallcount % 5 == 0: print >> sys.stderr, "Reading fingerprints and rdns, did: " + str(overallcount) + \
print('Reading files ...')

# Reads all the documents in a directory tree and generates an index operation for each
ldocs = []
for f in lfiles:
    if nfiles <= 0:
        break
    nfiles -= 1
    ftxt = codecs.open(f, "r", encoding='iso-8859-1')
    text = ''
    for line in ftxt:
        text += line
    # Insert operation for a document with fields 'path' and 'text'
    ldocs.append({'_op_type': 'index',
                  '_index': index,
                  '_type': 'document',
                  'path': f,
                  'text': text})

# Working with ElasticSearch
client = Elasticsearch()
try:
    # Drop index if it exists
    ind = Index(index, using=client)
    ind.delete()
except NotFoundError:
    pass
# then create it
ind.settings(number_of_shards=1)
ind.create()

# Bulk execution of elasticsearch operations (faster than executing all one by one)
print('Indexing ...')
bulk(client, ldocs)
def main(): logFileName = '{}.log'.format(os.path.splitext(sys.argv[0])[0]) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler(logFileName), logging.StreamHandler()]) auth = get_config_params('config.ini') args = parse_args() view = "psra_{province}_uhs".format(**{'province': args.province.lower()}) limit = 10000 offset = 0 # create index es = Elasticsearch([auth.get('es', 'es_endpoint')], http_auth=(auth.get('es', 'es_un'), auth.get('es', 'es_pw'))) if es.indices.exists(view): es.indices.delete(view) # id_field = 'AssetID' settings = { 'settings': { 'number_of_shards': 1, 'number_of_replicas': 0 }, 'mappings': { 'properties': { 'coordinates': { 'type': 'geo_point' }, 'geometry': { 'type': 'geo_shape' } } } } es.indices.create(index=view, body=settings, request_timeout=90) while True: sqlquerystring = 'SELECT *, ST_AsGeoJSON(geom) \ FROM results_psra_{province}.{view} \ ORDER BY {view}."geom" \ LIMIT {limit} \ OFFSET {offset}'.format( **{ 'province': args.province.lower(), 'view': view, 'limit': limit, 'offset': offset }) offset += limit connection = None try: # Connect to the PostGIS database connection = psycopg2.connect( user=auth.get('rds', 'postgres_un'), password=auth.get('rds', 'postgres_pw'), host=auth.get('rds', 'postgres_host'), port=auth.get('rds', 'postgres_port'), database=auth.get('rds', 'postgres_db')) # Query the entire view with the geometries in geojson format cur = connection.cursor() cur.execute(sqlquerystring) rows = cur.fetchall() if rows: columns = [name[0] for name in cur.description] geomIndex = columns.index('st_asgeojson') feature_collection = { 'type': 'FeatureCollection', 'features': [] } # Format table into a geojson format for ES/Kibana consumption for row in rows: coordinates = json.loads(row[geomIndex])['coordinates'] feature = { 'type': 'Feature', 'geometry': json.loads(row[geomIndex]), 'coordinates': coordinates, 'properties': {}, } for index, column in enumerate(columns): if column != "st_asgeojson": value = row[index] feature['properties'][column] = value feature_collection['features'].append(feature) geojsonobject = json.dumps(feature_collection, indent=2, default=decimal_default) d = json.loads(geojsonobject) helpers.bulk(es, gendata(d, view), raise_on_error=False) else: if (connection): connection.close() return except (Exception, psycopg2.Error) as error: logging.error(error)
def bulk_indexing():
    BlogPostIndex.init()
    es = Elasticsearch()
    bulk(client=es, actions=(b.indexing() for b in models.BlogPost.objects.all().iterator()))
from elasticsearch import helpers, Elasticsearch
from celery import group
from time import sleep
import sys
import json

import bipolar  # project-local Celery tasks module

sqli_file = sys.argv[1]
urls = []
with open(sqli_file) as f:
    url_data = f.readlines()
for data in url_data:
    urls.append(data.strip())

my_group = group([bipolar.sqli_check.s(url) for url in urls])
group_results = my_group.apply_async()
print(group_results)
while not group_results.ready():
    print('waiting for jobs to complete')
    sleep(10)
group_results = group_results.get()

output = []
for results in group_results:
    if results is not None:
        for i in results:
            output.append(json.dumps(i))
print(output)

es = Elasticsearch(timeout=999999)
helpers.bulk(es, output, index='fnsqli', doc_type="doc")
def _bulk_operate_realize(self, to_be_done_list):
    helpers.bulk(self.middleware_opration_python_instance, to_be_done_list)
        # end of event group
        if thisEntityId != lastEntityId:
            if events:
                yield get_action(events)
                events = []
            lastEntityId = thisEntityId
        events.append(doc["_source"])
        numDocsProcessed += 1
        if numDocsProcessed % 10000 == 0:
            elapsedSecs = int(time.time() - start)
            dps = numDocsProcessed / max(1, elapsedSecs)
            print numDocsProcessed, "docs per second=", dps

    # load last event group too
    if events:
        yield get_action(events)
    print "Processed", numDocsProcessed, "docs"


start = time.time()
helpers.bulk(es, generate_actions(), index=args.entityIndexName,
             doc_type=args.entityDocType, chunk_size=args.actionsPerBulk)
elapsed = (time.time() - start)
print "elapsed time=", elapsed
"_type": "doc", "_source": { "id": lst_line[0], "judge_name": lst_line[1], "judge_status": lst_line[2], "person_cnt": lst_line[3], "company_cnt": lst_line[4], "court": lst_line[5], "courtlevel": lst_line[6], "docid": lst_line[7], "casereason": lst_line[8], "casetype": lst_line[9], "doctype": lst_line[10], "trialprocedure": lst_line[11], "judgeyear": lst_line[12], "judgemonth": lst_line[13], "judgetime": lst_line[14], "lawyers": lst_line[15], "partners": lst_line[16], "persons": lst_line[17], "companies": lst_line[18] } } for lst_line in lst) try: helpers.bulk(es, action) except: print('出现问题了') print('传输了{}条审判人员'.format(str(flag))) lst = [] file_open.close() print('上传完成{}'.format(name))
def bulk(self, actions, **kwargs):
    return bulk(client=self._get_connection(), actions=actions, **kwargs)
from elasticsearch import Elasticsearch, helpers
import sys
import json

es = Elasticsearch()


def load_json(filename):
    if filename.endswith('.json'):
        with open(filename, 'r') as open_file:
            yield json.load(open_file)


helpers.bulk(es, load_json(sys.argv[1]), index='sfn-tag-details')
def data_client(client):
    """Connects to client and stores some test index data in elasticsearch"""
    create_index(client)
    bulk(client, get_index_data(), raise_on_error=True, refresh=True)
    yield client
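# Hedged sketch (assumption, not from the original fixture): how a pytest test
# might consume the data_client fixture above. The index name and the shape of
# the "total" field are illustrative; on ES 7+ "total" is an object, not an int.
def test_index_contains_data(data_client):
    result = data_client.search(index="test-index", body={"query": {"match_all": {}}})
    assert result["hits"]["total"] > 0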
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None): """ Indexes documents for later queries in Elasticsearch. When using explicit document IDs, any existing document with the same ID gets updated. :param documents: a list of Python dictionaries or a list of Haystack Document objects. For documents as dictionaries, the format is {"text": "<the-actual-text>"}. Optionally: Include meta data via {"text": "<the-actual-text>", "meta":{"name": "<some-document-name>, "author": "somebody", ...}} It can be used for filtering and is accessible in the responses of the Finder. Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary should be changed to what you have set for self.text_field and self.name_field. :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used. :return: None """ if index and not self.client.indices.exists(index=index): self._create_document_index(index) if index is None: index = self.index # Make sure we comply to Document class format documents_objects = [ Document.from_dict(d, field_map=self._create_document_field_map()) if isinstance(d, dict) else d for d in documents ] documents_to_index = [] for doc in documents_objects: _doc = { "_op_type": "index" if self.update_existing_documents else "create", "_index": index, **doc.to_dict(field_map=self._create_document_field_map()) } # type: Dict[str, Any] # cast embedding type as ES cannot deal with np.array if _doc[self.embedding_field] is not None: if type(_doc[self.embedding_field]) == np.ndarray: _doc[self.embedding_field] = _doc[ self.embedding_field].tolist() # rename id for elastic _doc["_id"] = str(_doc.pop("id")) # don't index query score and empty fields _ = _doc.pop("score", None) _ = _doc.pop("probability", None) _doc = {k: v for k, v in _doc.items() if v is not None} # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores, # we "unnest" all value within "meta" if "meta" in _doc.keys(): for k, v in _doc["meta"].items(): _doc[k] = v _doc.pop("meta") documents_to_index.append(_doc) bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
        rtv += flattenChildren(child_key, child.get('children', []))
        if 'children' in child:
            del child['children']
        rtv.append(child)
    return rtv


def processDocs():
    data = parse_file()
    rtv = []
    for d in data:
        _id = unicode(uuid.uuid4())
        rtv += flattenChildren(_id, d.get('children', []))
        if 'children' in d:
            del d['children']
        d['rkw'] = _id
        d['keyword'] += ' ' + 'root'
        rtv.append(d)
    return rtv


if __name__ == '__main__':
    data = processDocs()
    print json.dumps(data)
    es.delete_by_query(index=config.index_name, body={"query": {"match_all": {}}})
    res = helpers.bulk(es, data, index=config.index_name, doc_type='doc')
    # pprint(res)
            # print type(doc_sanitized['oncology_sub_indication'])
            # print doc_sanitized['oncology_sub_indication'][0]

            # normalize investigators to authors
            if 'inventors' in doc_sanitized.keys():
                doc_sanitized['authors'] = doc_sanitized['inventors']
                del doc_sanitized['inventors']

            action = {
                "_index": "kols_patents_new",
                "_type": "patents_new",
                "_id": objectId,
                "_source": doc_sanitized
            }
            actions.append(action)
            # print doc_sanitized
            # break
            # ret_val = es.index(index="kols_congresses_new", doc_type="congresses_new", ignore=400, body=doc_sanitized, request_timeout=60)
            # if ret_val['created'] != True:
            #     print ret_val['created']
            #     print ret_val
        except Exception, e:
            print "error...", e
            break
        # print x['_id']
        # print ret_val

    if (len(actions) > 0):
        helpers.bulk(es, actions, chunk_size=50, request_timeout=50)
def bulk(self, actions, **kwargs):
    return bulk(client=self.connection, actions=actions, **kwargs)
        # if row count equals or exceeds max rows
        if args.max_rows > 0 and row_count >= args.max_rows:
            # break out of reading loop
            break

        # if row count is a multiple of the flush count value
        if row_count % args.flush_rows == 0:
            # flush accumulated rows to target file
            out_file.flush()
            if args.out_elastic_search == 'Y' and len(es_actions) > 0:
                helpers.bulk(es, es_actions)
                es_actions.clear()

    # ending time hack
    end_time = time()

    # compute records/second
    seconds = end_time - bgn_time
    if seconds > 0:
        rcds_per_second = row_count / seconds
    else:
        rcds_per_second = 0

    # output progress message
    message = "Processed: {:,} rows in {:,.0f} seconds @ {:,.0f} records/second".format(
        row_count, seconds, rcds_per_second)
    print(message)
def index(self):
    action_generator = self.get_document_create_bulk_op
    return es_helpers.bulk(self.es, action_generator())
while temp_i < temp_max_i: temp_i += 1 t_actions, t_wiki_pages_id_index = procs.pop(0).get() actions += t_actions wiki_pages_id_index.update(t_wiki_pages_id_index) temp_max_i = raw_temp_max_i for proc in procs: t_actions, t_wiki_pages_id_index = proc.get() actions += t_actions wiki_pages_id_index.update(t_wiki_pages_id_index) procs = [] temp_time = time.time() print("bulk index batch") temp_success, temp_fail = elastic_helper.bulk(es, actions) print("Time taken: ", time.strftime("%H:%M:%S", time.gmtime(time.time() - temp_time))) es.indices.refresh(index="wiki-pages-index") json_filename = "wiki_pages_id_index.json" dict_path = os.path.join(output_index_path, json_filename) temp_time = time.time() print("saving wiki id index as json file") helper.save_dict_json(wiki_pages_id_index, dict_path) print("Time taken: ", time.strftime("%H:%M:%S", time.gmtime(time.time() - temp_time))) print("-----Total time taken: ", time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
# outputs epoch seconds, ES can read without any weird configs created_at_epoch = int( dateutil.parser.parse(tweet["created_at"]).timestamp()) # tweet schema esTweet = { "created_at": created_at_epoch, "urls": expandedUrls, "text": text, "favorite_count": tweet["favorite_count"], "_id": tweet["id_str"], "user-screen_name": tweet["user"]["screen_name"], "quote_count": tweet["quote_count"], "reply_count": tweet["reply_count"], "retweet_count": tweet["retweet_count"], "lang": tweet["lang"], "s3_file_path": s3Object["Key"], } # append tweet to list to be stored tweets.append(esTweet) # needed to set the index to store the tweet year_month = (str(s3Object["LastModified"].year) + "-" + str(s3Object["LastModified"].month)) es_index = "tweets-" + year_month # stores all tweets in list to Elasticsearch helpers.bulk(es, tweets, index=es_index) lastKey = s3Object["Key"]
def main(): print("Connecting to ES...") es = Elasticsearch(hosts=[{"host":'elasticsearch'}]) if not es.ping(): raise ValueError("Connection failed") else: print('Connected to ES') print("Connecting to MySQL...") conn= pymysql.connect(host='conceptlights_db_1',user='******',password='******',db='dboe_1',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor) if conn.open: print('Connected to MySQL') else: print('Connection to MySQL failed') if es.indices.exists(index='dboe'): print('dboe index exists, deleting...') if es.indices.delete(index='dboe'): print('dboe index deleted, will reindex now.') body = { "settings" : { "number_of_shards": 1, "number_of_replicas": 0 }, "mappings": { "dboe-type": { "properties": { "location" : { "type" : "geo_point" } } } }} es.indices.create( index='dboe', ignore=400, body=body ) global db_cur db_cur = conn.cursor() actions = [] rootPath = './data' pattern = '*1_qdb-TEI-02.xml' #WIP: Test only with entries starting with 'm,n,o,p,q' for the moment listplace_path = './data/helper_tables/listPlace-id.xml' fragebogen_concepts_path = './data/frage-fragebogen-full-tgd01.xml' q_regex = r"^(\d+)(\w+)" # q_head_regex = r"pc> (.*)<" with open(listplace_path, "r", encoding="utf-8") as listplace_file, \ open(fragebogen_concepts_path, "r", encoding="utf-8") as fragebogen_concepts_file: listplace_soup = BeautifulSoup(listplace_file, 'xml') fragebogen_concepts_soup = BeautifulSoup(fragebogen_concepts_file,'xml') stop_words = get_stop_words('de') #Walk data dir extracting the different entries for root, dirs, files in os.walk(rootPath): for filename in fnmatch.filter(files, pattern): print(os.path.join(root, filename)) soup = BeautifulSoup(open(os.path.join(root, filename), "r", encoding="utf-8"), 'xml') for entry in soup.find_all("entry"): entry_obj= {} questionnaire = entry.findAll( "ref", {"type": "fragebogenNummer"}) if len(questionnaire) > 0: entry_obj['source_question_title'] = questionnaire[0].string match = re.match(q_regex, entry_obj['source_question_title']) if match: entry_obj['questionnaire_number'] = match.group(1) entry_obj['question'] = match.group(2) questionnaire_label = fragebogen_concepts_soup.find("label", text="Fragebogen " + entry_obj['questionnaire_number']) if questionnaire_label: questionnaire_head = questionnaire_label.parent entry_obj['questionnaire_label'] = questionnaire_head.contents[4] questionnaire = questionnaire_head.parent question = questionnaire.find('item', {"n" : entry_obj['question']}) if question: if question.label: entry_obj['question_label'] = question.label.string #label ? concepts = question.find_all('seg', attrs={"xml:id":True}) if len(concepts) > 0: # print('Question {} relates to the following concepts:'.format(item.get('n'))) concepts_set = set() for concept in concepts: # print(concept.string) if concept.string is not None and concept.string not in stop_words and "." 
not in concept.string and len(concept.string) > 1: concepts_set.add(concept.string) entry_obj['question_concepts'] = list(concepts_set) else: continue else: print('Questionnaire ' + entry_obj['questionnaire_number'] + ' could not be found') else: continue entry_obj['main_lemma'] = str(entry.form.orth.string) if len(entry_obj['main_lemma']) == 0: continue entry_obj['id'] = entry['xml:id'] #part of speech entry_obj['pos'] = str(entry.gramGrp.pos.string) if entry.sense: entry_obj['sense'] = entry.sense.text.replace('\n', '') if entry.note: entry_obj['note'] = entry.note.text.replace('\n', '') source = entry.findAll( "ref", {"type": "quelle"}) if len(source) > 0: entry_obj['source'] = source[0].string revised_source = entry.findAll( "ref", {"type": "quelleBearbeitet"}) if len(revised_source) > 0: entry_obj['revised_source'] = revised_source[0].text usg = entry.find('usg') if not usg: continue else: list_place = usg.find("listPlace", recursive=False) if not list_place: continue else: geo_dict = process_listplace_node(listplace_soup, list_place) entry_obj.update(geo_dict) actions.append({ '_index': 'dboe', '_type': 'dboe-type', '_source': entry_obj}) if len(actions) > 50: bulk(es, actions) actions = [] print('Done') conn.close() exit(0)
def bulk_add_documents(self, listOfDocs):
    bulk(self.es, listOfDocs, raise_on_error=False, refresh='wait_for')
def update_index_for_model(model, batch_size=256, batches_per_commit=10, firstpk=0): """ More efficient update of the search index for large models such as Paper :param batch_size: the number of instances to retrieve for each query :param batches_per_commit: the number of batches after which we should commit to the search engine :param firstpk: the instance to start with. """ using_backends = haystack.connection_router.for_write() if len(using_backends) != 1: raise ValueError("Don't know what search index to use") engine = haystack.connections[using_backends[0]] backend = engine.get_backend() index = engine.get_unified_index().get_index(model) qs = model.objects.order_by('pk') lastpk_object = list(model.objects.order_by('-pk')[:1]) if not lastpk_object: # No object in the model return lastpk = lastpk_object[0].pk batch_number = 0 # rate reporting indexed = 0 starttime = datetime.utcnow() while firstpk < lastpk: batch_number += 1 prepped_docs = [] for obj in qs.filter(pk__gt=firstpk)[:batch_size]: firstpk = obj.pk try: prepped_data = index.full_prepare(obj) final_data = {} # Convert the data to make sure it's happy. for key, value in list(prepped_data.items()): final_data[key] = backend._from_python(value) final_data['_id'] = final_data[ID] prepped_docs.append(final_data) except SkipDocument: continue documents_sent = False while not documents_sent: try: bulk(backend.conn, prepped_docs, index=backend.index_name, doc_type='modelresult') documents_sent = True except ConnectionTimeout as e: logger.warning(e) logger.info('retrying') sleep(30) indexed += len(prepped_docs) if batch_number % batches_per_commit == 0: backend.conn.indices.refresh(index=backend.index_name) if indexed >= 5000: curtime = datetime.utcnow() rate = int(indexed / (curtime - starttime).total_seconds()) logger.info("%d obj/s, %d / %d" % (rate, firstpk, lastpk)) starttime = curtime indexed = 0
def single_bulk_to_es(bulk, config):
    bulk = bulk_builder(bulk, config)
    helpers.bulk(config['es_conn'], bulk, chunk_size=config['bulk_size'])
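# Hedged sketch (assumption): the `config` dict consumed by single_bulk_to_es
# above is not defined in this snippet; a minimal shape that would satisfy it
# might look like this. The host and bulk_size values are illustrative only.
from elasticsearch import Elasticsearch

config = {
    'es_conn': Elasticsearch(['http://localhost:9200']),
    'bulk_size': 500,
}
# single_bulk_to_es(docs_chunk, config)  # docs_chunk: an iterable of documents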
    result = WordCloud(max_words=50).generate(string)
    result.to_file(text_query + ".png")


if __name__ == "__main__":
    es = ES
    es.indices.delete("final")
    if not es.indices.exists("final"):
        print("No index found.")
    es.indices.create("final")

    for folder in os.listdir(FOLDER):
        print("Now indexing folder", folder)
        folder = FOLDER + folder + '/'
        num = (len(os.listdir(folder)) // BULK_SIZE) + 1
        for i in range(num):
            print("Bulk", i, "of", num)
            i *= BULK_SIZE
            bulk(es, json_to_bulk(folder, 'final', i, BULK_SIZE), stats_only=True)
    print("\nTest index is built.")

    query = text_to_query("theory date = '2001:2003'")
    print(query, end='\n\n')
    results = es.search(index="final", body=query)['hits']['hits']
    print("Found", len(results), "results")
    word_cloud("theory date = '2001:2003'", results)
def execute_query(self, querystring=""): """ Not a test. This method is doing the heavy lifting for the tests in this class: create and fill the index with our courses so we can run our queries and check our facet counts. It also executes the query and returns the result from the API. """ # Create the subject category page. This is necessary to link the subjects we # defined above with the "subjects" filter # As it is the only page we create, we expect it to have the path "0001" CategoryFactory(page_reverse_id="subjects", should_publish=True) # Index these 4 courses in Elasticsearch indices_client = IndicesClient(client=ES_CLIENT) # Delete any existing indices so we get a clean slate indices_client.delete(index="_all") # Create an index we'll use to test the ES features indices_client.create(index="test_courses") indices_client.close(index="test_courses") indices_client.put_settings(body=ANALYSIS_SETTINGS, index="test_courses") indices_client.open(index="test_courses") # Use the default courses mapping from the Indexer indices_client.put_mapping( body=CoursesIndexer.mapping, doc_type="course", index="test_courses" ) # Add the sorting script ES_CLIENT.put_script(id="score", body=CoursesIndexer.scripts["score"]) ES_CLIENT.put_script( id="state_field", body=CoursesIndexer.scripts["state_field"] ) # Actually insert our courses in the index actions = [ { "_id": course["id"], "_index": "test_courses", "_op_type": "create", "_type": "course", "absolute_url": {"en": "url"}, "cover_image": {"en": "image"}, "title": {"en": "title"}, **course, "course_runs": [ { "languages": course_run["languages"], "start": arrow.utcnow().datetime, "end": arrow.utcnow().datetime, "enrollment_start": arrow.utcnow().datetime, "enrollment_end": arrow.utcnow().datetime, } for course_run in course["course_runs"] ], } for course in COURSES ] bulk(actions=actions, chunk_size=500, client=ES_CLIENT) indices_client.refresh() response = self.client.get(f"/api/v1.0/courses/?{querystring:s}") self.assertEqual(response.status_code, 200) return json.loads(response.content)
def store(self, report):
    sample_ids = {}
    sample_list = []

    for filename in report:
        report[filename]['filename'] = filename
        try:
            sample_id = report[filename]['SHA256']
        except KeyError:
            sample_id = uuid4()

        # Store metadata with the sample, not the report
        sample = {'filename': filename, 'tags': []}
        for field in METADATA_FIELDS:
            if field in report[filename]:
                if len(report[filename][field]) != 0:
                    sample[field] = report[filename][field]
                del report[filename][field]

        # If there are Cuckoo results in the report, some
        # cleanup is needed for the report
        if 'Cuckoo Sandbox' in report[filename].keys():
            cuckoo_report = report[filename]['Cuckoo Sandbox']
            cuckoo_doc = {
                'target': cuckoo_report.get('target'),
                'summary': cuckoo_report.get('behavior', {}).get('summary'),
                'info': cuckoo_report.get('info')
            }
            signatures = cuckoo_report.get('signatures')
            if signatures:
                cuckoo_doc['signatures'] = process_cuckoo_signatures(signatures)

            dropped = cuckoo_report.get('dropped')
            if dropped:
                cuckoo_doc['dropped'] = dropped

            procmemory = cuckoo_report.get('procmemory')
            if procmemory:
                cuckoo_doc['procmemory'] = procmemory

            # TODO: add the API calls to the Cuckoo Report document
            # for process in cuckoo_report.get('behavior', {}).get('processes', []):
            #     process_pid = process['pid']
            #     cuckoo_doc['calls'] = {}
            #     cuckoo_doc['calls'][process_pid] = []
            #     for call in process['calls']:
            #         cuckoo_doc['calls'][process_pid].append(call)

            report[filename]['Cuckoo Sandbox'] = cuckoo_doc

        # Store the report; let ES autogenerate the ID so we can save it with the sample
        try:
            report_result = self.es.index(index=self.index, doc_type=self.doc_type,
                                          body=report[filename], parent=sample_id,
                                          pipeline='dedot')
        except (TransportError, UnicodeEncodeError) as e:
            # If indexing fails, index an empty doc instead
            print('Failed to index that report!\n{}'.format(e))
            report_body_fail = {
                'ERROR': 'Failed to index the full report in Elasticsearch',
                'Scan Time': report[filename]['Scan Time']
            }
            report_result = self.es.index(index=self.index, doc_type=self.doc_type,
                                          body=report_body_fail, parent=sample_id,
                                          pipeline='dedot')

        report_id = report_result.get('_id')
        sample['report_id'] = report_id
        sample_ids[sample_id] = report_id

        sample_list.append({
            '_op_type': 'create',
            '_index': self.index,
            '_type': 'sample',
            '_id': sample_id,
            '_source': sample,
            'pipeline': 'dedot'
        })

    result = helpers.bulk(self.es, sample_list, raise_on_error=False)
    creation_errors = result[1]
    if not creation_errors:
        return sample_ids

    # Some samples already exist; update them to reference the new reports
    updates_list = []
    for err in creation_errors:
        if err['create']['status'] == 409:
            sid = err['create']['_id']
            rid = sample_ids[sid]
            updates_list.append({
                '_op_type': 'update',
                '_index': self.index,
                '_type': 'sample',
                '_id': sid,
                'doc': {'report_id': rid},
                'pipeline': 'dedot'
            })

    result = helpers.bulk(self.es, updates_list, raise_on_error=False)
    return sample_ids