def execute(self): """ Index data of specified queryset """ start_time = time.time() for qs, progress in self.batch_qs(): elapsed = time.time() - start_time total_left = (1 / (progress + 0.001)) * elapsed - elapsed progres_msg = \ 'PART: %s %.3f : duration: %.2f left: %.2f' % ( self.part, progress, elapsed, total_left ) log.info(progres_msg) helpers.bulk( self.client, (self.convert(obj).to_dict(include_meta=True) for obj in qs), raise_on_error=True, ) if settings.TESTING and self.index: idx = es.Index(self.index) # refresh index, make sure its ready for queries idx.refresh()
def __connect_to_elastic(self):
    # Create a connection to Elasticsearch.
    connections.create_connection(
        hosts=settings.ELASTIC_SEARCH_HOSTS,
        retry_on_timeout=True,
    )
    return es.Index(self.index)
def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
    es_doc = es_doc_cls()

    elasticsearch_conn = connections.get_connection()

    sync_timestamp = current_server_timestamp()

    pending_insertions = self._compute_dirty_documents(
        sql_table_cls, es_doc.doc_type)

    bulk_op = self._synchronisation_op(es_doc, pending_insertions)

    self._logging(logging.INFO, 'Performing synchronization.')

    for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
        obj_id = info['index']['_id'] \
            if 'index' in info else info['update']['_id']

        if ok:
            # Mark the document as handled so we don't reprocess it next time
            self._logging(
                logging.INFO,
                'Document %s has been synced successfully.' % obj_id)
            sql_table_cls.update_last_sync(obj_id, sync_timestamp)
        else:
            id_logger(obj_id, logging.ERROR,
                      'Error while syncing document %s index.' % obj_id)

    # Refresh indices to speed up subsequent searches
    elasticsearch_dsl.Index(es_doc.index).refresh()
def build_custom_dict(self):
    df = pd.read_excel(os.path.join(MEDIA_ROOT, 'dict.xlsx'))
    number_of_words = len(df)

    index = es.Index(ES_INDEX_CUSTOM_DICTIONARY_WORD, using=ES_CLIENT)
    index.delete(ignore=404)
    print("Creating index")
    CustomDictionaryWord.init()

    failed, success = 0, 0
    batch_size = 1000
    for ok, result in parallel_bulk(ES_CLIENT, self.word_generator(df),
                                    index=ES_INDEX_CUSTOM_DICTIONARY_WORD,
                                    chunk_size=batch_size,
                                    raise_on_error=False,
                                    thread_count=6):
        if ok:
            success += 1
        else:
            failed += 1
            action, result = result.popitem()
            print("!!!", action, result)
        if failed > 3:
            raise Exception("Too many failed!!")
        if (success + failed) % batch_size == 0:
            print(f'{success + failed}/{number_of_words} processed')
def execute(self):
    idx = es.Index(self.index)

    for dt in self.doc_types:
        idx.document(dt)

    idx.create()
    idx.refresh()
def create_new_connection(address, index='training_jobs'):
    """Creates a new connection to Elasticsearch."""
    elasticsearch_dsl.connections.create_connection(
        hosts=[address]
    )
    return elasticsearch_dsl.Index(index)
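# A minimal usage sketch for create_new_connection() above (not part of the
# original snippet). The host string and the existence check are assumptions:
# it presumes a reachable Elasticsearch node at that address.
job_index = create_new_connection('localhost:9200')
if not job_index.exists():
    job_index.create()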
def test_index(self):
    """Initialize test index"""
    DataDocType.init()
    index = es.Index("test")
    try:
        index.delete()
    except elasticexceptions.NotFoundError:
        assert False
def create_index(index_name):
    """
    Creates a new index, destroying any existing index with the same name.
    """
    index = edsl.Index(index_name)
    index.settings(number_of_shards=1)
    if index.exists():
        index.delete()
        print('old index deleted')
    index.create()
    return index
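# Hedged usage sketch for create_index() above, assuming `edsl` is the
# elasticsearch_dsl module and a default connection has already been
# registered; the index name is illustrative only.
scratch = create_index('scratch-index')
print(scratch.exists())  # the freshly created index now exists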
def execute(self):
    idx = es.Index(self.index)

    try:
        idx.delete(ignore=404)
        log.info("Deleted index %s", self.index)
    except AttributeError:
        log.warning("Could not delete index '%s', ignoring", self.index)
    except NotFoundError:
        log.warning("Index '%s' not found, ignoring", self.index)
def create_indices(scanner):
    for regex in scanner.regexes:
        id_ = regex.id.lower()
        index_name = f'{INDEX_PREFIX}-{id_}'.lower()

        index = es_dsl.Index(index_name)
        if index.exists():
            index.delete()
        index.create()

        mapping = es_dsl.Mapping()
        add_field_mappings(id_, regex, mapping)
        mapping.save(index_name)
def execute(self):
    idx = es.Index(self.index)

    try:
        idx.delete(ignore=404)
        log.info("Deleted index %s", self.index)
    except AttributeError:
        log.warning("Could not delete index '%s', ignoring", self.index)

    for dt in self.doc_types:
        idx.document(dt)

    idx.create()
def handle(self, *args, **options):
    index_name = es_index_name
    elasticsearch_dsl.connections.connections.create_connection(hosts=["127.0.0.1"])
    if elasticsearch_dsl.Index(name=index_name).exists():
        elasticsearch_dsl.Index(name=index_name).delete()
    for process in process_all:
        print("Now processing %r" % process.pk)
        host = Hosts.objects.get(server_uuid=process.server_uuid)
        esp = es_docs.EsProcess(meta={'id': process.pk},
                                id=process.pk,
                                p_name=process.p_name,
                                p_status=process.p_status,
                                p_cwd=process.p_cwd,
                                p_exe=process.p_exe,
                                p_username=process.p_username,
                                p_create_time=process.p_create_time,
                                p_cmdline=process.p_cmdline,
                                listen_ip_port=process.listen_ip_port,
                                # server_uuid=host.server_uuid,
                                server=host.ip_addresses,
                                old_mark=process.old_mark)
        esp.save()
def es_client():
    client = _es_client()

    yield client

    # Push all changes to segments to make sure all annotations that were added get removed.
    elasticsearch_dsl.Index(client.index, using=client.conn).refresh()

    client.conn.delete_by_query(
        index=client.index,
        body={"query": {"match_all": {}}},
        # This query occasionally fails with a version conflict.
        # Forcing the deletion resolves the issue, but the exact
        # cause of the version conflict has not been found yet.
        conflicts="proceed",
        # Add refresh to make deletion changes show up in search results.
        refresh=True,
    )
def clear_index(cls, index=None):
    """
    Clears the index.
    """
    if not index:
        index = cls.get_index_config()
    if not index:
        raise Exception('Index not found!')

    connections.create_connection(hosts=index['connection']['hosts'])
    try:
        index_instance = es.Index(index['index_name'])
        index_instance.delete()
    except Exception:
        pass
    connections.remove_connection(index['connection_name'])
def handle(self, *args, **options):
    self.batch_size = options['batch_size']
    self.from_id = 0
    if "from_id" in options:
        self.from_id = options['from_id']
    self.to_id = None
    if "to_id" in options:
        self.to_id = options['to_id']
    self.client = ES_CLIENT

    if not self.from_id:
        print("Deleting index")
        index = es.Index(ES_INDEX_DOCUMENT, using=self.client)
        index.delete(ignore=404)
        print("Creating index")
        ESDocument.init()

    self.send_elastic()
def _get_elasticsearch_index_samples(elasticsearch_index):
    sample_field_suffix = '_num_alt'

    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=get_es_client())
    try:
        field_mapping = index.get_field_mapping(fields=['*{}'.format(sample_field_suffix)],
                                                doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([key.split(sample_field_suffix)[0]
                        for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))

    return samples
def execute(self):
    index = current_app.config['ELASTIC_INDICES'][self.index_key]
    idx = es.Index(index)

    try:
        idx.delete(ignore=404)
    except NotFoundError:
        log.warning("Could not delete index '%s', ignoring", index)
    else:
        log.info("Deleted index %s", index)

    # create doc types
    for dt in self.doc_types:
        idx.doc_type(dt)

    # create index
    idx.create()
def _perform_index_purge(self, index_name, index_settings, doc_type_class):
    log_msg = 'Dropping %s index.' % index_name
    self._logging(logging.INFO, log_msg)

    index = elasticsearch_dsl.Index(index_name)
    index.settings(**index_settings)
    index.doc_type(doc_type_class)

    try:
        index.delete(ignore=404)
        index.create()
    except elasticsearch.exceptions.ElasticsearchException as e:
        log_msg = 'Error while dropping %s index: %s.' % (index_name, e)
        self._logging(logging.ERROR, log_msg)
        return

    log_msg = 'Index %s has been dropped successfully.' % index_name
    self._logging(logging.INFO, log_msg)
def create_index(self, index_name, run_logs):
    """
    Creates a new index, destroying any existing index with the same name.
    """
    index = edsl.Index(index_name)
    index.settings(number_of_shards=3)

    if index.exists():
        index.delete()
        run_logs.insert_log('old {0} index deleted'.format(index_name))

    if index_name == "enc_dates_NOT_NOW":
        run_logs.insert_log("entered analyzer for {0}".format(index_name))
        enc_analyzer = analyzer('enc_analyzer', tokenizer="whitespace", filter=['lowercase'])
        index.analyzer(enc_analyzer)

    index.create()
    run_logs.insert_log('new {0} index created'.format(index_name))
    return index
def es_client():
    client = _es_client()

    yield client

    # Push all changes to segments to make sure all annotations that were added get removed.
    elasticsearch_dsl.Index(client.index, using=client.conn).refresh()

    # Pylint can't understand the ES library
    # pylint: disable=unexpected-keyword-arg
    client.conn.delete_by_query(
        index=client.index,
        body={"query": {"match_all": {}}},
        # This query occasionally fails with a version conflict.
        # Forcing the deletion resolves the issue, but the exact
        # cause of the version conflict has not been found yet.
        conflicts="proceed",
        # Add refresh to make deletion changes show up in search results.
        refresh=True,
    )

    # Close the connection to the ES server to avoid a ResourceWarning about a leaked TCP socket.
    client.close()
def _get_elasticsearch_index_samples(elasticsearch_index, project):
    sample_field_suffix = '_num_alt'

    es_client = get_es_client(timeout=30)
    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix), 'join_field'],
            doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    # Nested genotypes
    if field_mapping.get(elasticsearch_index, {}).get('mappings', {}).get(VARIANT_DOC_TYPE, {}).get('join_field'):
        max_samples = Individual.objects.filter(family__project=project).count()
        s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms', field='sample_id', size=max_samples))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0]
            for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])

    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))

    return samples
def _perform_geocomplete_index_population(self, max_doc):
    elasticsearch_conn = connections.get_connection()

    to_index = list()

    for i, document in enumerate(self._geocompletion_documents()):
        if i % max_doc == 0:
            log_msg = 'Computing required geoloc-entry documents.'
            self._logging(logging.INFO, log_msg)

        to_index.append(document.to_dict(True))

        if len(to_index) < max_doc:
            continue

        self._geocomplete_index_batch(elasticsearch_conn, to_index)
        to_index = list()

    if len(to_index) != 0:
        self._geocomplete_index_batch(elasticsearch_conn, to_index)

    elasticsearch_dsl.Index('geocomplete').refresh()
def _get_elasticsearch_index_samples(elasticsearch_index):
    es_client = get_es_client()

    # Nested genotypes
    if is_nested_genotype_index(elasticsearch_index):
        s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms', field='samples_num_alt_1', size=10000))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    sample_field_suffix = '_num_alt'

    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix)],
            doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0]
            for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])

    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))

    return samples
import ast
import logging
import pathlib
import re
import typing
from collections import namedtuple

import elasticsearch_dsl
import elasticsearch_dsl.connections
import pandas as pd
from pandas.errors import EmptyDataError

TRAINING_JOBS = 'training_jobs'
VALIDATION_JOBS = 'validation_jobs'
JOB_INDEX = elasticsearch_dsl.Index(TRAINING_JOBS)
VALIDATION_JOB_INDEX = elasticsearch_dsl.Index(VALIDATION_JOBS)

Metrics = namedtuple('Metrics', [
    'epochs', 'train_acc', 'final_val_acc', 'best_val_acc', 'final_val_loss',
    'best_val_loss', 'final_val_sensitivity', 'best_val_sensitivity',
    'final_val_specificity', 'best_val_specificity'
])


class TrainingJob(elasticsearch_dsl.Document):
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    ended_at = elasticsearch_dsl.Date()
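# A minimal sketch (not part of the original module) showing how the index and
# document above could be wired together. It assumes a default connection has
# been registered via elasticsearch_dsl.connections.create_connection();
# field values are illustrative only.
JOB_INDEX.document(TrainingJob)  # attach the Document class to the index
if not JOB_INDEX.exists():
    JOB_INDEX.create()

job = TrainingJob(job_name='example-job', author='someone', schema_version=1)
job.save(index=TRAINING_JOBS)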
import elasticsearch_dsl as es
from django.conf import settings
from elasticsearch_dsl import analyzer, tokenizer

dutch_analyzer = es.analyzer('dutchanalyzer', type='standard', stopwords='_dutch_')

base_analyzer = analyzer('zorg_base_txt',
                         tokenizer=tokenizer('trigram', 'nGram', min_gram=2, max_gram=20),
                         filter=['lowercase'])

_index = es.Index(settings.ELASTIC_INDEX)


@_index.doc_type
class Term(es.DocType):
    term = es.Text()
    gewicht = es.Integer()


@_index.doc_type
class Organisatie(es.DocType):
    ext_id = es.String(index='not_analyzed')
    naam = es.String(analyzer=dutch_analyzer)  # ngram
    beschrijving = es.String(analyzer=dutch_analyzer)
    afdeling = es.String(index='not_analyzed')
import datetime
from datetime import timedelta
import logging

from six import iteritems, itervalues
import elasticsearch_dsl as esd

from .. import app

PhotoIndex = esd.Index(app.config["ELASTICSEARCH_INDEX"])
# This allows bigger pagination in /photos. Going past 10000/20 pages probably doesn't
# make a whole lot of sense, and this should be solved differently, but until that time..
PhotoIndex.settings(max_result_window=500000)


class ExtendedDateHistogramFacet(esd.DateHistogramFacet):
    # Temporary until the elasticsearch-dsl library includes the 'year' range
    DATE_INTERVALS = {
        'year': lambda d: (d + timedelta(days=366)).replace(day=1),
        'month': lambda d: (d + timedelta(days=32)).replace(day=1),
        'week': lambda d: d + timedelta(days=7),
        'day': lambda d: d + timedelta(days=1),
        'hour': lambda d: d + timedelta(hours=1),
    }


@PhotoIndex.doc_type
class PhotoDocument(esd.DocType):
    date = esd.Date()
    aperture = esd.Float()
    exposure = esd.Float()
from __future__ import (
    division,
    print_function,
    unicode_literals,
)

import elasticsearch
import elasticsearch_dsl as dsl
from flask import current_app as app

from ..versioning import ArchivingDocType
from .exceptions import ConflictError, NotFoundError
from .aliases import get_alias, unalias

auth_index = dsl.Index('auth')
auth_index.settings(
    number_of_shards=2,
    number_of_replicas=1
)


class Customer(ArchivingDocType):
    """Model a customer."""

    name = dsl.Keyword()
    permissions = dsl.Object()
    cycles = dsl.Object()

    class Meta:
        index = auth_index._name
    doc_id = es_dsl.Integer()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Create or reupload index')
    # parser.add_argument('--input', type=str,
    #                     help='Json file with preprocessed data')
    parser.add_argument('-i', '--index', type=str, help='Index name to create')
    args = parser.parse_args()

    indexName = args.index  # "query_tips_index_v0.3"
    # path = args.input  # "../data/data/data_clear_es_pm.json"
    properties = ["request", "request_lemms", "popularity"]  # without id

    esClient = Elasticsearch()

    index = es_dsl.Index(indexName, using=esClient)
    index.delete(ignore=404)
    index.settings(number_of_shards=1, number_of_replicas=0, analysis=analysis)
    index.doc_type(queryTipsIndex_doctype)  # <-- CHANGE NAME
    index.create()
    queryTipsIndex_doctype.init(using=esClient)

    with open('./vidal_total_dict.json', 'r', encoding='utf-8') as f_v:
        vidal_data = json.load(f_v)

    i = 0
    # The JSON has the form: [1:{original:.. norm:.. llt_id:.. pt_id:..} 2:...]
    for elem in vidal_data.items():
        # print(normalize_text(elem[0]))
        # if i % 800 == 0: