def get_fields(self):
    """
    Get provider field information (names, types)

    :returns: dict of fields
    """

    fields_ = {}
    ic = IndicesClient(self.es)
    ii = ic.get(self.index_name)

    try:
        if '*' not in self.index_name:
            p = ii[self.index_name]['mappings']['properties']['properties']
        else:
            LOGGER.debug('Wildcard index; setting from first match')
            index_name_ = list(ii.keys())[0]
            p = ii[index_name_]['mappings']['properties']['properties']
    except KeyError:
        LOGGER.debug('ES index looks generated by GDAL')
        self.is_gdal = True
        p = ii[self.index_name]['mappings']

    for k, v in p['properties'].items():
        if 'type' in v:
            if v['type'] == 'text':
                fields_[k] = {'type': 'string'}
            elif v['type'] == 'date':
                fields_[k] = {'type': 'string', 'format': 'date'}
            else:
                fields_[k] = {'type': v['type']}

    return fields_

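# A minimal, self-contained sketch of the mapping shape get_fields() above
# expects (Elasticsearch 7.x, no doc types): GeoJSON feature attributes live
# under a nested 'properties' field, hence the doubled lookup. The sample
# index and field names here are illustrative, not taken from a real index.
sample_mapping = {
    'my-index': {
        'mappings': {
            'properties': {
                'geometry': {'type': 'geo_shape'},
                'properties': {  # feature properties, one level down
                    'properties': {
                        'name': {'type': 'text'},
                        'datetime': {'type': 'date'},
                        'value': {'type': 'float'},
                    }
                }
            }
        }
    }
}

p = sample_mapping['my-index']['mappings']['properties']['properties']
fields = {}
for k, v in p['properties'].items():
    if 'type' in v:
        if v['type'] == 'text':
            fields[k] = {'type': 'string'}
        elif v['type'] == 'date':
            fields[k] = {'type': 'string', 'format': 'date'}
        else:
            fields[k] = {'type': v['type']}
# fields == {'name': {'type': 'string'},
#            'datetime': {'type': 'string', 'format': 'date'},
#            'value': {'type': 'float'}}
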
def get_fields(self):
    """
    Get provider field information (names, types)

    :returns: dict of fields
    """

    fields_ = {}
    ic = IndicesClient(self.es)
    ii = ic.get(self.index_name)

    try:
        p = ii[self.index_name]['mappings']['properties']['properties']  # noqa
    except KeyError:
        LOGGER.debug('ES index looks generated by GDAL')
        self.is_gdal = True
        p = ii[self.index_name]['mappings']

    for k, v in p['properties'].items():
        if 'type' in v:
            if v['type'] == 'text':
                type_ = 'string'
            else:
                type_ = v['type']
            fields_[k] = type_

    return fields_

def check_index(esconn, index_name):
    index = IndicesClient(esconn)
    try:
        if index.exists(index=index_name):
            print(index.get_settings(index=index_name))
            return True
        else:
            return False
    except Exception as ex:
        raise ES_INDEX_ERROR(ex)

def check_index_data(esconn, index_name):
    index = IndicesClient(esconn)
    try:
        idx = index.stats(index=index_name)
        doc_count = idx["_all"]["total"]["docs"]["count"]
        print("Found: " + str(doc_count) + " Documents in the " +
              index_name + " Index")
        if doc_count > 0:
            return True
        return False
    except Exception as ex:
        raise ES_INDEX_ERROR(ex)

def import_populate_db():
    es_host = app.config['ELASTICSEARCH']
    es_idx = app.config['ES_IDX']
    es = Elasticsearch(f'{es_host}:9200')
    idx = IndicesClient(es)

    if idx.exists(index=es_idx):
        existing_docs = es.count(index=es_idx).get('count')
        if existing_docs:
            return existing_docs, 0
        app.logger.info("Dropping and recreating index and mapping definition")
        idx.delete(index=es_idx)
    idx.create(index=es_idx, body=mapping)

    app.logger.info("Populating elasticsearch with documents")
    errors = []
    for pdv in pdvs:
        pdv_id = pdv.pop('id')
        document_id = ''.join(filter(str.isdigit, pdv.get('document')))
        pdv['document'] = document_id
        try:
            es.index(index=es_idx, body=pdv, id=document_id)
        except Exception as ex:
            app.logger.exception(ex)
            errors.append({'id': pdv_id, 'description': ex.args})

    inserted = len(pdvs) - len(errors)
    return errors, inserted

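# import_populate_db() above relies on module-level `mapping` and `pdvs`
# objects. A hedged sketch of plausible shapes for both; the field names
# other than 'id' and 'document' are illustrative, not from the source.
mapping = {
    "mappings": {
        "properties": {
            "document": {"type": "keyword"},
            "name": {"type": "text"},
        }
    }
}

pdvs = [
    {"id": 1, "document": "12.345.678/0001-90", "name": "Example PDV"},
]
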
def get_fields(self):
    """
    Get provider field information (names, types)

    :returns: dict of fields
    """

    fields_ = {}
    ic = IndicesClient(self.es)
    ii = ic.get(self.index_name)

    p = ii[self.index_name]['mappings'][self.type_name]['properties']['properties']  # noqa

    for k, v in p['properties'].items():
        if v['type'] == 'text':
            type_ = 'string'
        else:
            type_ = v['type']
        fields_[k] = {'type': type_}

    return fields_

def __init__(self, host='localhost', port=9200,
             index_name='biosum', cred_path='.cred'):
    self.host = host
    self.port = port
    self.index_name = index_name
    self.cred_path = cred_path
    # self.doc_type = 'papers'
    self.es = self.__connect()
    self.ic = IndicesClient(self.es)
    cache_file = constants.get_path()['cache']
    try:
        self.page_cache = shelve.open(cache_file + '/pages.p',
                                      writeback=False)
    except Exception:
        print('Not found: %s' % cache_file)
        print(sys.exc_info()[0])
        sys.exit()

def __init__(self, host="localhost", port=9200, index_name="biosum", cred_path=".cred"): self.host = host self.port = port self.index_name = index_name self.cred_path = cred_path # self.doc_type = 'papers' self.es = self.__connect() self.ic = IndicesClient(self.es) try: cache_file = constants.get_path()["cache"] self.page_cache = shelve.open(cache_file + "/pages.p", writeback=False) except: print "Not found: %s" % cache_file print sys.exc_info()[0] sys.exit()
def initialize_elastic_search() -> Tuple[Elasticsearch, IndicesClient]:
    elastic_search = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])
    indices_client = IndicesClient(client=elastic_search)
    try:
        indices_client.create(
            index=INDEX,
            body={
                "mappings": {
                    "properties": {
                        "doc": {"type": "text"},
                        "vector": {"type": "dense_vector", "dims": 768},
                    }
                }
            },
        )
    except RequestError:
        # the index (and its mapping) already exists; keep it
        pass
    return elastic_search, indices_client

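# A hedged usage sketch for the dense_vector index created above
# (Elasticsearch 7.x): rank documents by cosine similarity against a query
# vector with a script_score query. `embed()` below is a hypothetical
# stand-in for whatever model produces the 768-dim embeddings, e.g.:
#   elastic_search.index(index=INDEX, body={"doc": text, "vector": embed(text)})
from typing import List


def search_similar(elastic_search: Elasticsearch,
                   query_vector: List[float]) -> dict:
    return elastic_search.search(
        index=INDEX,
        body={
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        # +1.0 keeps scores non-negative, as ES requires
                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                        "params": {"query_vector": query_vector},
                    },
                }
            }
        },
    )
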
def create_index(esconn, index_name, data_file, shard_count):
    index = IndicesClient(esconn)
    try:
        with open(data_file) as index_json:
            json_body = json.loads(index_json.read())

        # Work out number of shards == no. of data nodes x 2
        print("Setting Index Shard Count to: " + str(shard_count))
        json_body["settings"]["index"]["number_of_shards"] = shard_count

        # For single node clusters (shard_count will be 2) - no replicas possible
        if shard_count == 2:
            print("Single node cluster detected - disabling replicas")
            json_body["settings"]["index"]["number_of_replicas"] = 0

        # Create index and apply any settings & mappings
        idx = index.create(index=index_name, body=json_body)
        if not idx['acknowledged']:
            raise ES_INDEX_ERROR('Failed to create Index. Response: ', idx)
        print("SUCCESS: Created Index: " + index_name)
    except Exception as ex:
        raise ES_PIPELINE_ERROR(ex)

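# create_index() above loads settings and mappings from a JSON file and
# rewrites number_of_shards before creating the index. A hedged example of
# a minimal data_file body it could consume; the mapping fields are
# illustrative, not from the source.
EXAMPLE_INDEX_BODY = """
{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 1
    }
  },
  "mappings": {
    "properties": {
      "timestamp": {"type": "date"},
      "message": {"type": "text"}
    }
  }
}
"""
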
class ElasticIndiceDriver:

    def __init__(self, client: Elasticsearch):
        self.client = IndicesClient(client)

    def create_index(self, index: str, mapping: dict):
        self.client.create(index, json.dumps(mapping))

    def clean_index(self, index: str):
        self.client.delete(index)
        self.client.delete(f'{index}-finished')

def create(cls, user, **kwargs):
    """Create user index."""
    # Create index for user
    client = Elasticsearch(cls.__url__)
    indice = IndicesClient(client)
    if indice.exists(index=user.user_id):
        if 'delete_existing' in kwargs and kwargs['delete_existing']:
            log.warning('Deleting existing index for user %s' % user.user_id)
            indice.delete(index=user.user_id)
        else:
            log.warning('Index already exists for user %s' % user.user_id)
            return False
    log.info('Creating index for user %s' % user.user_id)
    indice.create(index=user.user_id)
    return True

def recreate_index_model(self, model: Union[type[Gallery], type[Archive]]):
    from elasticsearch.client.indices import IndicesClient
    indices_client = IndicesClient(client=self.es_client)

    index_name = model._meta.es_index_name  # type: ignore

    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)

    # analysis settings can only be changed on a closed index
    indices_client.close(index=index_name)
    indices_client.put_settings(
        index=index_name,
        body={
            "index": {"max_result_window": settings.MAX_RESULT_WINDOW},
            "analysis": {
                "filter": {
                    "edge_ngram_filter": {
                        "type": "edge_ngram",
                        "min_gram": 2,
                        "max_gram": 20
                    }
                },
                "analyzer": {
                    "edge_ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "edge_ngram_filter"]
                    }
                }
            }
        })
    indices_client.put_mapping(
        body=model._meta.es_mapping,  # type: ignore
        index=index_name,
    )
    indices_client.open(index=index_name)

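# A quick check of what the edge_ngram analyzer configured above produces,
# using the standard _analyze API. A sketch, assuming the index from
# recreate_index_model() already exists.
def show_edge_ngrams(indices_client: IndicesClient, index_name: str) -> list:
    result = indices_client.analyze(
        index=index_name,
        body={"analyzer": "edge_ngram_analyzer", "text": "Gallery"},
    )
    # with min_gram=2 / max_gram=20 and the lowercase filter this returns:
    # ['ga', 'gal', 'gall', 'galle', 'galler', 'gallery']
    return [t["token"] for t in result["tokens"]]
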
class ElasticSearchClient:
    """
    Class used as a client to the Elasticsearch server.
    """

    def __init__(self, host, port, username, password, indexname):
        """
        Initializes this Elasticsearch Client.

        :param host: the HTTP address of the Elasticsearch server.
        :param port: the HTTP port of the Elasticsearch server.
        :param username: the username for connecting to the index.
        :param password: the password for connecting to the index.
        :param indexname: the name of the Elasticsearch index.
        """
        self.indexname = indexname
        self.client = Elasticsearch(connection_class=SafeRequestsHttpConnection,
                                    host=host, port=int(port),
                                    http_auth=[username, password])
        self.snapshotclient = SnapshotClient(self.client)
        self.indicesclient = IndicesClient(self.client)

    def delete_index_and_mappings(self):
        """
        Deletes the index and all its mappings.
        """
        try:
            self.client.indices.delete(index=self.indexname)
        except NotFoundError:
            pass

    def create_index_and_mappings(self, update_mappings=False):
        """
        Creates or updates the index and its mappings.

        :param update_mappings: boolean denoting whether the mappings should
            be created (False) or updated (True).
        """
        if not self.client.indices.exists(self.indexname):
            self.client.indices.create(index=self.indexname,
                                       body=load_file_to_json("properties/indexsettings.json"))
        mappings = {}
        if self.indexname in self.client.indices.get_mapping(self.indexname):
            mappings = self.client.indices.get_mapping(self.indexname)[self.indexname]['mappings']
        if update_mappings:
            self.client.indices.close(self.indexname)
        if 'files' not in mappings or update_mappings:
            self.client.indices.put_mapping(index=self.indexname, doc_type='files',
                                            body=load_file_to_json("properties/filesproperties.json"))
        if 'projects' not in mappings or update_mappings:
            self.client.indices.put_mapping(index=self.indexname, doc_type='projects',
                                            body=load_file_to_json("properties/projectsproperties.json"))
        if update_mappings:
            self.client.indices.open(self.indexname)

    def has_project(self, project_id):
        """
        Checks if the index contains a project.

        :param project_id: the id of the project to check if it is contained in the index.
        :returns: True if the index contains the project, or False otherwise.
        """
        return self.client.exists(index=self.indexname, doc_type='projects', id=project_id)

    def has_file(self, file_id):
        """
        Checks if the index contains a file.

        :param file_id: the id of the file to check if it is contained in the index.
        :returns: True if the index contains the file, or False otherwise.
        """
        return self.client.exists(index=self.indexname, doc_type='files', id=file_id)

    def create_project(self, project):
        """
        Creates a project in the index.

        :param project: the data of the project in JSON format.
        """
        self.client.create(index=self.indexname, doc_type='projects',
                           id=project['fullname'], body=project)

    def create_file(self, afile):
        """
        Creates a file in the index.

        :param afile: the data of the file in JSON format.
        """
        self.client.create(index=self.indexname, doc_type='files',
                           id=afile['fullpathname'], parent=afile['project'],
                           body=afile)

    def update_file(self, afile):
        """
        Updates a file in the index.

        :param afile: the data of the file in JSON format.
        """
        self.client.update(index=self.indexname, doc_type='files',
                           id=afile['fullpathname'], parent=afile['project'],
                           body={'doc': afile})

    def delete_file(self, afile_id):
        """
        Deletes a file from the index.

        :param afile_id: the id of the file to be deleted.
        """
        self.client.delete(index=self.indexname, doc_type='files', id=afile_id,
                           routing='/'.join(afile_id.split('/')[0:2]))

    def delete_project(self, project_id):
        """
        Deletes a project from the index. Note that this function also
        deletes all the files of the project.

        :param project_id: the id of the project to be deleted.
        """
        self.client.delete_by_query(index=self.indexname, doc_type='files',
                                    body={"query": {
                                        "bool": {
                                            "must": {"match_all": {}},
                                            "filter": {"term": {"_routing": project_id}}
                                        }
                                    }})
        self.client.delete(index=self.indexname, doc_type='projects', id=project_id)

    def get_project_fileids_and_shas(self, project_id):
        """
        Returns all the files and their corresponding shas for a project.

        :param project_id: the id of the project of which the files and the shas are returned.
        :returns: a dict containing the files of the project as keys and their shas as values.
        """
        sourcefiles = self.client.search(index=self.indexname, doc_type='files',
                                         body={"query": {"term": {"_routing": project_id}}},
                                         routing=project_id,
                                         size=100000000)['hits']['hits']
        # Limitation! Each project must have no more than 100000000 files
        fileidsandshas = {}
        for afile in sourcefiles:
            fileidsandshas[afile['_id']] = afile['_source']['sha']
        return fileidsandshas

    def execute_query(self, query, doc_type='files'):
        """
        Executes a query on the index.

        :param query: the body of the query.
        :param doc_type: the document type to which the query is executed,
            either 'projects' or 'files'.
        :returns: the response of the query.
        """
        return self.client.search(index=self.indexname, doc_type=doc_type, body=query)

    def test_analyzer(self, analyzer, text):
        """
        Tests an analyzer of the index.

        :param analyzer: the analyzer to be tested.
        :param text: the text to be analyzed as a test.
        :returns: the analyzed text.
        """
        result = self.indicesclient.analyze(index=self.indexname,
                                            analyzer=analyzer, body=text)
        return [r['token'] for r in result['tokens']]

    def backup(self, backupdir):
        """
        Backs up the index.

        :param backupdir: the directory used to backup the index.
        """
        repositoryname = os.path.basename("backup" + self.indexname)
        try:
            self.snapshotclient.get_repository(repository=repositoryname)
        except NotFoundError:
            self.snapshotclient.create_repository(repository=repositoryname,
                                                  body={"type": "fs",
                                                        "settings": {"location": backupdir + os.sep + self.indexname}})
        try:
            self.snapshotclient.get(repository=repositoryname,
                                    snapshot=self.indexname + "snapshot")
        except NotFoundError:
            self.snapshotclient.create(repository=repositoryname,
                                       snapshot=self.indexname + "snapshot",
                                       body={"indices": self.indexname},
                                       wait_for_completion=True)

    def delete_backup(self):
        """
        Removes any backups of the index. If there are no backups, this
        function does nothing.
        """
        repositoryname = os.path.basename("backup" + self.indexname)
        try:
            self.snapshotclient.delete(repository=repositoryname,
                                       snapshot=self.indexname + "snapshot")
        except NotFoundError:
            pass

    def restore_backup(self):
        """
        Restores a backup of the index.
        """
        repositoryname = os.path.basename("backup" + self.indexname)
        if not self.client.indices.exists(self.indexname):
            self.client.indices.create(index=self.indexname,
                                       body=load_file_to_json("properties/indexsettings.json"))
        self.client.indices.close(self.indexname)
        self.snapshotclient.restore(repository=repositoryname,
                                    snapshot=self.indexname + "snapshot",
                                    body={"indices": self.indexname},
                                    wait_for_completion=True)
        self.client.indices.open(self.indexname)

    def flush(self):
        """
        Flushes the index.
        """
        self.indicesclient.flush(index=self.indexname)

class ESInterface():

    """Interface for ElasticSearch"""

    _count_total = -1  # N: Number of docs
    _idf = None  # dict for storing idf values

    def __init__(self, host='localhost', port=9200,
                 index_name='pubmed', cred_path='.cred'):
        self.host = host
        self.port = port
        self.index_name = index_name
        self.cred_path = cred_path
        # self.doc_type = 'papers'
        self.es = self.__connect()
        self.ic = IndicesClient(self.es)
        self.page_cache = shelve.open(
            "/Users/rmn/git/BioSum/biosum-supervised/cache/pages.p",
            writeback=False)

    def login(self, username, password):
        pass

    @property
    def description(self):
        # get mapping, clean it up
        m = self.es.indices.get_mapping(self.index_name)
        m = m[self.index_name]['mappings']
        description = {'host': self.host,
                       'port': self.port,
                       'index_name': self.index_name,
                       'mapping': m}
        return description

    @property
    def size(self):
        stats = self.es.indices.stats()['indices'][self.index_name]
        return stats['total']['docs']['count']

    def __connect(self):
        '''Private method used to connect to the ElasticSearch instance.'''
        es = ES(hosts=[{'host': self.host, 'port': self.port}])

        # checks if server exists
        if not es.ping():
            err = ('It appears that nothing is running at http://%s:%s' %
                   (self.host, self.port))
            raise OSError(err)

        # load the credentials file (if possible)
        # with file(self.cred_path) as cf:
        #     username, password = [l.strip() for l in cf.readlines()][:2]
        # data = json.dumps({'username': username, 'password': password})

        url = 'http://%s:%s/login' % (self.host, self.port)
        resp = json.loads(requests.post(url).text)
        # if resp['status'] == 200:
        #     self.auth_token = resp['token']
        # else:
        #     self.auth_token = ''

        # checks if index exists
        try:
            es.indices.get_mapping(self.index_name)
        except TransportError as e:
            if e.args[0] == 403:
                err = list(e.args)
                err[1] = ('Credentials not valid for %s:%s/%s' %
                          (self.host, self.port, self.index_name))
                e.args = tuple(err)
            elif e.args[0] == 404:
                self.__del__()
                err = list(e.args)
                err[1] = ('No index named "%s" is available at %s:%s' %
                          (self.index_name, self.host, self.port))
                e.args = tuple(err)
            raise
        return es

    def __del__(self):
        requests.post('http://%s:%s/logout' % (self.host, self.port))

    # def get_scroll(self, scroll_size, scroll_timeout):
    #     q_body = {"query": {"match_all": {}}}
    #     return self.es.search(self.index_name, self.doc_type, q_body,
    #                           search_type='scan', scroll='100m',
    #                           size='10000')

    # def scroll(self, scroll_id):
    #     return self.es.scroll(scroll_id, scroll='10m')

    # def scan_and_scroll(self, doc_type, scroll_size=50, scroll_timeout=10):
    #     """
    #     The scan search type allows to efficiently scroll a large result
    #     set. The response will include no hits, with two important results:
    #     total_hits will include the total hits that match the query, and
    #     the scroll_id allows to start the scroll process.
    #     @param scroll_size: scroll size
    #     @param scroll_timeout: round-trip timeout
    #     """
    #     q_body = {"query": {"match_all": {}}}
    #     result = self.es.search(self.index_name, doc_type, q_body,
    #                             search_type='scan',
    #                             scroll=str(scroll_timeout) + 'm',
    #                             size=scroll_size)
    #     res = self.es.scroll(result['_scroll_id'],
    #                          scroll=str(scroll_timeout) + 'm')
    #     finalres = []
    #     while len(res['hits']['hits']) > 0:
    #         finalres.append(res)
    #         res = self.es.scroll(res['_scroll_id'],
    #                              scroll=str(scroll_timeout) + 'm')
    #     return finalres

    def esc(self, txt):
        # used by simple_search() when escape=True
        for e in TO_ESCAPE:
            txt = txt.replace(e, '\\%s' % e)
        return txt

    def find_all(self, source_fields=None, doc_type=''):
        if source_fields:
            q_body = {"fields": source_fields,
                      "query": {"match_all": {}}}
        else:
            q_body = {"query": {"match_all": {}}}
        return self.es.search(body=q_body,
                              size=1000000,
                              index=self.index_name,
                              doc_type=doc_type)['hits']['hits']

    def multi_field_search(self, field_vals,
                           fields=['sentence', 'mm-concepts', 'noun_phrases'],
                           maxsize=1000, field_boost=[1, 3, 2],
                           offset=0, source_fields=[], doc_type='',
                           params=None):
        '''Interface for simple query tasks.

        Parameters:
        - field_vals [required]: a list of field values to query
        - maxsize [optional]: number of results to get. default is 1000.

        Returns results.'''
        # q_body = {
        #     "fields": source_fields,
        #     "query": {
        #         "dis_max": {
        #             "queries": [
        #                 {"match": {"sentence": {"query": sentence,
        #                                         "boost": field_boost[0]}}},
        #                 {"match": {"mm-concepts": {"query": concepts,
        #                                            "boost": field_boost[1]}}},
        #                 {"match": {"noun_phrases": {"query": noun_phrases,
        #                                             "boost": field_boost[2]}}}
        #             ]
        #         }
        #     }
        # }
        q_body = {
            "fields": source_fields,
            "query": {
                "dis_max": {
                    "queries": []
                }
            }
        }
        for idx in range(len(field_vals)):
            q_body['query']['dis_max']['queries'].append(
                {"match": {fields[idx]: {"query": field_vals[idx],
                                         "boost": field_boost[idx]}}})
        if params is not None:
            for key in params:
                q_body['query']['dis_max'][key] = params[key]
        return self._cursor_search(q_body, maxsize, offset, doc_type)

    def simple_search(self, query, field='_all', maxsize=1000, offset=0,
                      source_fields=[], doc_type='', operator='or',
                      phrase_slop=0, escape=False, params=None):
        '''Interface for simple query tasks.

        Parameters:
        - query [required]: the string to query
        - maxsize [optional]: number of results to get. default is 1000.

        Returns results.'''
        if escape:
            query = self.esc(query)
        q_body = {
            "fields": source_fields,
            'query': {
                'query_string': {
                    'query': query,
                    'default_operator': operator,
                    'use_dis_max': True,
                    'auto_generate_phrase_queries': True,
                    'phrase_slop': phrase_slop
                }
            }
        }
        if params is not None:
            for key in params:
                q_body['query']['query_string'][key] = params[key]
        if field:
            q_body['query']['query_string']['default_field'] = field
        return self._cursor_search(q_body, maxsize, offset, doc_type)

    def count(self, query, field='_all', operator="AND"):
        q = {
            'query': {
                "query_string": {
                    "default_field": field,
                    "default_operator": operator,
                    "query": query
                }
            }
        }
        resp = self.es.count(body=q, index=self.index_name)
        if resp['_shards']['failed'] > 0:
            raise RuntimeError("ES count failed: %s" % resp)
        return resp['count']

    def _cursor_search(self, q, maxsize, offset, doc_type):
        return self.es.search(index=self.index_name, body=q, size=maxsize,
                              from_=offset, doc_type=doc_type)['hits']['hits']

    def update_field(self, docid, doc_type, field_name, field_value):
        '''Update field field_name with field_value'''
        body = {'doc': {field_name: field_value}}
        self.es.update(id=docid, doc_type=doc_type,
                       index=self.index_name, body=body)

    def get_page_by_res(self, res_dict, cache=False):
        return self.get_page(res_dict['_id'], res_dict['_type'], cache=cache)

    def get_page(self, docid, doc_type, cache=False):
        '''Retrieve a page's source from the index.

        Parameters:
        - docid [required]: the ES id of the page to retrieve
        - doc_type [required]: the ES document type to retrieve
        '''
        k = str("-".join((docid, self.index_name, doc_type)))
        if not cache or k not in self.page_cache:
            page = self.es.get_source(id=docid, index=self.index_name,
                                      doc_type=doc_type)
            if cache:
                self.page_cache[k] = page
                self.page_cache.sync()
        else:
            page = self.page_cache[k]
        return page

    def get_index_analyzer(self):
        settings = self.ic.get_settings(index=self.index_name)
        analyzers = settings[self.index_name]['settings']['index']['analysis']['analyzer']
        return list(analyzers.keys())[0]

    def tokenize(self, text, field="text", analyzer=None):
        '''Return a list of tokenized tokens.

        Parameters:
        - text [required]: the text to tokenize
        - field [optional]: the field whose ES analyzer should be used
          (default: text)
        '''
        params = {}
        if analyzer is not None:
            params['analyzer'] = analyzer
        try:
            response = self.ic.analyze(body=text, field=field,
                                       index=self.index_name, params=params)
            return [d['token'] for d in response['tokens']]
        except RequestError:
            return []

    def phrase_search(self, phrase, doc_type='', field='_all', slop=0,
                      in_order=True, maxsize=1000, offset=0,
                      source_fields=[]):
        '''Retrieve documents containing a phrase.

        Does not return the documents' source.
        '''
        phraseterms = self.tokenize(phrase, field=field)
        if len(phraseterms) == 0:
            return []
        q = {
            "fields": source_fields,
            "query": {
                "span_near": {
                    "clauses": [{"span_term": {field: term}}
                                for term in phraseterms],
                    "slop": slop,  # max number of intervening unmatched pos.
                    "in_order": in_order,
                    "collect_payloads": False
                }
            }
        }
        return self._cursor_search(q, maxsize, offset, doc_type)

    def phrase_count(self, phrase, field='_all', slop=0, in_order=True):
        phraseterms = self.tokenize(phrase, field=field)
        if len(phraseterms) == 0:
            return []
        q = {
            "query": {
                "span_near": {
                    "clauses": [{"span_term": {field: term}}
                                for term in phraseterms],
                    "slop": slop,  # max number of intervening unmatched pos.
                    "in_order": in_order,
                    "collect_payloads": False
                }
            }
        }
        resp = self.es.count(body=q, index=self.index_name)
        if resp['_shards']['failed'] > 0:
            raise RuntimeError("ES count failed: %s" % resp)
        return resp['count']

    def index_hash(self):
        '''Weak hash (only considers mapping and size) of index_name.'''
        ic_sts = self.ic.stats(index=self.index_name)['_all']['total']['store']
        ic_map = self.ic.get_mapping(index=self.index_name)
        s = "_".join((str(ic_sts), str(ic_map)))
        return hashlib.md5(s.encode('utf-8')).hexdigest()

    # def get_mappings(self):
    #     mappings = self.es.indices.get_mapping(self.index_name)
    #     return mappings[self.index_name]['mappings']

    def set_mappings(self, mapdict):
        '''Set mapping for documents in index according to mapdict; only
        document types with an entry in mapdict are updated.

        No input check; PLEASE FOLLOW SPECIFICATIONS!
        format: {<doc_type_1>: {'properties': {'doc_field_1': {<properties>},
                                               ...
                                               'doc_field_n': {<properties>}}}}
        '''
        for doc_type, mapping in mapdict.items():
            self.es.indices.put_mapping(index=self.index_name,
                                        doc_type=doc_type,
                                        body=mapping)

    # def get_ids(self, doc_type):
    #     res = self.scan_and_scroll(doc_type, scroll_size=5000)
    #     return res

    # def get_types(self):
    #     from subprocess import check_output
    #     request = 'http://localhost:9200/indexname/_mapping?pretty=1'
    #     request = request.replace('indexname', self.index_name)
    #     res = json.loads(check_output(["curl", "-XGET", request]))
    #     return res[self.index_name]['mappings'].keys()

    def get_termvector(self, doc_type, docid, fields=None):
        """
        Return the term vector and statistics for document docid of type
        doc_type. If fields is not provided, term vectors are returned for
        each field.
        """
        if fields is None:
            fields = []
        body = {
            "fields": fields,
            "offsets": True,
            "payloads": True,
            "positions": True,
            "term_statistics": True,
            "field_statistics": True
        }
        resp = self.es.termvector(index=self.index_name, doc_type=doc_type,
                                  id=docid, body=body)
        return resp

    def add(self, index, doc_type, entry, docid=None):
        self.es.index(index=index, doc_type=doc_type, body=entry, id=docid)

    def get_avg_size(self, field):
        '''Get the average document length for the given field.'''
        q = {"fields": [field],
             "query": {"match_all": {}},
             "aggs": {"my_agg": {"avg": {"script": "doc['%s'].size()" % field}}}}
        res = self.es.search(index=self.index_name, body=q)
        return res['aggregations']['my_agg']['value']

    def get_idf(self, term):
        '''Returns the idf of a given term on the index.

        Args:
            term(str)

        Returns:
            float -- idf value
        '''
        if self._count_total == -1:
            self._count_total = self.count(query='*:*')
        if self._idf is None:
            self._idf = {}
        if term in self._idf:
            return self._idf[term]
        count = self.count(term)
        if count == 0:
            idf = 0
        else:
            idf = math.log((self._count_total - count + 0.5) / (count + 0.5))
        self._idf[term] = idf
        return idf

    def scan_and_scroll(self, doc_type, scroll_size=500, scroll_timeout=10):
        """
        The scan search type allows to efficiently scroll a large result set.
        The response will include no hits, with two important results:
        total_hits will include the total hits that match the query, and the
        scroll_id allows to start the scroll process.

        Returns a list of results.
        @param scroll_size: scroll size
        @param scroll_timeout: round-trip timeout
        """
        q_body = {"query": {"match_all": {}}}
        result = self.es.search(self.index_name, doc_type, q_body,
                                search_type='scan',
                                scroll=str(scroll_timeout) + 'm',
                                size=scroll_size)
        res = self.es.scroll(result['_scroll_id'],
                             scroll=str(scroll_timeout) + 'm')
        finalres = []
        while len(res['hits']['hits']) > 0:
            print(len(res['hits']['hits']))
            finalres += res['hits']['hits']
            res = self.es.scroll(res['_scroll_id'],
                                 scroll=str(scroll_timeout) + 'm')
        return finalres

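# A hedged usage sketch for ESInterface above, assuming a pre-5.x
# Elasticsearch server (the class relies on legacy APIs such as
# search_type='scan' and termvector) with a 'pubmed' index on localhost.
if __name__ == '__main__':
    es_int = ESInterface(host='localhost', port=9200, index_name='pubmed')
    for hit in es_int.simple_search('cancer', field='sentence', maxsize=10):
        print(hit['_id'], hit['_score'])
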
def get_indices(self, alias_name):
    indices_client = IndicesClient(client=self.client)
    try:
        return list(indices_client.get_alias(name=alias_name).keys())
    except NotFoundError:
        return []

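# get_indices() above resolves an alias to the concrete indices behind it.
# A minimal sketch of pointing an alias at an index first; the index and
# alias names are illustrative.
def add_alias(client: Elasticsearch, index_name: str, alias_name: str) -> None:
    IndicesClient(client=client).update_aliases(body={
        "actions": [
            {"add": {"index": index_name, "alias": alias_name}},
        ]
    })
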
def main():
    c_parser = configparser.ConfigParser()
    c_parser.read("config.ini")
    es_config = c_parser["ELASTIC"]
    gtfs_config = c_parser["GTFS"]

    gtfs_path = gtfs_config["gtfs_path"]
    index_prefix = es_config["index_prefix"]
    stops_index = index_prefix + "_stops"
    shapes_index = index_prefix + "_shapes"
    stop_times_index = index_prefix + "_stop_times"

    es = Elasticsearch(
        host=es_config["host"],
        scheme=es_config["scheme"],
        port=es_config.getint("port"),
        http_auth=(es_config["username"], es_config["password"]),
        use_ssl=es_config.getboolean("use_ssl"),
        verify_certs=es_config.getboolean("verify_certs"),
        ca_certs=certifi.where())

    with open("mappings/shapes.json", 'r') as shapes_mapping_file:
        shapes_mapping = shapes_mapping_file.read()
    with open("mappings/stops.json", 'r') as stops_mapping_file:
        stops_mapping = stops_mapping_file.read()
    with open("mappings/stop_times.json", 'r') as stop_times_file:
        stop_times_mapping = stop_times_file.read()

    indices = IndicesClient(es)
    indices.create(stops_index, body=stops_mapping)
    indices.create(shapes_index, body=shapes_mapping)
    indices.create(stop_times_index, body=stop_times_mapping)

    all_stops = gather_stops(gtfs_path)
    for ok, item in parallel_bulk(es, genbulkactions(stops_index, all_stops.values()),
                                  chunk_size=500):
        if not ok:
            print(item)
    print("Done with stops")

    all_shapes = gather_shapes(gtfs_path)
    all_trips = gather_trips(gtfs_path)
    all_routes = gather_routes(gtfs_path)

    shapes_to_route = shape_to_route_dict(all_trips.values(), all_routes)
    for shape_id in shapes_to_route.keys():
        all_shapes[shape_id]['route'] = shapes_to_route[shape_id]
        all_shapes[shape_id].pop('start_seq', None)
        all_shapes[shape_id].pop('finish_seq', None)

    for ok, item in parallel_bulk(es, genbulkactions(shapes_index, all_shapes.values()),
                                  chunk_size=500):
        if not ok:
            print(item)
    print("Done with shapes")

    for trip in all_trips.values():
        route_id = trip.pop("route_id", None)
        if route_id:
            trip['route'] = all_routes[int(route_id)]

    all_stop_times = gather_stop_times(gtfs_path)
    for stop_time in all_stop_times:
        trip_id = stop_time.pop("trip_id", None)
        stop_id = stop_time.pop("stop_id", None)
        if trip_id:
            stop_time['trip'] = all_trips[int(trip_id)]
        if stop_id:
            stop_time['stop'] = all_stops[int(stop_id)]

    for ok, item in parallel_bulk(es, genbulkactions(stop_times_index, all_stop_times),
                                  chunk_size=1000):
        if not ok:
            print(item)
    print("Done with stop times")

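# main() above expects a config.ini with [ELASTIC] and [GTFS] sections.
# A hedged sample matching the keys the code reads (values illustrative):
#
#   [ELASTIC]
#   host = localhost
#   scheme = https
#   port = 9200
#   username = elastic
#   password = changeme
#   use_ssl = true
#   verify_certs = true
#   index_prefix = gtfs
#
#   [GTFS]
#   gtfs_path = ./gtfs
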
async def shutdown_elastic_search(elastic_search: Elasticsearch,
                                  indices_client: IndicesClient) -> None:
    indices_client.delete(index=INDEX)
    elastic_search.close()

def drop_index(self, using=None):
    from elasticsearch.client.indices import IndicesClient
    connection = get_connection_for_doctype(self._meta.document, using=using)
    return IndicesClient(connection).delete(self._meta.index)