Example #1
    def get_fields(self):
        """
        Get provider field information (names, types)

        :returns: dict of fields
        """

        fields_ = {}
        ic = IndicesClient(self.es)
        ii = ic.get(self.index_name)

        try:
            if '*' not in self.index_name:
                p = ii[self.index_name]['mappings']['properties']['properties']
            else:
                LOGGER.debug('Wildcard index; setting from first match')
                index_name_ = list(ii.keys())[0]
                p = ii[index_name_]['mappings']['properties']['properties']
        except KeyError:
            LOGGER.debug('ES index looks generated by GDAL')
            self.is_gdal = True
            p = ii[self.index_name]['mappings']

        for k, v in p['properties'].items():
            if 'type' in v:
                if v['type'] == 'text':
                    fields_[k] = {'type': 'string'}
                elif v['type'] == 'date':
                    fields_[k] = {'type': 'string', 'format': 'date'}
                else:
                    fields_[k] = {'type': v['type']}

        return fields_
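For reference, a minimal sketch of what this returns (the provider instance and field names here are hypothetical, not from the original source): text fields map to string, date fields additionally carry a format hint, and other types pass through unchanged.

fields = provider.get_fields()  # 'provider' is a hypothetical instance of this class
# e.g. {'name': {'type': 'string'},
#       'datetime': {'type': 'string', 'format': 'date'},
#       'value': {'type': 'float'}}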
Example #2
    def get_fields(self):
        """
        Get provider field information (names, types)

        :returns: dict of fields
        """

        fields_ = {}
        ic = IndicesClient(self.es)
        ii = ic.get(self.index_name)

        try:
            p = ii[self.index_name]['mappings']['properties'][
                'properties']  # noqa
        except KeyError:
            LOGGER.debug('ES index looks generated by GDAL')
            self.is_gdal = True
            p = ii[self.index_name]['mappings']

        for k, v in p['properties'].items():
            if 'type' in v:
                if v['type'] == 'text':
                    type_ = 'string'
                else:
                    type_ = v['type']
                fields_[k] = type_

        return fields_
Example #3
 def __init__(self, host='localhost', port=9200,
              index_name='pubmed', cred_path='.cred'):
     self.host = host
     self.port = port
     self.index_name = index_name
     self.cred_path = cred_path
     # self.doc_type = 'papers'
     self.es = self.__connect()
     self.ic = IndicesClient(self.es)
     self.page_cache = shelve.open("/Users/rmn/git/BioSum/biosum-supervised/cache/pages.p", writeback=False)
Example #4
def check_index(esconn, index_name):
    index = IndicesClient(esconn)
    try:
        if index.exists(index=index_name):
            print(index.get_settings(index=index_name))
            return True
        else:
            return False
    except Exception as ex:
        raise ES_INDEX_ERROR(ex)
Example #5
def check_index_data(esconn, index_name):
    index = IndicesClient(esconn)
    try:
        idx = index.stats(
            index = index_name
        )
        print("Found: " + str(idx["_all"]["total"]["docs"]["count"]) + " Documents in the " + index_name + " Index")
        if idx["_all"]["total"]["docs"]["count"] > 0:
            return True
        return False
        
    except Exception as ex:
        raise ES_INDEX_ERROR(ex)
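A minimal driver for the two helpers above might look like this (a sketch; the host URL and index name are placeholders, assuming a reachable cluster):

from elasticsearch import Elasticsearch

esconn = Elasticsearch(['http://localhost:9200'])
if check_index(esconn, 'my-index') and check_index_data(esconn, 'my-index'):
    print('Index exists and contains documents')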
Example #6
	def __init__(self, host, port, username, password, indexname):
		"""
		Initializes this Elasticsearch Client.
		
		:param host: the HTTP address of the Elasticsearch server.
		:param port: the HTTP port of the Elasticsearch server.
		:param username: the username for connecting to the index.
		:param password: the password for connecting to the index.
		:param indexname: the name of the Elasticsearch index.
		"""
		self.indexname = indexname
		self.client = Elasticsearch(connection_class = SafeRequestsHttpConnection, host = host, port = int(port), http_auth = [username, password])
		self.snapshotclient = SnapshotClient(self.client)
		self.indicesclient = IndicesClient(self.client)
Example #7
def import_populate_db():
    es_host = app.config['ELASTICSEARCH']
    es_idx = app.config['ES_IDX']

    es = Elasticsearch(f'{es_host}:9200')
    idx = IndicesClient(es)
    if idx.exists(index=es_idx):
        existing_docs = es.count(index=es_idx).get('count')
        if existing_docs:
            return existing_docs, 0

        app.logger.info("Drop and creating index and mapping definition")
        idx.delete(es_idx)
    else:
        idx.create(index=es_idx, body=mapping)

    app.logger.info("Populating elasticsearch with documents")
    errors = []
    for pdv in pdvs:
        pdv_id = pdv.pop('id', None)  # keep the original id for error reporting
        document_id = ''.join(filter(str.isdigit, pdv.get('document')))
        pdv['document'] = document_id
        try:
            es.index(index=es_idx, body=pdv, id=document_id)
        except Exception as ex:
            app.logger.exception(ex)
            errors.append({'id': pdv_id, 'description': ex.args})
    inserted = len(pdvs) - len(errors)
    return errors, inserted
Example #8
    def get_fields(self):
        """
        Get provider field information (names, types)

        :returns: dict of fields
        """

        fields_ = {}
        ic = IndicesClient(self.es)
        ii = ic.get(self.index_name)
        p = ii[self.index_name]['mappings'][self.type_name]['properties']['properties']  # noqa

        for k, v in p['properties'].items():
            if v['type'] == 'text':
                type_ = 'string'
            else:
                type_ = v['type']
            fields_[k] = {'type': type_}

        return fields_
Example #9
 def __init__(self,
              host='localhost',
              port=9200,
              index_name='biosum',
              cred_path='.cred'):
     self.host = host
     self.port = port
     self.index_name = index_name
     self.cred_path = cred_path
     # self.doc_type = 'papers'
     self.es = self.__connect()
     self.ic = IndicesClient(self.es)
     try:
         cache_file = constants.get_path()['cache']
         self.page_cache = shelve.open(cache_file + '/pages.p',
                                       writeback=False)
     except Exception:
         print('Not found: %s' % cache_file)
         print(sys.exc_info()[0])
         sys.exit()
Example #10
 def __init__(self, host="localhost", port=9200, index_name="biosum", cred_path=".cred"):
     self.host = host
     self.port = port
     self.index_name = index_name
     self.cred_path = cred_path
     # self.doc_type = 'papers'
     self.es = self.__connect()
     self.ic = IndicesClient(self.es)
     try:
         cache_file = constants.get_path()["cache"]
         self.page_cache = shelve.open(cache_file + "/pages.p", writeback=False)
     except Exception:
         print("Not found: %s" % cache_file)
         print(sys.exc_info()[0])
         sys.exit()
Example #11
def initialize_elastic_search() -> Tuple[Elasticsearch, IndicesClient]:
    elastic_search = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])
    indices_client = IndicesClient(client=elastic_search)
    try:
        indices_client.create(
            index=INDEX,
            body={
                "mappings": {
                    "properties": {
                        "doc": {
                            "type": "text"
                        },
                        "vector": {
                            "type": "dense_vector",
                            "dims": 768
                        },
                    }
                }
            },
        )
    except RequestError:
        pass

    return elastic_search, indices_client
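With the index in place, a document can be indexed alongside its 768-dimensional vector and retrieved by cosine similarity via a script_score query (a sketch assuming Elasticsearch 7.x, where dense_vector and cosineSimilarity are available; the vector values are illustrative):

elastic_search, _ = initialize_elastic_search()
elastic_search.index(index=INDEX, body={"doc": "example text", "vector": [0.1] * 768})
elastic_search.indices.refresh(index=INDEX)  # make the document searchable immediately
response = elastic_search.search(index=INDEX, body={
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {"query_vector": [0.1] * 768},
            },
        }
    }
})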
Example #12
def create_index(esconn, index_name, data_file, shard_count):
    index = IndicesClient(esconn)
    try:
        with open(data_file) as index_json:
            json_body = json.loads(index_json.read())
        # Work out number of shards == no. of data nodes x 2
        print("Setting Index Shard Count to: " + str(shard_count))
        # Update json doc
        json_body["settings"]["index"]["number_of_shards"] = shard_count
        # For single node clusters (shard_count will be 2) - no replicas possible
        if shard_count == 2:
            print("Single node cluster detected - disabling replicas")
            json_body["settings"]["index"]["number_of_replicas"] = 0
        # Create Index and apply any settings & mappings
        idx = index.create(index=index_name, body=json_body)
        if not idx['acknowledged']:
            raise ES_INDEX_ERROR('Failed to create Index. Response: ', idx)
        print("SUCCESS: Created Index: " + index_name)
    except Exception as ex:
        raise ES_PIPELINE_ERROR(ex)
Example #13
class ElasticIndiceDriver:
    def __init__(self, client: Elasticsearch):
        self.client = IndicesClient(client)

    def create_index(self, index: str, mapping: dict):
        self.client.create(index=index, body=mapping)

    def clean_index(self, index: str):
        self.client.delete(index)
        self.client.delete(f'{index}-finished')
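Usage is a straightforward create/delete pair (a sketch; the index name and mapping are illustrative, and note that clean_index also removes the companion '<index>-finished' index):

from elasticsearch import Elasticsearch

driver = ElasticIndiceDriver(Elasticsearch(['http://localhost:9200']))
driver.create_index('jobs', {'mappings': {'properties': {'title': {'type': 'text'}}}})
# ... index and search documents ...
driver.clean_index('jobs')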
Example #14
 def create(cls, user, **kwargs):
     """Create user index."""
     # Create index for user
     client = Elasticsearch(cls.__url__)
     indice = IndicesClient(client)
     if indice.exists(index=user.user_id):
         if 'delete_existing' in kwargs and kwargs['delete_existing']:
             log.warn('Deleting existing index for user %s' % user.user_id)
             indice.delete(index=user.user_id)
         else:
             log.warn('Index already exists for user %s' % user.user_id)
             return False
     log.info('Creating index for user %s' % user.user_id)
     indice.create(index=user.user_id)
     return True
Example #16
    def recreate_index_model(self, model: Union[type[Gallery], type[Archive]]):

        from elasticsearch.client.indices import IndicesClient

        indices_client = IndicesClient(client=self.es_client)
        index_name = model._meta.es_index_name  # type: ignore
        if indices_client.exists(index=index_name):
            indices_client.delete(index=index_name)
        indices_client.create(index=index_name)
        indices_client.close(index=index_name)
        indices_client.put_settings(
            index=index_name,
            body={
                "index": {
                    "max_result_window": settings.MAX_RESULT_WINDOW
                },
                "analysis": {
                    "filter": {
                        "edge_ngram_filter": {
                            "type": "edge_ngram",
                            "min_gram": 2,
                            "max_gram": 20
                        }
                    },
                    "analyzer": {
                        "edge_ngram_analyzer": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["lowercase", "edge_ngram_filter"]
                        }
                    }
                }
            })
        indices_client.put_mapping(
            body=model._meta.es_mapping,  # type: ignore
            index=index_name,
        )
        indices_client.open(index=index_name)
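The edge_ngram analyzer configured above can be sanity-checked with the analyze API (a sketch; with min_gram 2 and max_gram 20, the lowercased token 'search' yields the prefixes shown):

res = indices_client.analyze(index=index_name,
                             body={'analyzer': 'edge_ngram_analyzer', 'text': 'Search'})
print([t['token'] for t in res['tokens']])
# ['se', 'sea', 'sear', 'searc', 'search']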
Example #17
class ElasticSearchClient:
	"""
	Class used as a client to the Elasticsearch server.
	"""
	def __init__(self, host, port, username, password, indexname):
		"""
		Initializes this Elasticsearch Client.
		
		:param host: the HTTP address of the Elasticsearch server.
		:param port: the HTTP port of the Elasticsearch server.
		:param username: the username for connecting to the index.
		:param password: the password for connecting to the index.
		:param indexname: the name of the Elasticsearch index.
		"""
		self.indexname = indexname
		self.client = Elasticsearch(connection_class = SafeRequestsHttpConnection, host = host, port = int(port), http_auth = [username, password])
		self.snapshotclient = SnapshotClient(self.client)
		self.indicesclient = IndicesClient(self.client)

	def delete_index_and_mappings(self):
		"""
		Deletes the index and all its mappings.
		"""
		try:
			self.client.indices.delete(index = self.indexname)
		except NotFoundError:
			pass

	def create_index_and_mappings(self, update_mappings = False):
		"""
		Creates or updates the index and its mappings.
		
		:param update_mappings: boolean denoting whether the mappings should be created (False) or updated (True).
		"""
		if not self.client.indices.exists(self.indexname):
			self.client.indices.create(index = self.indexname, body = load_file_to_json("properties/indexsettings.json"))
		mappings = {}
		if self.indexname in self.client.indices.get_mapping(self.indexname):
			mappings = self.client.indices.get_mapping(self.indexname)[self.indexname]['mappings']
		if update_mappings:
			self.client.indices.close(self.indexname)
		if 'files' not in mappings or update_mappings:
			self.client.indices.put_mapping(index = self.indexname, doc_type = 'files',
				body = load_file_to_json("properties/filesproperties.json"))
		if 'projects' not in mappings or update_mappings:
			self.client.indices.put_mapping(index = self.indexname, doc_type = 'projects',
				body = load_file_to_json("properties/projectsproperties.json"))
		if update_mappings:
			self.client.indices.open(self.indexname)

	def has_project(self, project_id):
		"""
		Checks if the index contains a project.
		
		:param project_id: the id of the project to check if it is contained in the index.
		:returns: True if the index contains the project, or False otherwise.
		"""
		return self.client.exists(index = self.indexname, doc_type = 'projects', id = project_id)

	def has_file(self, file_id):
		"""
		Checks if the index contains a file.
		
		:param file_id: the id of the file to check if it is contained in the index.
		:returns: True if the index contains the file, or False otherwise.
		"""
		return self.client.exists(index = self.indexname, doc_type = 'files', id = file_id)

	def create_project(self, project):
		"""
		Creates a project in the index.
		
		:param project: the data of the project in JSON format.
		"""
		self.client.create(index = self.indexname, doc_type = 'projects', id = project['fullname'], body = project)

	def create_file(self, afile):
		"""
		Creates a file in the index.
		
		:param afile: the data of the file in JSON format.
		"""
		self.client.create(index = self.indexname, doc_type = 'files', id = afile['fullpathname'], parent = afile['project'], body = afile)

	def update_file(self, afile):
		"""
		Updates a file in the index.
		
		:param afile: the data of the file in JSON format.
		"""
		self.client.update(index = self.indexname, doc_type = 'files', id = afile['fullpathname'], parent = afile['project'], body = {'doc': afile})

	def delete_file(self, afile_id):
		"""
		Deletes a file from the index.
		
		:param afile_id: the id of the file to be deleted.
		"""
		self.client.delete(index = self.indexname, doc_type = 'files', id = afile_id, routing = '/'.join(afile_id.split('/')[0:2]))

	def delete_project(self, project_id):
		"""
		Deletes a project from the index. Note that this function also deletes all the files of the project.
		
		:param project_id: the id of the project to be deleted.
		"""
		self.client.delete_by_query(index = self.indexname, doc_type = 'files', body = {"query": { "bool": { "must": { "match_all": {} }, "filter": { "term": { "_routing": project_id } } } } })
		self.client.delete(index = self.indexname, doc_type = 'projects', id = project_id)

	def get_project_fileids_and_shas(self, project_id):
		"""
		Returns all the files and their corresponding shas for a project.
		
		:param project_id: the id of the project of which the files and the shas are returned.
		:returns: a dict containing the files of the project as keys and their shas as values.
		"""
		sourcefiles = self.client.search(index = self.indexname, doc_type = 'files',
			body = {"query": { "term" : { "_routing": project_id } } }, routing = project_id, size = 100000000)['hits']['hits']  # Limitation! Each project must have no more than 100000000 files
		fileidsandshas = {}
		for afile in sourcefiles:
			fileidsandshas[afile['_id']] = afile['_source']['sha']
		return fileidsandshas

	def execute_query(self, query, doc_type = 'files'):
		"""
		Executes a query on the index.
		
		:param query: the body of the query.
		:param doc_type: the document type to which the query is executed, either 'projects' or 'files'.
		:returns: the response of the query.
		"""
		return self.client.search(index = self.indexname, doc_type = doc_type, body = query)

	def test_analyzer(self, analyzer, text):
		"""
		Tests an analyzer of the index.
		
		:param analyzer: the analyzer to be tested.
		:param text: the text to be analyzed as a test.
		:returns: the analyzed text.
		"""
		result = self.indicesclient.analyze(index = self.indexname, analyzer = analyzer, body = text)
		return [r['token'] for r in result['tokens']]

	def backup(self, backupdir):
		"""
		Backs up the index.
		
		:param backupdir: the directory used to back up the index.
		"""
		repositoryname = os.path.basename("backup" + self.indexname)
		try:
			self.snapshotclient.get_repository(repository = repositoryname)
		except NotFoundError:
			self.snapshotclient.create_repository(repository = repositoryname, body = {"type": "fs", "settings": {"location": backupdir + os.sep + self.indexname}})
		try:
			self.snapshotclient.get(repository = repositoryname, snapshot = self.indexname + "snapshot")
		except NotFoundError:
			self.snapshotclient.create(repository = repositoryname, snapshot = self.indexname + "snapshot", body = {"indices": self.indexname}, wait_for_completion = True)

	def delete_backup(self):
		"""
		Removes any backups of the index. If there are no backups, this function does nothing.
		"""
		repositoryname = os.path.basename("backup" + self.indexname)
		try:
			self.snapshotclient.delete(repository = repositoryname, snapshot = self.indexname + "snapshot")
		except NotFoundError:
			pass

	def restore_backup(self):
		"""
		Restores a backup of the index.
		"""
		repositoryname = os.path.basename("backup" + self.indexname)
		if not self.client.indices.exists(self.indexname):
			self.client.indices.create(index = self.indexname, body = load_file_to_json("properties/indexsettings.json"))
		self.client.indices.close(self.indexname)
		self.snapshotclient.restore(repository = repositoryname, snapshot = self.indexname + "snapshot", body = {"indices": self.indexname}, wait_for_completion = True)
		self.client.indices.open(self.indexname)

	def flush(self):
		"""
		Flushes the index.
		"""
		self.indicesclient.flush(index = self.indexname)
Example #18
 def __init__(self, client: Elasticsearch):
     self.client = IndicesClient(client)
Example #19
class ESInterface():

    """Interface for ElasticSearch"""

    _count_total = -1  # N: Number of docs
    _idf = None  # dict for storing idf values

    def __init__(self, host='localhost', port=9200,
                 index_name='pubmed', cred_path='.cred'):
        self.host = host
        self.port = port
        self.index_name = index_name
        self.cred_path = cred_path
        # self.doc_type = 'papers'
        self.es = self.__connect()
        self.ic = IndicesClient(self.es)
        self.page_cache = shelve.open("/Users/rmn/git/BioSum/biosum-supervised/cache/pages.p", writeback=False)

    def login(self, username, password):
        pass

    @property
    def description(self):
        # get mapping, clean it up
        m = self.es.indices.get_mapping(self.index_name)
        m = m[self.index_name]['mappings']

        description = {'host': self.host,
                       'port': self.port,
                       'index_name': self.index_name,
                       'mapping': m}
        return description

    @property
    def size(self):
        stats = self.es.indices.stats()['indices'][self.index_name]
        return stats['total']['docs']['count']

    def __connect(self):
        '''Private method used to connect to the ElasticSearch instance.'''
        es = ES(hosts=[{'host': self.host, 'port': self.port}])

        # checks if server exists
        if not es.ping():
            err = ('It appears that nothing is running at http://%s:%s' %
                   (self.host, self.port))
            raise OSError(err)

        # load the credentials file (if possible)
#         with file(self.cred_path) as cf:
#             username, password = [l.strip() for l in cf.readlines()][:2]
#         data = json.dumps({'username': username, 'password': password})
        url = 'http://%s:%s/login' % (self.host, self.port)
        resp = json.loads(requests.post(url).text)
#         if resp['status'] == 200:
#             self.auth_token = resp['token']
#         else:
#             self.auth_token = ''

        # checks if index exists
        try:
            es.indices.get_mapping(self.index_name)
        except TransportError as e:
            if e.args[0] == 403:
                err = list(e.args)
                err[1] = ('Credentials not valid for %s:%s/%s' %
                          (self.host, self.port, self.index_name))
                e.args = tuple(err)
            elif e.args[0] == 404:
                self.__del__()
                err = list(e.args)
                err[1] = ('No index named "%s" is available at %s:%s' %
                          (self.index_name, self.host, self.port))
                e.args = tuple(err)
            raise
        return es

    def __del__(self):
        requests.post('http://%s:%s/logout' % (self.host, self.port))

    # def get_scroll(self, scroll_size, scroll_timeout):
    #     q_body = {"query": {"match_all": {}}}
    #     return self.es.search(self.index_name, self.doc_type, q_body,
    #                           search_type='scan', scroll='100m',
    #                           size='10000')

    # def scroll(self, scroll_id):
    #     return self.es.scroll(scroll_id, scroll='10m')

    # def scan_and_scroll(self, doc_type, scroll_size=50, scroll_timeout=10):
    #     """
    #     The scan search type allows to efficiently scroll a large result set.
    #     The response will include no hits, with two important results,
    #     the total_hits will include the total hits that match the query
    #     and the scroll_id that allows to start the scroll process.

    #     @param scroll_size: scroll size
    #     @param scroll_timeout: roundtrip timeout
    #     """
    #     q_body = {"query": {
    #         "match_all": {}
    #     }}
    #     result = self.es.search(self.index_name,
    #                             doc_type,
    #                             q_body,
    #                             search_type='scan',
    #                             scroll=str(scroll_timeout) +
    #                             'm',
    #                             size=scroll_size)
    #     res = self.es.scroll(
    #         result['_scroll_id'], scroll=str(scroll_timeout) + 'm')
    #     finalres = []
    #     while len(res['hits']['hits']) > 0:
    #         finalres.append(res)
    #         res = self.es.scroll(
    #             res['_scroll_id'], scroll=str(scroll_timeout) + 'm')
    #     return finalres

    # def esc(self, txt):
    #     for e in TO_ESCAPE:
    #         txt = txt.replace(e, '\%s' % e)
    #     return txt

    def find_all(self, source_fields=None, doc_type=''):
        if source_fields:
            q_body = {
                "fields": source_fields,
                "query": {
                    "match_all": {}
                }
            }
        else:
            q_body = {
                "query": {
                    "match_all": {}
                }
            }
        return self.es.search(
            body=q_body, size=1000000, index=self.index_name, doc_type=doc_type)['hits']['hits']

    def multi_field_search(self,
                           field_vals,
                           fields=['sentence', 'mm-concepts', 'noun_phrases'],
                           maxsize=1000,
                           field_boost=[1, 3, 2],
                           offset=0,
                           source_fields=[],
                           doc_type='',
                           params=None):
        '''Interface for simple query tasks.
        Parameters:
            - field_vals [required]: a list of field values to query
            - maxsize [optional]:   number of results to get.
                                    default is 1000.
        Returns results.'''
#         q_body = {
#             "fields": source_fields,
#             "query": {
#                 "dis_max": {
#                     "queries": [
#                         {"match": {
#                             "sentence":  {
#                                 "query": sentence,
#                                 "boost": field_boost[0]
#                             }}},
#                         {"match": {
#                             "mm-concepts":  {
#                                 "query": concepts,
#                                 "boost": field_boost[1]
#                             }}},
#                         {"match": {
#                             "noun_phrases":  {
#                                 "query": noun_phrases,
#                                 "boost": field_boost[2]
#                             }}}
#                     ]
#                 }
#             }
#         }
        q_body = {
            "fields": source_fields,
            "query": {
                "dis_max": {
                    "queries": [
                    ]
                }
            }
        }
        for idx in range(len(field_vals)):
            q_body['query']['dis_max']['queries'].append({"match": {
                fields[idx]:  {
                    "query": field_vals[idx],
                    "boost": field_boost[idx]
                }}})

        if params is not None:
            for key in params:
                q_body['query']['dis_max'][key] = params[key]

        return self._cursor_search(q_body, maxsize, offset, doc_type)

    def simple_search(self, query, field='_all', maxsize=1000,
                      offset=0, source_fields=[], doc_type='',
                      operator='or', phrase_slop=0, escape=False, params=None):
        '''Interface for simple query tasks.
        Parameters:
            - query [required]: the string to query
            - maxsize [optional]:   number of results to get.
                                    default is 1000.
        Returns results.'''

        if escape:
            query = self.esc(query)

        q_body = {
            "fields": source_fields,
            'query': {
                'query_string': {
                    'query': query,
                    'default_operator': operator,
                    'use_dis_max': True,
                    'auto_generate_phrase_queries': True,
                    'phrase_slop': phrase_slop
                }
            }
        }
        if params is not None:
            for key in params:
                q_body['query']['query_string'][key] = params[key]

        if field:
            q_body['query']['query_string']['default_field'] = field

        return self._cursor_search(q_body, maxsize, offset, doc_type)

    def count(self, query, field='_all', operator="AND"):
        q = {
            'query': {
                "query_string": {
                    "default_field": field,
                    "default_operator": operator,
                    "query": query
                }
            }
        }
        resp = self.es.count(body=q, index=self.index_name)
        if resp['_shards']['failed'] > 0:
            raise RuntimeError("ES count failed: %s", resp)

        return resp['count']

    def _cursor_search(self, q, maxsize, offset, doc_type):
        return self.es.search(index=self.index_name,
                              body=q,
                              size=maxsize,
                              from_=offset,
                              doc_type=doc_type)['hits']['hits']

    def update_field(self, docid, doc_type,
                     field_name, field_value):
        ''' Update field field_name with field_value'''
        body = {'doc': {field_name: field_value}}
        self.es.update(id=docid, doc_type=doc_type,
                       index=self.index_name, body=body)

    def get_page_by_res(self, res_dict, cache=False):
        return self.get_page(res_dict['_id'],
                             res_dict['_type'],
                             cache=cache)

    def get_page(self, docid, doc_type, cache=False):
        ''' Retrieve a page's source from the index
        Parameters:
            - id [required]: the ES id of the page to retrieve
            - doc_type [required]: the ES document type to retrieve
        '''
        k = str("-".join((docid, self.index_name, doc_type)))

        if not cache or k not in self.page_cache:
            page = self.es.get_source(id=docid,
                                      index=self.index_name,
                                      doc_type=doc_type)

            if cache:
                self.page_cache[k] = page
                self.page_cache.sync()
        else:
            page = self.page_cache[k]

        return page

    def get_index_analyzer(self):
        settings = self.ic.get_settings(index=self.index_name)
        analyzers = settings[self.index_name]['settings']['index']['analysis']['analyzer']
        return list(analyzers.keys())[0]

    def tokenize(self, text, field="text", analyzer=None):
        ''' Return a list of tokenized tokens
        Parameters:
            - text [required]: the text to tokenize
            - field [optional]: the field whose ES analyzer
                                should be used (default: text)
        '''
        params = {}
        if analyzer is not None:
            params['analyzer'] = analyzer
        try:
            response = self.ic.analyze(body=text, field=field,
                                       index=self.index_name,
                                       params=params
                                       )
            return [d['token'] for d in response['tokens']]
        except RequestError:
            return []

    def phrase_search(self, phrase, doc_type='',
                      field='_all', slop=0, in_order=True,
                      maxsize=1000, offset=0, source_fields=[]):
        ''' Retrieve documents containing a phrase.
            Does not return the documents' source. '''

        phraseterms = self.tokenize(phrase, field=field)
        if len(phraseterms) == 0:
            return []

        q = {
            "fields": source_fields,
            "query": {
                "span_near": {
                    "clauses": [{"span_term": {field: term}}
                                for term in phraseterms],
                    "slop": slop,  # max number of intervening unmatched pos.
                    "in_order": in_order,
                    "collect_payloads": False
                }
            }
        }
        return self._cursor_search(q, maxsize, offset, doc_type)

    def phrase_count(self, phrase, field='_all', slop=0, in_order=True):
        phraseterms = self.tokenize(phrase, field=field)

        if len(phraseterms) == 0:
            return 0

        q = {
            "query": {
                "span_near": {
                    "clauses": [{"span_term": {field: term}}
                                for term in phraseterms],
                    "slop": slop,  # max number of intervening unmatched pos.
                    "in_order": in_order,
                    "collect_payloads": False
                }
            }
        }

        resp = self.es.count(body=q, index=self.index_name)
        if resp['_shards']['failed'] > 0:
            raise RuntimeError("ES count failed: %s", resp)

        return resp['count']

    def index_hash(self):
        ''' Weak hash (only considers mapping and size) of index_name '''
        ic_sts = self.ic.stats(index=self.index_name)['_all']['total']['store']
        ic_map = self.ic.get_mapping(index=self.index_name)
        s = "_".join((unicode(ic_sts), unicode(ic_map)))
        return hashlib.md5(s).hexdigest()

    # def get_mappings(self):
    #     mappings = self.es.indices.get_mapping(self.index_name)
    #     return mappings[self.index_name]['mappings']

    def set_mappings(self, mapdict):
        ''' Set mapping for documents in index according to map_dict;
            only documents types with an entry in map dict are updated.
            No input check; PLEASE FOLLOW SPECIFICATIONS!
            format:
            {<doc_type_1>: {'properties': {'doc_field_1': {<properties>}
                                           ...
                                           'doc_field_n': {<properties>}
                                           }
                            }
            }
        '''
        for doc_type, mapping in mapdict.items():
            self.es.indices.put_mapping(index=self.index_name,
                                        doc_type=doc_type,
                                        body=mapping)

    # def get_ids(self, doc_type):
    #     res = self.scan_and_scroll(doc_type, scroll_size=5000)
    #     return res

    # def get_types(self):
    #     from subprocess import check_output
    #     request = 'http://localhost:9200/indexname/_mapping?pretty=1'
    #     request = request.replace('indexname', self.index_name)
    #     res = json.loads(check_output(["curl", "-XGET", request]))
    #     return res[self.index_name]['mappings'].keys()

    def get_termvector(self, doc_type, docid, fields=None):
        """ Return the term vector and stratistics
            for document docid of type doc_type.
            If fields is not provided, term vectors
            are returned for each field.
        """
        if fields is None:
            fields = []
        body = {
            "fields": fields,
            "offsets": True,
            "payloads": True,
            "positions": True,
            "term_statistics": True,
            "field_statistics": True
        }
        resp = self.es.termvector(index=self.index_name,
                                  doc_type=doc_type,
                                  id=docid,
                                  body=body)
        return resp

    def add(self, index,
            doc_type,
            entry,
            docid=None):
        self.es.index(index=index, doc_type=doc_type, body=entry,
                      id=docid)

    def get_avg_size(self, field):
        '''
        Get the average document length for the given field
        '''
        q = {
            "fields": [field],
            "query": {"match_all": {}},
            "aggs": {
                "my_agg": {
                    "avg": {"script": "doc['%s'].size()" % field}
                }
            }
        }
        res = self.es.search(index=self.index_name, body=q)
        return res['aggregations']['my_agg']['value']

    def get_idf(self, term):
        '''
        Returns the idf of a given term on the index

        Args:
            term(str)

        Returns:
            float -- idf value
        '''
        if self._count_total == -1:
            self._count_total = self.count(query='*:*')
        if self._idf is not None:
            if term in self._idf:
                return self._idf[term]
            else:
                count = self.count(term)
                if count == 0:
                    idf = 0
                else:
                    idf = math.log(
                        (self._count_total - count + 0.5) / (count + 0.5))
                self._idf[term] = idf
        else:
            count = self.count(term)
            if count == 0:
                idf = 0
            else:
                idf = math.log(
                    (self._count_total - count + 0.5) / (count + 0.5))
            self._idf = {term: idf}
        return idf

    def scan_and_scroll(self, doc_type, scroll_size=500, scroll_timeout=10):
        """
        The scan search type allows to efficiently scroll a large result set.
        The response will include no hits, with two important results,
        the total_hits will include the total hits that match the query
        and the scroll_id that allows to start the scroll process.
        Returns a list of results

        @param scroll_size: scroll size
        @param scroll_timeout: roundtrip timeout
        """
        q_body = {"query": {
            "match_all": {}
        }}
        result = self.es.search(self.index_name,
                                doc_type,
                                q_body,
                                search_type='scan',
                                scroll=str(scroll_timeout) +
                                'm',
                                size=scroll_size)
        res = self.es.scroll(
            result['_scroll_id'], scroll=str(scroll_timeout) + 'm')
        finalres = []
        while len(res['hits']['hits']) > 0:
            print(len(res['hits']['hits']))
            finalres += res['hits']['hits']
            res = self.es.scroll(
                res['_scroll_id'], scroll=str(scroll_timeout) + 'm')
        return finalres
Example #20
 def get_indices(self, alias_name):
     indices_client = IndicesClient(client=self.client)
     try:
         return list(indices_client.get_alias(name=alias_name).keys())
     except NotFoundError:
         return []
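For example, with an alias pointing at two timestamped indices (the alias, index names, and the 'store' instance are hypothetical):

indices = store.get_indices('products')
# e.g. ['products-2021-01', 'products-2021-02'], or [] if the alias does not exist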
Example #21
def main():
    c_parser = configparser.ConfigParser()
    c_parser.read("config.ini")
    es_config = c_parser["ELASTIC"]
    gtfs_config = c_parser["GTFS"]
    gtfs_path = gtfs_config["gtfs_path"]
    index_prefix = es_config["index_prefix"]
    stops_index = index_prefix + "_stops"
    shapes_index = index_prefix + "_shapes"
    stop_times_index = index_prefix + "_stop_times"
    es = Elasticsearch(
        host=es_config["host"],
        scheme=es_config["scheme"],
        port=es_config.getint("port"),
        http_auth=(es_config["username"], es_config["password"]),
        use_ssl=es_config.getboolean("use_ssl"),
        verify_certs=es_config.getboolean("verify_certs"),
        ca_certs=certifi.where())
    
    with open("mappings/shapes.json", 'r' ) as shapes_mapping_file:
        shapes_mapping = shapes_mapping_file.read()
    
    with open("mappings/stops.json", 'r' ) as stops_mapping_file:
        stops_mapping = stops_mapping_file.read()

    with open("mappings/stop_times.json", 'r') as stop_times_file:
        stop_times_mapping = stop_times_file.read()
    
    indices = IndicesClient(es)
    indices.create(stops_index, body=stops_mapping)
    indices.create(shapes_index, body=shapes_mapping)
    indices.create(stop_times_index, body=stop_times_mapping)
    all_stops = gather_stops(gtfs_path)
    for ok, item in parallel_bulk(es, genbulkactions(stops_index, all_stops.values()), chunk_size=500):
        if not ok:
            print(item)
    
    print("Done with stops")

    all_shapes = gather_shapes(gtfs_path)
    all_trips = gather_trips(gtfs_path)
    all_routes = gather_routes(gtfs_path)
    shapes_to_route = shape_to_route_dict(all_trips.values(), all_routes)
    for shape_id in shapes_to_route.keys():
        all_shapes[shape_id]['route'] = shapes_to_route[shape_id]
        all_shapes[shape_id].pop('start_seq', None)
        all_shapes[shape_id].pop('finish_seq', None)

    for ok, item in parallel_bulk(es, genbulkactions(shapes_index, all_shapes.values()), chunk_size=500):
        if not ok:
            print(item)
    
    print("Done with shapes")
    for trip in all_trips.values():
        route_id = trip.pop("route_id", None)
        if route_id:
            trip['route'] = all_routes[int(route_id)]

    all_stop_times = gather_stop_times(gtfs_path)
    for stop_time in all_stop_times:
        trip_id = stop_time.pop("trip_id", None)
        stop_id = stop_time.pop("stop_id", None)
        if trip_id:
            stop_time['trip'] = all_trips[int(trip_id)]
        if stop_id:
            stop_time['stop'] = all_stops[int(stop_id)]

    for ok, item in parallel_bulk(es, genbulkactions(stop_times_index, all_stop_times), chunk_size=1000):
        if not ok:
            print(item) 

    print("Done with stop times")
Example #22
async def shutdown_elastic_search(elastic_search: Elasticsearch,
                                  indices_client: IndicesClient) -> None:
    indices_client.delete(index=INDEX)
    elastic_search.close()
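Paired with Example #11, the setup/teardown lifecycle is symmetric; asyncio.run is needed only because this helper is declared async (a sketch):

import asyncio

elastic_search, indices_client = initialize_elastic_search()
# ... index and query documents ...
asyncio.run(shutdown_elastic_search(elastic_search, indices_client))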
Example #23
 def drop_index(self, using=None):
     from elasticsearch.client.indices import IndicesClient
     connection = get_connection_for_doctype(self._meta.document,
                                             using=using)
     return IndicesClient(connection).delete(self._meta.index)