def store_vacancy_record(es: Elasticsearch, index_name: str, record: dict, parent_id: str) -> str:
    """Index a vacancy document under an id derived from its content.

    The document id is the MD5 hex digest of every ``key``/``value`` pair of
    ``record`` (in the dict's iteration order, each rendered with
    ``"{}{}".format``) followed by ``parent_id``. The same record and parent
    therefore always map to the same document id.

    Args:
        es: Client whose ``index`` method is called.
        index_name: Name of the target index.
        record: Document body; also the input of the id hash.
        parent_id: Passed to ``es.index`` as ``parent`` and appended to the
            hash input.

    Returns:
        The concatenated, un-hashed string the id was computed from.
    """
    # Build the hash input in one pass instead of growing a string with
    # ``+=`` on every iteration.
    hash_string = "".join("{}{}".format(k, v) for k, v in record.items()) + parent_id
    doc_id = hashlib.md5(hash_string.encode()).hexdigest()
    es.index(index=index_name, doc_type='vacancies', id=doc_id, body=record, parent=parent_id)
    # NOTE(review): the raw string is returned, not ``doc_id``. Kept as-is so
    # callers keep receiving the same value - confirm this is intended.
    return hash_string
class ElasticClient:
    """Helper bound to a single Elasticsearch index name and document type."""

    def __init__(self, index_name, index_type, ip="127.0.0.1"):
        """Remember the index settings and create the client.

        @param index_name: index name
        @param index_type: index type
        @param ip: host handed to ``Elasticsearch`` as a one-element list
        """
        self.index_name = index_name
        self.index_type = index_type
        self.es = Elasticsearch([ip])

    def create_index(self, index_name="teacher_resume", index_type="tr_type"):
        """Create the index with an explicit field mapping.

        The index and type actually used are ``self.index_name`` and
        ``self.index_type``. The ``index_name`` / ``index_type`` parameters
        are not read; they are kept only so existing calls stay valid.
        """
        # Create the index
        _index_mappings = {
            "mappings": {
                self.index_type: {
                    "properties": {
                        "teachername": {"type": "keyword"},
                        "telephone": {"type": "text"},
                        "email": {"type": "keyword"},
                        # Elasticsearch has no "array" field type: any field
                        # may hold several values. The former "array" value
                        # made the whole create request invalid, and the
                        # resulting 400 was hidden by ``ignore=400`` below.
                        "research_direction": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256,
                                },
                            },
                        },
                        "personal_profile": {"type": "text"},
                        "teaching_results": {"type": "text"},
                        "research_results": {"type": "text"},
                        "lab_introduction": {"type": "text"},
                    }
                }
            }
        }
        # ignore=400 stops a 400 response (for instance "index already
        # exists") from raising, so this method can be called repeatedly.
        self.es.indices.create(index=self.index_name, body=_index_mappings, ignore=400)

    def load_index(self):
        """Index every item of ``static/files/test_json.json``.

        The file is looked up under ``BASE_DIR`` and must contain a JSON
        iterable of documents. Each document is indexed without an explicit
        id and the client's response is printed.
        """
        path = os.path.join(BASE_DIR, 'static', 'files', 'test_json.json')
        # Read as UTF-8 explicitly rather than the platform default encoding.
        with open(path, encoding='utf-8') as f:
            result = json.load(f)
        for item in result:
            res = self.es.index(index=self.index_name, doc_type=self.index_type, body=item)
            print(res)
class SoSoImp(object):
    """Writes content records into the ``yuqing`` Elasticsearch index."""

    def __init__(self, hosts=None):
        """Create the Elasticsearch client.

        Args:
            hosts: Optional host list handed to ``Elasticsearch``. When
                omitted, the two addresses that used to be hard-coded here
                are used.
        """
        if hosts is None:
            hosts = ['192.168.2.129', '192.168.2.130']
        self.es = Elasticsearch(hosts)

    def addSoso(self, Content):
        """Add search information for one content record.

        ``fenci.mm`` is called with the record's title and text (``None``
        replaced by ``""``) and its result is stored under ``"sentiment"``.
        The document is indexed into ``yuqing`` / ``yuqing_type`` with
        ``Content.rowkey`` as its id.

        Args:
            Content: Object exposing the attributes read below (``title``,
                ``txt``, ``summary``, ``rowkey``, ...).
        """
        # Identity comparison is the reliable way to test for None.
        title = Content.title if Content.title is not None else ""
        txt = Content.txt if Content.txt is not None else ""
        # Get the sentiment
        source = fenci.mm(title, txt)
        # NOTE(review): the body stores the raw ``Content.title`` and
        # ``Content.txt`` (possibly None), not the ""-defaulted copies used
        # for scoring - kept unchanged.
        body = {
            "title": Content.title,
            "summary": Content.summary,
            "context": Content.txt,
            "site_cls": Content.site_cls,
            "domaintype": Content.domaintype,
            "countryid": Content.countryid,
            "province": Content.province,
            "city": Content.city,
            "area": Content.area,
            "url": Content.url,
            "publictime": Content.pubdate,
            "createtime": Content.created,
            "sitename": Content.site_name,
            "domain1": Content.domain_1,
            "domain2": Content.domain_2,
            "sentiment": source,
            "subname": Content.subname,
        }
        self.es.index(index="yuqing", doc_type="yuqing_type", body=body, id=Content.rowkey)
def store_change_record(es: Elasticsearch, index_name: str, record: dict, url: str) -> None:
    """Index ``record`` as a ``changes`` document whose id is ``url``.

    Args:
        es: Client whose ``index`` method is called.
        index_name: Name of the target index.
        record: Document body.
        url: Used verbatim as the document id.
    """
    request = {
        'index': index_name,
        'doc_type': 'changes',
        'id': url,
        'body': record,
    }
    es.index(**request)
def store_url_record(es: Elasticsearch, index_name: str, record: dict, record_id: str) -> None:
    """Index ``record`` as a ``url`` document under the given id.

    Args:
        es: Client whose ``index`` method is called.
        index_name: Name of the target index.
        record: Document body.
        record_id: Used verbatim as the document id.
    """
    request = {
        'index': index_name,
        'doc_type': 'url',
        'body': record,
        'id': record_id,
    }
    es.index(**request)
} },
    # NOTE(review): the opening of this call lies outside the visible
    # excerpt; only the tail of its argument is shown here.
    # Mapping for the "diary" type: the "content" field is text, uses the
    # "morfologik" analyzer and has "term_vector" set to "yes".
    "mappings": {
        "diary": {
            "properties": {
                "content": {
                    "term_vector": "yes",
                    "type": "text",
                    "analyzer": "morfologik"
                }
            }
        }
    }
})

# NOTE(review): the original indentation of the statements below was lost;
# the nesting shown is the most plausible reading - confirm against the
# original file.
list_of_files = glob.glob('../ustawy/*.txt')  # build the list of input files
print("loading files....")
# Index each file's full text as one document, using its path as the id.
for file_name in list_of_files:
    with open(file_name, 'r') as myfile:
        data = myfile.read()
    es.index(index=INDEX, doc_type=TYPE, id=file_name, body={
        "content": data,
    })
# Print the response of a multi-termvectors request. No ids or body are
# passed here - TODO confirm that this is intended.
print(es.mtermvectors(index=INDEX, doc_type=TYPE))
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the official ElasticSearch library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It is undefined by default, as it is up to the subclasses to provide
      one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    is raised when the backend tries to configure the client.

    The methods read ``self.server`` and ``self.indices``, which are not set
    in this class - presumably they come from ``Base`` (TODO confirm).
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined. On success the client is stored in
        ``self.client``.
        """
        # Both keys are read before validation; a missing key raises KeyError.
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper acts
    # as a proxy for them.

    def ping(self, **kwargs):
        """Forward to the client's ``ping``."""
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        """Forward to the client's ``info``."""
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        """Forward to the client's ``put_script``."""
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        """Forward to the client's ``get_script``."""
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        """Forward to the client's ``delete_script``."""
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        """Forward to the client's ``put_template``."""
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        """Forward to the client's ``get_template``."""
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        """Forward to the client's ``delete_template``."""
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but it can be used.
    # As it makes sense to not give an index, developers are free to use these
    # as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``mget``; ``index`` is the caller's, not
        ``self.indices``."""
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``bulk``; ``index`` is the caller's, not
        ``self.indices``."""
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``msearch``; ``index`` is the caller's,
        not ``self.indices``."""
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``mpercolate``; ``index`` is the caller's,
        not ``self.indices``."""
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        """Forward to the client's ``scroll``."""
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        """Forward to the client's ``clear_scroll``."""
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        """Call the client's ``create`` with ``self.indices`` as first
        argument."""
        return self.client.create(
            self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        """Call the client's ``index`` with ``self.indices`` as first
        argument."""
        return self.client.index(
            self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        """Call the client's ``exists`` with ``self.indices`` as first
        argument; note ``doc_id`` comes before ``doc_type`` here."""
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        """Call the client's ``get`` with ``self.indices`` as first argument;
        note ``doc_id`` comes before ``doc_type`` here."""
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        """Call the client's ``get_source`` with ``self.indices`` as first
        argument; note ``doc_id`` comes before ``doc_type`` here."""
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``update`` with ``self.indices`` as first
        argument."""
        return self.client.update(
            self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``search`` with ``self.indices`` as first
        argument."""
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        """Call the client's ``search_shards`` with ``self.indices`` as first
        argument."""
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``search_template`` with ``self.indices`` as
        first argument."""
        return self.client.search_template(
            self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``explain`` with ``self.indices`` as first
        argument."""
        return self.client.explain(
            self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        """Call the client's ``delete`` with ``self.indices`` as first
        argument."""
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``count`` with ``self.indices`` as first
        argument."""
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``delete_by_query`` with ``self.indices`` as
        first argument."""
        return self.client.delete_by_query(
            self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        """Call the client's ``suggest``; ``self.indices`` is passed second,
        after ``body``."""
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        """Call the client's ``percolate`` with ``self.indices`` as first
        argument."""
        return self.client.percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        """Call the client's ``count_percolate`` with ``self.indices`` as
        first argument."""
        return self.client.count_percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``mlt`` with ``self.indices`` as first
        argument."""
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``termvector`` with ``self.indices`` as first
        argument."""
        return self.client.termvector(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``mtermvectors`` with ``self.indices`` as first
        argument."""
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``benchmark`` with ``self.indices`` as first
        argument."""
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        """Forward to the client's ``abort_benchmark``; no index is passed."""
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        """Call the client's ``list_benchmarks`` with ``self.indices`` as
        first argument."""
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the official ElasticSearch library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It is undefined by default, as it is up to the subclasses to provide
      one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    is raised when the backend tries to configure the client.

    The methods read ``self.server`` and ``self.indices``, which are not set
    in this class - presumably they come from ``Base`` (TODO confirm).
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined. On success the client is stored in
        ``self.client``.
        """
        # Both keys are read before validation; a missing key raises KeyError.
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper acts
    # as a proxy for them.

    def ping(self, **kwargs):
        """Forward to the client's ``ping``."""
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        """Forward to the client's ``info``."""
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        """Forward to the client's ``put_script``."""
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        """Forward to the client's ``get_script``."""
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        """Forward to the client's ``delete_script``."""
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        """Forward to the client's ``put_template``."""
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        """Forward to the client's ``get_template``."""
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        """Forward to the client's ``delete_template``."""
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but it can be used.
    # As it makes sense to not give an index, developers are free to use these
    # as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``mget``; ``index`` is the caller's, not
        ``self.indices``."""
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``bulk``; ``index`` is the caller's, not
        ``self.indices``."""
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``msearch``; ``index`` is the caller's,
        not ``self.indices``."""
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        """Forward to the client's ``mpercolate``; ``index`` is the caller's,
        not ``self.indices``."""
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        """Forward to the client's ``scroll``."""
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        """Forward to the client's ``clear_scroll``."""
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        """Call the client's ``create`` with ``self.indices`` as first
        argument."""
        return self.client.create(self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        """Call the client's ``index`` with ``self.indices`` as first
        argument."""
        return self.client.index(self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        """Call the client's ``exists`` with ``self.indices`` as first
        argument; note ``doc_id`` comes before ``doc_type`` here."""
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        """Call the client's ``get`` with ``self.indices`` as first argument;
        note ``doc_id`` comes before ``doc_type`` here."""
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        """Call the client's ``get_source`` with ``self.indices`` as first
        argument; note ``doc_id`` comes before ``doc_type`` here."""
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``update`` with ``self.indices`` as first
        argument."""
        return self.client.update(self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``search`` with ``self.indices`` as first
        argument."""
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        """Call the client's ``search_shards`` with ``self.indices`` as first
        argument."""
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``search_template`` with ``self.indices`` as
        first argument."""
        return self.client.search_template(self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``explain`` with ``self.indices`` as first
        argument."""
        return self.client.explain(self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        """Call the client's ``delete`` with ``self.indices`` as first
        argument."""
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``count`` with ``self.indices`` as first
        argument."""
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``delete_by_query`` with ``self.indices`` as
        first argument."""
        return self.client.delete_by_query(self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        """Call the client's ``suggest``; ``self.indices`` is passed second,
        after ``body``."""
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        """Call the client's ``percolate`` with ``self.indices`` as first
        argument."""
        return self.client.percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        """Call the client's ``count_percolate`` with ``self.indices`` as
        first argument."""
        return self.client.count_percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``mlt`` with ``self.indices`` as first
        argument."""
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        """Call the client's ``termvector`` with ``self.indices`` as first
        argument."""
        return self.client.termvector(self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``mtermvectors`` with ``self.indices`` as first
        argument."""
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        """Call the client's ``benchmark`` with ``self.indices`` as first
        argument."""
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        """Forward to the client's ``abort_benchmark``; no index is passed."""
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        """Call the client's ``list_benchmarks`` with ``self.indices`` as
        first argument."""
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
# NOTE(review): this excerpt starts mid-script. ``sentences``,
# ``paragraph_tokenized``, ``content_tokenized``, ``news_page``,
# ``title_tokenized``, ``des_tokenized``, ``news_link`` and ``es`` are all
# defined outside the visible code, and the original indentation was lost.
# The nesting shown is the most plausible reading - confirm against the
# original file.
for sentence in sentences:
    ### Tokenize sentence in paragraph
    sentence = underthesea.word_tokenize(sentence, format="text")
    ### Lower case
    sentence = sentence.lower()
    # Sentences are appended back to back, with no separator added between
    # them.
    paragraph_tokenized = paragraph_tokenized + sentence
# Remove every newline character from the assembled paragraph.
paragraph_tokenized = paragraph_tokenized.replace("\n", "")
# Record the paragraph as a "text" entry of the article content.
content_tokenized.append({
    "type": "text",
    "content": paragraph_tokenized
})
### Convert and push the data to Elasticsearch
es_push_body = {
    "Trang": news_page,
    "Title": title_tokenized,
    "NoiDung": content_tokenized,
    "Description": des_tokenized,
    "NewspaperLink": news_link,
}
# No id is passed to the index call.
es.index(index="my-index", body=es_push_body)
### Count the total number of records currently on ES
es_check_body = {"query": {"match_all": {}}}
result_check = es.search(index="my-index", body=es_check_body)
# Prints the ``hits.total.value`` entry of the search response.
print(result_check["hits"]["total"]["value"])