def handle(self, *args, **options):
    es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])
    fop = open('spider/management/commands/' + str(argv[2]), 'r')
    inds = IndicesClient(es)
    mapping = {
        "mappings": {
            "product_type": {
                "properties": {
                    "code": {"type": "string"},
                    "name": {"type": "string"},
                    "img": {"type": "string"},
                    "url": {"type": "string"},
                    "price_reg": {"type": "float"},
                    "price_discount": {"type": "float"},
                }
            }
        }
    }
    if not inds.exists(index='gearbest_index'):
        inds.create(index='gearbest_index', body=mapping)
        print 'gearbest_index created'
    for jsonline in fop:
        jobj = loads(jsonline)
        del jobj["_type"]
        es.index(index="gearbest_index", doc_type='product_type', body=jobj, id=jobj['code'])
        disc = 0
        reg = 0
        if len(jobj['price_discount']) > 0:
            disc = float(jobj['price_discount'][0])
        if len(jobj['price_reg']) > 0:
            reg = float(jobj['price_reg'][0])
        #insert = "INSERT into 'price_gb' ('price','price_disc','code','date') values (" + str(reg) + ", " + str(disc) + ", '" + str(jobj['code']) + "', '" + str(datetime.today()) + "')"
        #cursor = connection.cursor()
        #cursor.execute(insert)
        add_price = Price_gb(price=reg, price_disc=disc, code=str(jobj['code']), date=datetime.date.today())
        add_price.save()
        print 'code=' + str(jobj['code'])
def setUp(self):
    """ Starts a new connector for every test """
    try:
        os.unlink("config.txt")
    except OSError:
        pass
    open("config.txt", "w").close()
    self.connector = Connector(
        address='%s:%s' % (mongo_host, self.primary_p),
        oplog_checkpoint='config.txt',
        target_url=elastic_pair,
        ns_set=['test.test'],
        u_key='_id',
        auth_key=None,
        doc_manager='mongo_connector/doc_managers/elastic_doc_manager.py',
        auto_commit_interval=0)
    # Clean out test databases
    try:
        self.elastic_doc._remove()
    except OperationFailed:
        try:
            # Create test.test index if necessary
            client = Elasticsearch(hosts=[elastic_pair])
            idx_client = IndicesClient(client)
            idx_client.create(index='test.test')
        except es_exceptions.TransportError:
            pass
    self.conn.test.test.drop()
    self.connector.start()
    assert_soon(lambda: len(self.connector.shard_set) > 0)
    assert_soon(lambda: sum(1 for _ in self.elastic_doc._search()) == 0)
def create_index_conf():
    indices_client = IndicesClient(models.client)
    index_name = 'conf'
    doc_type = index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
def create_index_survey():
    indices_client = IndicesClient(models.client)
    index_name = models.SurveyMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.ScentemotionMap)
    # add qstfld fields
    es_mapping = models.SurveyMap._meta.es_mapping
    for qst, mapping in survey.qst2fld.items():
        fields = mapping[0]
        field_type = mapping[1]
        if field_type == 'nested_qst_ans':
            for field in fields:
                if field not in es_mapping['properties']:
                    es_mapping['properties'][field] = {}
                    es_mapping['properties'][field]['type'] = 'nested'
                    es_mapping['properties'][field]['properties'] = {}
                    es_mapping['properties'][field]['properties']['question'] = {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}
                    es_mapping['properties'][field]['properties']['answer'] = {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}
                    #'type' : 'nested',
                    #'properties' : {
                    #    'question' : {'type' : 'text', 'fields' : {'keyword' : {'type' : 'keyword', 'ignore_above' : 256}}},
                    #    'answer' : {'type' : 'text', 'fields' : {'keyword' : {'type' : 'keyword', 'ignore_above' : 256}}},
                    #    }
                    #},
    indices_client.put_mapping(
        doc_type=models.SurveyMap._meta.es_type_name,
        #body=models.SurveyMap._meta.es_mapping,
        body=es_mapping,
        index=index_name
    )
def prepare_index(self, courses):
    """
    Not a test.
    This method is doing the heavy lifting for the tests in this class:
    - prepare the Elasticsearch index,
    - execute the query.
    """
    self.create_filter_pages()
    # Index these 4 courses in Elasticsearch
    indices_client = IndicesClient(client=ES_CLIENT)
    # Delete any existing indices so we get a clean slate
    indices_client.delete(index="_all")
    # Create an index we'll use to test the ES features
    indices_client.create(index="test_courses")
    indices_client.close(index="test_courses")
    indices_client.put_settings(body=ANALYSIS_SETTINGS, index="test_courses")
    indices_client.open(index="test_courses")
    # Use the default courses mapping from the Indexer
    indices_client.put_mapping(body=CoursesIndexer.mapping, doc_type="course", index="test_courses")
    # Add the sorting script
    ES_CLIENT.put_script(id="state", body=CoursesIndexer.scripts["state"])
    # Actually insert our courses in the index
    actions = [{
        "_id": course["id"],
        "_index": "test_courses",
        "_op_type": "create",
        "_type": "course",
        **course,
    } for course in courses]
    bulk(actions=actions, chunk_size=500, client=ES_CLIENT)
    indices_client.refresh()
def reindex(self):
    elastic_client = Elasticsearch([{"host": self.__host, "port": self.__port}])
    index_client = IndicesClient(elastic_client)

    # Create the new index with the necessary fields mapping
    # , master_timeout=10, timeout=10
    index_client.create(index=self.__target_index, body=self.__body)

    # Reindex data from the source index to the target index
    helpers.reindex(client=elastic_client, source_index=self.__source_index,
                    target_index=self.__target_index)

    # Create an alias for the target index
    alias = {'actions': []}
    # remove_action = {"remove": {"index": self.__source_index, "alias": self.__alias}}
    add_action = {"add": {"index": self.__target_index, "alias": self.__alias}}
    # alias['actions'].append(remove_action)
    alias['actions'].append(add_action)

    # Delete the source index
    index_client.delete(index=self.__source_index)
    index_client.update_aliases(body=alias)
def create_wikipedia_index(ic: IndicesClient) -> None:
    """
    Add an index to Elasticsearch called 'wikipedia'

    Parameters
    ----------
    ic : IndicesClient
        The client to control Elasticsearch index settings

    Returns
    -------
    None
    """
    request_body = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "my_stops"]
                    }
                },
                "filter": {
                    "my_stops": {
                        "type": "stop",
                        "stopwords_path": "stopwords.txt"
                    }
                }
            }
        }
    }
    ic.create(index="wikipedia", body=request_body)
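A quick way to sanity-check a custom analyzer like 'my_analyzer' above is the _analyze API; a minimal sketch, assuming the 'wikipedia' index has already been created by the function above and a pre-8.x elasticsearch-py client:

def check_wikipedia_analyzer(ic: IndicesClient) -> None:
    # Run sample text through the custom analyzer and print the resulting
    # tokens; which words get dropped depends on the contents of stopwords.txt.
    resp = ic.analyze(index="wikipedia",
                      body={"analyzer": "my_analyzer", "text": "The Quick Foxes"})
    print([t["token"] for t in resp["tokens"]])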
class TestSingleDocSigTerms(TestCase):
    def setUp(self):
        super(TestSingleDocSigTerms, self).setUp()
        self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port])
        self.ic = IndicesClient(self.es)
        self.index = 'single_doc_sigterms_test'
        self.doc_type = 'test-doc'
        self.field = 'text'
        if self.ic.exists(self.index):
            self.ic.delete(self.index)
        self.ic.create(self.index)
        self.es.create(self.index, self.doc_type, {self.field: 'foo ba knark foo knirk knark foo'}, id='doc_1')

    def test_tf_for_doc_id(self):
        sigterms = SingleDocSigTerms(self.es, self.index, self.doc_type, self.field, None)
        resp = dict(sigterms.tf_for_doc_id('doc_1'))
        self.assertEquals(4, len(resp))
        self.assertEquals(3, resp['foo'])
        self.assertEquals(2, resp['knark'])
        self.assertEquals(1, resp['ba'])
        self.assertEquals(1, resp['knirk'])
def perform_create_index(indexable, logger):
    """
    Create a new index in ElasticSearch from an indexable instance
    """
    indices_client = IndicesClient(client=ES_CLIENT)
    # Create a new index name, suffixing its name with a timestamp
    new_index = "{:s}_{:s}".format(
        indexable.index_name, timezone.now().strftime("%Y-%m-%d-%Hh%Mm%S.%fs"))
    # Create the new index
    logger.info('Creating a new Elasticsearch index "{:s}"...'.format(new_index))
    indices_client.create(index=new_index)
    # The index needs to be closed before we set an analyzer
    indices_client.close(index=new_index)
    indices_client.put_settings(body=ANALYSIS_SETTINGS, index=new_index)
    indices_client.open(index=new_index)
    indices_client.put_mapping(body=indexable.mapping, doc_type=indexable.document_type, index=new_index)
    # Populate the new index with data provided from our indexable class
    richie_bulk(indexable.get_es_documents(new_index))
    # Return the name of the index we just created in ElasticSearch
    return new_index
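Since perform_create_index builds a fresh timestamped index on each run, callers typically repoint a stable alias at the new index afterwards. A minimal sketch of that swap, assuming the alias carries the bare indexable.index_name (the alias handling itself is not part of the snippet above, and swap_alias is a hypothetical helper):

def swap_alias(indices_client, alias, new_index):
    # Hypothetical helper: atomically move `alias` to `new_index`.
    actions = []
    # Drop the alias from whichever indices currently hold it, if any
    if indices_client.exists_alias(name=alias):
        for old_index in indices_client.get_alias(name=alias):
            actions.append({"remove": {"index": old_index, "alias": alias}})
    actions.append({"add": {"index": new_index, "alias": alias}})
    # update_aliases applies all actions in a single atomic operation
    indices_client.update_aliases(body={"actions": actions})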
def perform_create_index(indexable, logger):
    """
    Create a new index in ElasticSearch from an indexable instance
    """
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    # Create a new index name, suffixing its name with a timestamp
    new_index = "{:s}_{:s}".format(
        indexable.index_name, timezone.now().strftime("%Y-%m-%d-%Hh%Mm%S.%fs"))
    # Create the new index
    logger.info('Creating a new Elasticsearch index "{:s}"...'.format(new_index))
    indices_client.create(index=new_index)
    indices_client.put_mapping(body=indexable.mapping, doc_type=indexable.document_type, index=new_index)
    # Populate the new index with data provided from our indexable class
    bulk(
        actions=indexable.get_data_for_es(new_index, "create"),
        chunk_size=settings.ES_CHUNK_SIZE,
        client=settings.ES_CLIENT,
        stats_only=True,
    )
    # Return the name of the index we just created in ElasticSearch
    return new_index
def execute_query(self, courses, querystring="", **extra):
    """
    Not a test.
    Prepare the ElasticSearch index and execute the query in it.
    """
    indices_client = IndicesClient(client=ES_CLIENT)
    # Delete any existing indices so we get a clean slate
    indices_client.delete(index="_all")
    # Create an index we'll use to test the ES features
    indices_client.create(index=COURSES_INDEX)
    # The index needs to be closed before we set an analyzer
    indices_client.close(index=COURSES_INDEX)
    indices_client.put_settings(body=ANALYSIS_SETTINGS, index=COURSES_INDEX)
    indices_client.open(index=COURSES_INDEX)
    # Use the default courses mapping from the Indexer
    indices_client.put_mapping(body=CoursesIndexer.mapping, doc_type="course", index=COURSES_INDEX)
    # Add the sorting scripts
    ES_CLIENT.put_script(id="score", body=CoursesIndexer.scripts["score"])
    ES_CLIENT.put_script(id="state_field", body=CoursesIndexer.scripts["state_field"])
    # Actually insert our courses in the index
    actions = [{
        "_id": course["id"],
        "_index": COURSES_INDEX,
        "_op_type": "create",
        "_type": "course",
        "absolute_url": {"en": "en/url", "fr": "fr/url"},
        "categories": ["1", "2", "3"],
        "cover_image": {"en": "en/image", "fr": "fr/image"},
        "is_meta": False,
        "logo": {"en": "/en/some/img.png", "fr": "/fr/some/img.png"},
        "nb_children": 0,
        "organizations": ["11", "12", "13"],
        **course,
    } for course in courses]
    bulk(actions=actions, chunk_size=500, client=ES_CLIENT)
    indices_client.refresh()

    results = self.client.get(f"/api/v1.0/courses/autocomplete/?{querystring:s}", **extra)
    self.assertEqual(results.status_code, 200)
    return json.loads(results.content)
def execute_query(self, querystring=""): """ Not a test. This method is doing the heavy lifting for the tests in this class: create and fill the index with our courses so we can run our queries and check our facet counts. It also executes the query and returns the result from the API. """ # Create the subject category page. This is necessary to link the subjects we # defined above with the "subjects" filter # As it is the only page we create, we expect it to have the path "0001" CategoryFactory(page_reverse_id="subjects", should_publish=True) # Index these 4 courses in Elasticsearch indices_client = IndicesClient(client=ES_CLIENT) # Delete any existing indices so we get a clean slate indices_client.delete(index="_all") # Create an index we'll use to test the ES features indices_client.create(index="test_courses") indices_client.close(index="test_courses") indices_client.put_settings(body=ANALYSIS_SETTINGS, index="test_courses") indices_client.open(index="test_courses") # Use the default courses mapping from the Indexer indices_client.put_mapping( body=CoursesIndexer.mapping, doc_type="course", index="test_courses" ) # Add the sorting script ES_CLIENT.put_script(id="state", body=CoursesIndexer.scripts["state"]) # Actually insert our courses in the index actions = [ { "_id": course["id"], "_index": "test_courses", "_op_type": "create", "_type": "course", "absolute_url": {"en": "url"}, "cover_image": {"en": "image"}, "title": {"en": "title"}, **course, "course_runs": [ { "languages": course_run["languages"], "start": arrow.utcnow().datetime, "end": arrow.utcnow().datetime, "enrollment_start": arrow.utcnow().datetime, "enrollment_end": arrow.utcnow().datetime, } for course_run in course["course_runs"] ], } for course in COURSES ] bulk(actions=actions, chunk_size=500, client=ES_CLIENT) indices_client.refresh() response = self.client.get(f"/api/v1.0/courses/?{querystring:s}") self.assertEqual(response.status_code, 200) return json.loads(response.content)
def create_index(es, index_name):
    es_indices = IndicesClient(es)
    # es_indices.create(index=index_name)
    with open('data/FifaRecords.mappings.json') as json_data:
        d = json.load(json_data)
    es_indices.create(index=index_name, body=d)
    print("Created ES index {}".format(index_name))
def create_index():
    indices_client = IndicesClient(client=settings.ES)
    index_name = Apartments._meta.es_index_name
    if not indices_client.exists(index_name):
        indices_client.create(index=index_name)
        indices_client.put_mapping(doc_type=Apartments._meta.es_type_name,
                                   body=Apartments._meta.es_mapping,
                                   index=index_name)
def recreate_index(self, index_name, index_mapping):
    indices_client = IndicesClient(client=ES_CLIENT)
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(doc_type='page', index=index_name, body=index_mapping)
def _create_main_index_if_not_exists(self):
    """
    method that creates a new elastic index if it does not exist
    :return:
    """
    ic = IndicesClient(self.es)
    if not ic.exists(MAIN_INDEX_NAME):
        ic.create(MAIN_INDEX_NAME)
def create_index_mi():
    indices_client = IndicesClient(models.client)
    index_name = models.PostMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(body=models.PostMap._meta.es_mapping, index=index_name)
def create_index_pi():
    # indices_client = IndicesClient(client=settings.ES_HOSTS)
    indices_client = IndicesClient(models.client)
    index_name = models.Review._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(body=models.Review._meta.es_mapping, index=index_name)
def create_index_bestmatch():
    indices_client = IndicesClient(models.client)
    index_name = models.bestmatchMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.bestmatchMap)
    indices_client.put_mapping(body=models.bestmatchMap._meta.es_mapping, index=index_name)
def recreate_index(self):
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = Student._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(doc_type=Student._meta.es_type_name,
                               body=Student._meta.es_mapping,
                               index=index_name)
def create_index_si_sites():
    indices_client = IndicesClient(models.client)
    index_name = models.PageMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(doc_type=models.PageMap._meta.es_type_name,
                               body=models.PageMap._meta.es_mapping,
                               index=index_name)
def create_index_mi_feedly():
    indices_client = IndicesClient(models.client)
    index_name = models.FeedlyMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.FeedlyMap)
    indices_client.put_mapping(doc_type=models.FeedlyMap._meta.es_type_name,
                               body=models.FeedlyMap._meta.es_mapping,
                               index=index_name)
def initialize(self, idx):
    es_index, es_doctype = self.indexinfo(idx)
    self.logger.info("Initializing %s" % es_index)
    idx_client = IndicesClient(self.es)
    if idx_client.exists(es_index):
        idx_client.delete(es_index)
    idx_client.create(es_index)
    if idx == 'event':
        idx_client.put_mapping(doc_type=es_doctype, index=[es_index], body=event_mapping())
    self.logger.info("%s ready." % es_index)
def create_index_dhk():
    indices_client = IndicesClient(models.client)
    index_name = 'recipes'
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(
        # ES 7.0 does not support types anymore
        doc_type=index_name,
        body={'properties': wb_excel.recipes},
        index=index_name)
def create_index_survey():
    indices_client = IndicesClient(models.client)
    index_name = models.SurveyMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.ScentemotionMap)
    indices_client.put_mapping(doc_type=models.SurveyMap._meta.es_type_name,
                               body=models.SurveyMap._meta.es_mapping,
                               index=index_name)
def create_es_index(index_name):
    es = Elasticsearch()
    client = IndicesClient(es)
    # take this opportunity to create the training index if it doesn't exist
    if not client.exists('appcompat-training'):
        client.create(index='appcompat-training', body=CONFIG)
    if client.exists(index_name):
        raise Exception('Index already exists: {}'.format(index_name))
    client.create(index=index_name, body=CONFIG)
def recreate_index(self):
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name, body=es_ind_settings)
    for model_name in es_models:
        indices_client.put_mapping(
            doc_type=model_es_indices[model_name]['type'],
            body=es_mappings[model_name],
            index=es_index_name)
def __createIndex(self):
    es = Elasticsearch([{'host': self.elasticsearch_host, 'port': self.elasticsearch_port}])
    ic = IndicesClient(es)
    if ic.exists(index='wow'):
        print("deleting old index")
        self.deleteIndex()
    ic.create(index='wow')
    # blah = glob.glob(os.path.join(self.map_directory, '*'))
    for currentFile in glob.glob(os.path.join(self.map_directory, '*')):
        print("MAP FILE: " + currentFile)
        self.__mapFile(currentFile)
def recreateIndex(self):
    """function to recreate the index in the elasticsearch"""
    print("deleting the previous index and creating the new one...")
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = Product._meta.es_index_name
    type_type = Product._meta.es_type_name
    if indices_client.exists(index=index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index_name)
    indices_client.put_mapping(doc_type=Product._meta.es_type_name,
                               body=Product._meta.es_mapping,
                               index=index_name)
def create_parcel_mapping():
    idx_client = IndicesClient(es)
    if not idx_client.exists(index=parcel_index):
        idx_client.create(index=parcel_index)
        with open(r'osc\util\mappings\parcel.json') as mapping_file:
            mapping = json.load(mapping_file)
        idx_client.put_mapping(doc_type=parcel_mapping, index=[parcel_index], body=mapping)
def execute_query(self, kind, querystring=""):
    """
    Not a test.
    This method is doing the heavy lifting for the tests in this class: create and
    fill the index with our categories so we can run our queries and check the
    results. It also executes the query and returns the result from the API.
    """
    # Index these categories in Elasticsearch
    indices_client = IndicesClient(client=ES_CLIENT)
    # Delete any existing indexes so we get a clean slate
    indices_client.delete(index="_all")
    # Create an index we'll use to test the ES features
    indices_client.create(index="test_categories")
    indices_client.close(index="test_categories")
    indices_client.put_settings(body=ANALYSIS_SETTINGS, index="test_categories")
    indices_client.open(index="test_categories")
    # Use the default categories mapping from the Indexer
    indices_client.put_mapping(body=CategoriesIndexer.mapping, doc_type="category", index="test_categories")
    # Actually insert our categories in the index
    actions = [{
        "_id": category["id"],
        "_index": "test_categories",
        "_op_type": "create",
        "_type": "category",
        "absolute_url": {"en": "en/url"},
        "description": {"en": "en/description"},
        "icon": {"en": "en/icon"},
        "is_meta": False,
        "logo": {"en": "en/logo"},
        "nb_children": 0,
        "path": category["id"],
        **category,
    } for category in CATEGORIES]
    bulk(actions=actions, chunk_size=500, client=ES_CLIENT)
    indices_client.refresh()

    response = self.client.get(f"/api/v1.0/{kind:s}/?{querystring:s}")
    self.assertEqual(response.status_code, 200)
    return json.loads(response.content)
def recreate_index(self):
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = self.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name, body=self.es_ind_settings)
    ## create mapping for one model only for now
    model_name = 'place'
    indices_client.put_mapping(
        doc_type=model_es_indices[model_name]['type'],
        body=es_mappings[model_name],
        index=index_name)
def setUp(self):
    """Empty ElasticSearch at the start of every test
    """
    try:
        self.elastic_doc._remove()
    except OperationFailed:
        try:
            # Create test.test index if necessary
            client = Elasticsearch(hosts=['localhost:9200'])
            idx_client = IndicesClient(client)
            idx_client.create(index='test.test')
        except es_exceptions.TransportError:
            pass
def initialize(self, conf, context):
    host = conf.get('zeit.recommend.elasticsearch.host', 'localhost')
    port = conf.get('zeit.recommend.elasticsearch.port', 9200)
    self.es = Elasticsearch(hosts=[{'host': host, 'port': port}])
    self.match = re.compile('seite-[0-9]|komplettansicht').match
    self.index = '%s-%s' % date.today().isocalendar()[:2]
    ic = IndicesClient(self.es)
    try:
        if not ic.exists(self.index):
            ic.create(self.index)
    except ConnectionError, e:
        log('[UserIndexBolt] ConnectionError, index unreachable: %s' % e)
        return
def _create_weight_index(es, index):
    """
    Creates the index with the right mapping if it doesn't exist.

    :param es:
    :type es: elasticsearch.Elasticsearch
    :param index:
    :type index: str|unicode
    """
    ic = IndicesClient(es)
    if ic.exists(index):
        logging.info('Index %s already exists ...' % index)
    else:
        ic.create(index=index, body=ES_TERMWEIGHTING_INDEX_SETTINGS)
def create_index(name):
    es = get_es()
    ic = IndicesClient(es)
    body = {}
    # body.update(settings.INDEX_SETTINGS)
    body.update(settings.INDEX_MAPPINGS)
    resp = ic.create(name, json.dumps(body))
    logger.debug('index create: ' + str(resp))
def import_ontology(ontology: lib.obo.Ontology, index_name: str):
    es = elasticsearch.Elasticsearch()
    ies = IndicesClient(es)
    actions = [dict(
        _index=index_name,
        _type=index_name,
        _source=dict(
            id=item.id,
            names=item.names()
        )
    ) for item in ontology.items()]
    if ies.exists(index_name):
        ies.delete(index_name)
    ies.create(index_name)
    return bulk(es, actions=actions)
def create_index_if_not_exists(self):
    """ Check if index exists & if not exists create index & types & store their mappings. """
    ic = IndicesClient(self.es)
    response = ic.exists(index=[self.index_name])
    if not response:
        es_mappings = ElasticSearchController.get_index_mapper_dict()
        index_response = ic.create(index=self.index_name,
                                   body={"mappings": es_mappings})
def _init_mapping(self, mapping_path):
    esi = IndicesClient(es.get_es_handle())
    index = settings.ES_INDEX

    # first create the index if it does not exist
    if not esi.exists(index):
        self.stdout.write("Creating index for db : %s" % index)
        esi.create(index=index)
        self.stdout.write("Index Created for : %s" % index)

    if not mapping_path or not os.path.exists(mapping_path):
        raise CommandError("not existing mapping path")

    mapping_str = open(mapping_path, "r").read()
    mappings = json.loads(mapping_str)

    for k, v in mappings.iteritems():
        res = esi.put_mapping(index, k, {k: mappings[k]})
        self.stdout.write(str(res))
def setup(forced):
    properties = {}
    properties["fail_symptom"] = {"type": "string", "index": "not_analyzed"}
    properties["ats_log"] = {"type": "string"}
    properties["file_path"] = {"type": "string", "analyzer": "path-analyzer"}
    add_unique_mapping(properties, "Test Start Time",
                       {"VALUE": {"type": "date", "format": "yyyy/MM/dd HH:mm:ssZ||yyyy/MM/ddZ"}})
    add_unique_mapping(properties, "Test end Time",
                       {"VALUE": {"type": "date", "format": "yyyy/MM/dd HH:mm:ssZ||yyyy/MM/ddZ"}})

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}], max_retries=10, retry_on_timeout=True)
    idx_client = IndicesClient(es)
    if idx_client.exists(index=PROJECT):
        if forced:
            idx_client.delete(index=PROJECT)
        else:
            print "Index already exists!"
            return

    runin_csv_status = {"runin_csv_status": {"path_match": "RunInLog.*.STATUS",
                                             "mapping": {"index": "not_analyzed"}}}
    runin_csv_value = {"runin_csv_value": {"path_match": "RunInLog.*.VALUE",
                                           "mapping": {"index": "not_analyzed",
                                                       "fields": {"double": {"type": "double"}}}}}
    runin_csv_u_limit = {"runin_csv_u_limit": {"path_match": "RunInLog.*.U_LIMIT",
                                               "mapping": {"index": "not_analyzed",
                                                           "fields": {"double": {"type": "double"}}}}}
    runin_csv_l_limit = {"runin_csv_l_limit": {"path_match": "RunInLog.*.L_LIMIT",
                                               "mapping": {"index": "not_analyzed",
                                                           "fields": {"double": {"type": "double"}}}}}
    runin_csv_test_time = {"runin_csv_test_time": {"path_match": "RunInLog.*.TEST_TIME",
                                                   "mapping": {"index": "not_analyzed",
                                                               "fields": {"double": {"type": "double"}}}}}
    csv_status = {"csv_status": {"path_match": "*.STATUS", "mapping": {"index": "not_analyzed"}}}
    csv_value = {"csv_value": {"path_match": "*.VALUE",
                               "mapping": {"index": "not_analyzed",
                                           "fields": {"double": {"type": "double"}}}}}
    csv_u_limit = {"csv_u_limit": {"path_match": "*.U_LIMIT",
                                   "mapping": {"index": "not_analyzed",
                                               "fields": {"double": {"type": "double"}}}}}
    csv_l_limit = {"csv_l_limit": {"path_match": "*.L_LIMIT",
                                   "mapping": {"index": "not_analyzed",
                                               "fields": {"double": {"type": "double"}}}}}
    csv_test_time = {"csv_test_time": {"path_match": "*.TEST_TIME",
                                       "mapping": {"index": "not_analyzed",
                                                   "fields": {"double": {"type": "double"}}}}}
    dynamic_templates = [runin_csv_status, runin_csv_value, runin_csv_u_limit,
                         runin_csv_l_limit, runin_csv_test_time,
                         csv_status, csv_value, csv_u_limit, csv_l_limit, csv_test_time]

    analysis = {}
    analysis["analyzer"] = {}
    analysis["tokenizer"] = {}
    analysis["analyzer"]["path-analyzer"] = {"type": "custom", "tokenizer": "path-tokenizer"}
    analysis["tokenizer"]["path-tokenizer"] = {"type": "path_hierarchy"}

    mappings = {"dynamic_templates": dynamic_templates, "properties": properties}
    data = {"settings": {"index.mapping.ignore_malformed": True,
                         "number_of_replicas": 1,
                         "analysis": analysis},
            "mappings": {STAGE: mappings}}
    print json.dumps(data)
    idx_client.create(index=PROJECT, body=data)
class RedisEsSetupMixin(object):

    def setUp(self):
        self.settings = TEST_SETTINGS_OBJECT
        self.es = get_es(self.settings)
        self.esi = IndicesClient(self.es)
        self.index = self.settings.get("ES_INDEX")

        # create the index first
        if self.esi.exists(self.index):
            self.esi.delete(index=self.index)
        self.esi.create(index=self.index)

        mapping_path = os.path.join(SCRAPY_ROOT, "resources/mappings.json")
        mapping_str = open(mapping_path, "r").read()
        mappings = json.loads(mapping_str)

        for k, v in mappings.iteritems():
            res = self.esi.put_mapping(self.index, k, {k: mappings[k]})
            #print res

        self.redis_conn = get_redis(self.settings)

    def tearDown(self):
        if self.esi.exists(self.index):
            self.esi.delete(index=self.index)
            print "ES INDEX DELETED"
        # remove redis stuff
        self.redis_conn.flushdb()
        print "REDIS DB DELETED"
class pdfGraph():
    """Create and manage the PDF graph in Neo4j and index in Elasticsearch"""

    db_path = "http://localhost:7474/db/data/"
    db = None
    pdf_documents = None
    authors = None
    keywords = None
    es_cluster = [{'host': 'localhost', 'port': 9200}]
    es = None
    es_ixc = None

    def __init__(self):
        """
        setup Neo4j database connection and node labels and
        Elasticsearch mapping attachments index
        """
        self.db = GraphDatabase(self.db_path)
        self.pdf_documents = self.db.labels.create("PDFDocument")
        self.authors = self.db.labels.create("Author")
        self.keywords = self.db.labels.create("Keyword")
        self.es = Elasticsearch(self.es_cluster)
        self.es_ixc = IndicesClient(self.es)
        self.es_ixc.create(
            index="pdf_documents",
            body={
                'mappings': {
                    'pdf': {
                        'properties': {
                            'url': {'type': "string"},
                            'pdf_file': {'type': "attachment"}
                        }
                    }
                }
            }
        )

    def createNodesAndIx(self, doc_url, doc_info, doc_metadata, doc_data):
        """Given document details create nodes and relationships for documents,
        authors and keywords and store the related documents for indexing and search"""
        # not all pdf docs have all fields so we need to check for existence
        check_for = lambda n, d: d[n] if (n in d) else ''

        author = check_for('Author', doc_info[0])
        # create an author node if one doesn't already exist
        if author != '':
            author_node = self.authorExists(author)
            if author_node is None:
                author_node = self.createAuthor(author)

        # create keyword nodes if they don't already exist
        if check_for('pdf', doc_metadata) != '':
            keywords = check_for('Keywords', doc_metadata['pdf'])
        else:
            keywords = ''
        if keywords != '':
            keyword_nodes = []
            for keyword in map(lambda x: x.strip(" '\""), keywords.split(",")):
                keyword_node = self.keywordExists(keyword)
                if keyword_node is None:
                    keyword_node = self.createKeyword(keyword)
                keyword_nodes.append(keyword_node)

        # create the document node
        pdf_node = self.db.nodes.create(
            url=doc_url,
            info=repr(doc_info),
            metadata=repr(doc_metadata),
            title=check_for('Title', doc_info[0])
        )
        self.pdf_documents.add(pdf_node)

        # create relationships b/w document, author and keywords
        if author != '':
            pdf_node.relationships.create("AUTHORED_BY", author_node)
        if keywords != '':
            for keyword_node in keyword_nodes:
                pdf_node.relationships.create("HAS_KEYWORD", keyword_node)

        # add the document for full-text search to ES using Neo4j id
        self.es.create(
            index="pdf_documents",
            doc_type="pdf",
            id=pdf_node.id,
            body={
                'url': doc_url,
                'pdf_file': base64.b64encode(doc_data.getvalue())
            }
        )

    def authorExists(self, author):
        """Check for an existing author node"""
        r = self.db.query(
            'match (a:Author) where a.name = "' + author + '" return a',
            returns=(client.Node)
        )
        return r[0][0] if (len(r) > 0) else None

    def createAuthor(self, author):
        """Create an author node"""
        an_author = self.db.nodes.create(name=author)
        self.authors.add(an_author)
        return an_author

    def keywordExists(self, keyword):
        """Check for an existing keyword node"""
        r = self.db.query(
            'match (k:Keyword) where k.name = "' + keyword + '" return k',
            returns=(client.Node)
        )
        return r[0][0] if (len(r) > 0) else None

    def createKeyword(self, keyword):
        """Create a keyword node"""
        a_keyword = self.db.nodes.create(name=keyword)
        self.keywords.add(a_keyword)
        return a_keyword
def create_index_mappings(es_client, ea_index, recreate=False, old_ea_index=None):
    esversion = es_client.info()["version"]["number"]
    print("Elastic Version: " + esversion)

    es_index_mappings = read_es_index_mappings() if is_atleastsix(esversion) else read_es_index_mappings(5)

    es_index = IndicesClient(es_client)
    if not recreate:
        if es_index.exists(ea_index):
            print('Index ' + ea_index + ' already exists. Skipping index creation.')
            return None

    # (Re-)Create indices.
    if is_atleastsix(esversion):
        index_names = (
            ea_index,
            ea_index + '_status',
            ea_index + '_silence',
            ea_index + '_error',
            ea_index + '_past',
        )
    else:
        index_names = (
            ea_index,
        )
    for index_name in index_names:
        if es_index.exists(index_name):
            print('Deleting index ' + index_name + '.')
            try:
                es_index.delete(index_name)
            except NotFoundError:
                # Why does this ever occur?? It shouldn't. But it does.
                pass
        es_index.create(index_name)

    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)

    if is_atleastseven(esversion):
        # TODO remove doc_type completely when the elasticsearch client allows doc_type=None
        # doc_type is a deprecated feature and will be completely removed in Elasticsearch 8
        es_client.indices.put_mapping(index=ea_index, doc_type='_doc',
                                      body=es_index_mappings['elastalert'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_status', doc_type='_doc',
                                      body=es_index_mappings['elastalert_status'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_silence', doc_type='_doc',
                                      body=es_index_mappings['silence'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_error', doc_type='_doc',
                                      body=es_index_mappings['elastalert_error'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_past', doc_type='_doc',
                                      body=es_index_mappings['past_elastalert'], include_type_name=True)
    elif is_atleastsixtwo(esversion):
        es_client.indices.put_mapping(index=ea_index, doc_type='_doc',
                                      body=es_index_mappings['elastalert'])
        es_client.indices.put_mapping(index=ea_index + '_status', doc_type='_doc',
                                      body=es_index_mappings['elastalert_status'])
        es_client.indices.put_mapping(index=ea_index + '_silence', doc_type='_doc',
                                      body=es_index_mappings['silence'])
        es_client.indices.put_mapping(index=ea_index + '_error', doc_type='_doc',
                                      body=es_index_mappings['elastalert_error'])
        es_client.indices.put_mapping(index=ea_index + '_past', doc_type='_doc',
                                      body=es_index_mappings['past_elastalert'])
    elif is_atleastsix(esversion):
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert',
                                      body=es_index_mappings['elastalert'])
        es_client.indices.put_mapping(index=ea_index + '_status', doc_type='elastalert_status',
                                      body=es_index_mappings['elastalert_status'])
        es_client.indices.put_mapping(index=ea_index + '_silence', doc_type='silence',
                                      body=es_index_mappings['silence'])
        es_client.indices.put_mapping(index=ea_index + '_error', doc_type='elastalert_error',
                                      body=es_index_mappings['elastalert_error'])
        es_client.indices.put_mapping(index=ea_index + '_past', doc_type='past_elastalert',
                                      body=es_index_mappings['past_elastalert'])
    else:
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert',
                                      body=es_index_mappings['elastalert'])
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert_status',
                                      body=es_index_mappings['elastalert_status'])
        es_client.indices.put_mapping(index=ea_index, doc_type='silence',
                                      body=es_index_mappings['silence'])
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert_error',
                                      body=es_index_mappings['elastalert_error'])
        es_client.indices.put_mapping(index=ea_index, doc_type='past_elastalert',
                                      body=es_index_mappings['past_elastalert'])

    print('New index %s created' % ea_index)
    if old_ea_index:
        print("Copying all data from old index '{0}' to new index '{1}'".format(old_ea_index, ea_index))
        # Use the defaults for chunk_size, scroll, scan_kwargs, and bulk_kwargs
        elasticsearch.helpers.reindex(es_client, old_ea_index, ea_index)

    print('Done!')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--src-host", action="store", default="127.0.0.1", type=unicode,
                        help="Source host [default: %(default)s]")
    parser.add_argument("--src-port", action="store", default=9200,
                        help="Source port [default: %(default)s]")
    parser.add_argument("--src-index", action="store", default="", type=unicode,
                        help="Source index")
    parser.add_argument("--src-batch-size", action="store", type=int, default=5000,
                        help="Source query batchsize [default: %(default)s]")
    parser.add_argument("--src-scroll-interval", action="store", type=unicode, default="60m",
                        help="Interval for source scroll query [default: %(default)s]")
    parser.add_argument("--dest-host", action="store", default="127.0.0.1", type=unicode,
                        help="Destination host [default: %(default)s]")
    parser.add_argument("--dest-port", action="store", default=9200,
                        help="Destination port [default: %(default)s]")
    parser.add_argument("--dest-index", action="store", default="", type=unicode,
                        help="Destination index")
    parser.add_argument("--dest-batch-size", action="store", type=int, default=5000,
                        help="Destination batchsize [default: %(default)s]")
    parser.add_argument("--dest-alias", action="store",
                        help="Destination index alias (to be set after we have finished populating)")
    parser.add_argument("--dest-concurrency", action="store", type=int, default=4,
                        help="Destination batchsize [default: %(default)s]")
    parser.add_argument("--dest-delete-index", action="store_true",
                        help="Delete destination index before starting")
    parser.add_argument("--query", action="store", type=unicode, default="",
                        help="Query to use [if None is specified, a match_all will be used]")
    args = parser.parse_args()

    if args.src_index is None or len(args.src_index) == 0:
        raise Exception("--src-index must be specified!")
    if args.dest_index is None or len(args.dest_index) == 0:
        raise Exception("--dest-index must be specified!")

    dt_start = datetime.now()

    # copy mapping
    src_es_instance = get_elasticsearch(args.src_host, args.src_port)
    dest_es_instance = get_elasticsearch(args.dest_host, args.dest_port)

    # check if src_index exists
    src_es_ic = IndicesClient(src_es_instance)
    if not src_es_ic.exists(args.src_index):
        raise Exception("--src-index %s does not exist!" % args.src_index)
    # check if dest_index exists
    dest_es_ic = IndicesClient(dest_es_instance)
    if dest_es_ic.exists(args.dest_index):
        if args.dest_delete_index:
            dest_es_ic.delete(index=args.dest_index)
        else:
            raise Exception("--dest-index %s already exists! Use --dest-delete-index "
                            "if you want to drop it" % args.dest_index)

    log.info("Copying mapping...")
    # copy mapping over to dest
    src_index_information = src_es_ic.get(index=args.src_index)
    dest_es_ic.create(index=args.dest_index, body=src_index_information.get(args.src_index, {}))
    # set num_of_replicas to 0
    dest_es_ic.put_settings(index=args.dest_index,
                            body={"settings": {"index": {"number_of_replicas": 0}}})

    # perform multiprocessing
    log.info("Copying data...")
    MAGIC_STRING = "%s:%s" % (str(uuid4()), str(uuid4()))
    DEST_QUEUE = Queue()
    DEST_COUNTER = Value('i', 0)
    src_process = Process(target=src_worker, args=(args, DEST_QUEUE, MAGIC_STRING))
    src_process.start()
    dest_processes = [Process(target=dest_worker, args=(args, DEST_QUEUE, MAGIC_STRING, DEST_COUNTER))
                      for i in xrange(args.dest_concurrency)]
    for i in dest_processes:
        i.start()
    src_process.join()
    for i in dest_processes:
        i.join()
    log.info("[dest_worker] Total processed %s" % DEST_COUNTER.value)

    if args.dest_alias is not None and len(args.dest_alias) > 0:
        # we remove all existing mappings to this alias, then add it to the current dest_index
        for idx_name, aliases_mapping in dest_es_ic.get_aliases().iteritems():
            if args.dest_alias in aliases_mapping.get("aliases", {}):
                dest_es_ic.delete_alias(index=idx_name, name=args.dest_alias)
        dest_es_ic.put_alias(index=args.dest_index, name=args.dest_alias)

    dest_es_ic.refresh(args.dest_index)

    dt_end = datetime.now()
    log.info("Time elapsed: %s" % (dt_end - dt_start, ))
def add_index(self, indexJSON, index, doc_type, alias=None):
    ic = IndicesClient(self.es)
    response = ic.create(index=index, body=json.load(open(indexJSON)))
    if alias:
        ic.put_alias(index=index, name=alias)
    print indexJSON, response
def handle(self, *args, **options):
    Student.objects.all().delete()
    University.objects.all().delete()
    Course.objects.all().delete()

    start = time.time()

    # database part
    # make some Universities
    university_names = ('MIT', 'MGU', 'CalTech', 'KPI', 'DPI', 'PSTU')
    universities = []
    for name in university_names:
        uni = mommy.make(University, name=name)
        universities.append(uni)

    # make some courses
    template_options = ['CS%s0%s', 'MATH%s0%s', 'CHEM%s0%s', 'PHYS%s0%s']
    courses = []
    for num in range(1, 4):
        for course_num in range(1, 4):
            for template in template_options:
                name = template % (course_num, num)
                course = mommy.make(Course, name=name)
                courses.append(course)

    students = []
    for _ in xrange(options.get('count')[0]):
        stud = mommy.prepare(
            Student,
            university=random.choice(universities),
            first_name=names.get_first_name(),
            last_name=names.get_last_name(),
            age=random.randint(17, 25)
        )
        students.append(stud)
    Student.objects.bulk_create(students)

    ThroughModel = Student.courses.through
    stud_courses = []
    for student_id in Student.objects.values_list('pk', flat=True):
        courses_already_linked = []
        for _ in range(random.randint(1, 10)):
            index = random.randint(0, len(courses) - 1)
            if index not in courses_already_linked:
                courses_already_linked.append(index)
            else:
                continue
            stud_courses.append(
                ThroughModel(
                    student_id=student_id,
                    course_id=courses[index].pk
                )
            )
    ThroughModel.objects.bulk_create(stud_courses)

    # recreate index
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    if indices_client.exists('django'):
        indices_client.delete(index='django')
    indices_client.create(index='django')
    indices_client.put_mapping(
        doc_type='student',
        body=Student._meta.es_mapping,
        index='django'
    )

    # update part
    put_all_to_index(Student)

    finish = time.time() - start
    print '%s items %s seconds' % (options.get('count')[0], finish)
import json
import re

import object_storage
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient

es = Elasticsearch("http://elasticm1:9200")
ic = IndicesClient(es)

mapping = '{"mappings": {"comment": {"properties": {"created_utc": {"type": "date", "format": "epoch_second"}, "body": {"type": "string", "analyzer": "english"}}}}}'
ic.create(index="2007", body=mapping)

sl_storage = object_storage.get_client("username", "key", datacenter="sjc01")


def iterload(src):
    buffer = ""
    dec = json.JSONDecoder()
    for chunk in src:
        buffer = buffer + chunk
        while True:
            try:
                r = dec.raw_decode(buffer)
            except:
                break
            yield r[0]
            buffer = buffer[r[1]:].strip(" \n\r\t")


chunk_size = 512 * 1024
def schema_setup(es, project, forced, logger=None):
    properties = {}
    properties["fail_symptom"] = {"type": "string", "index": "not_analyzed"}
    properties["ats_log"] = {"type": "string"}
    properties["file_path"] = {"type": "string", "analyzer": "path-analyzer"}
    add_unique_mapping(
        properties, "Test Start Time",
        {"VALUE": {"type": "date", "format": "yyyy/MM/dd HH:mm:ssZ||yyyy/MM/ddZ"}}
    )
    add_unique_mapping(
        properties, "Test end Time",
        {"VALUE": {"type": "date", "format": "yyyy/MM/dd HH:mm:ssZ||yyyy/MM/ddZ"}}
    )

    idx_client = IndicesClient(es)
    if idx_client.exists(index=project):
        if forced:
            idx_client.delete(index=project)
        else:
            print "Index already exists!"
            return

    runin_csv_status = {"runin_csv_status": {"path_match": "RunInLog.*.STATUS",
                                             "mapping": {"index": "not_analyzed"}}}
    runin_csv_value = {
        "runin_csv_value": {
            "path_match": "RunInLog.*.VALUE",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    runin_csv_u_limit = {
        "runin_csv_u_limit": {
            "path_match": "RunInLog.*.U_LIMIT",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    runin_csv_l_limit = {
        "runin_csv_l_limit": {
            "path_match": "RunInLog.*.L_LIMIT",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    runin_csv_test_time = {
        "runin_csv_test_time": {
            "path_match": "RunInLog.*.TEST_TIME",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    csv_status = {"csv_status": {"path_match": "*.STATUS", "mapping": {"index": "not_analyzed"}}}
    csv_value = {
        "csv_value": {
            "path_match": "*.VALUE",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    csv_u_limit = {
        "csv_u_limit": {
            "path_match": "*.U_LIMIT",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    csv_l_limit = {
        "csv_l_limit": {
            "path_match": "*.L_LIMIT",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    csv_test_time = {
        "csv_test_time": {
            "path_match": "*.TEST_TIME",
            "mapping": {"index": "not_analyzed", "fields": {"double": {"type": "double"}}},
        }
    }
    dynamic_templates = [
        runin_csv_status, runin_csv_value, runin_csv_u_limit,
        runin_csv_l_limit, runin_csv_test_time,
        csv_status, csv_value, csv_u_limit, csv_l_limit, csv_test_time,
    ]

    analysis = {}
    analysis["analyzer"] = {}
    analysis["tokenizer"] = {}
    analysis["analyzer"]["path-analyzer"] = {"type": "custom", "tokenizer": "path-tokenizer"}
    analysis["tokenizer"]["path-tokenizer"] = {"type": "path_hierarchy"}

    mappings = {"dynamic_templates": dynamic_templates, "properties": properties}
    data = {
        "settings": {"index.mapping.ignore_malformed": True, "number_of_replicas": 1, "analysis": analysis},
        "mappings": {ES_DOC_TYPE: mappings},
    }
    if logger is None:
        print json.dumps(data)
    else:
        logger.info("Schema: %s" % json.dumps(data))
    idx_client.create(index=project, body=data)
class IndicesManager(object):

    def __init__(self, options=None):
        self.options = options or {}
        self.es = get_elasticsearch(self.options)
        self.esc = IndicesClient(self.es)
        self.conf_dir = sys.path[0]

    def __create__(self, name, config=None, type=None):
        result = None
        try:
            if not config:
                file_name = "{}/config/{}_index.json".format(self.conf_dir, type)
                with open(file_name) as fp:
                    config = fp.read()
            # create the index with version number
            result = self.esc.create(index=name, body=config)
        except es_exceptions.TransportError:
            print("unable to connect to Elasticsearch")
        return result

    def create(self, doc_type):
        alias_name = 'frisc_{}'.format(doc_type)
        index_name = '{}_v1'.format(alias_name)
        try:
            if self.esc.exists_alias(alias_name):
                print('Index {} already exists, updating'.format(alias_name))
                self.update(doc_type)
                return
            self.__create__(index_name, type=doc_type)
            # set an alias to the index
            self.esc.put_alias(index=index_name, name=alias_name)
        except es_exceptions.TransportError:
            print("unable to connect to Elasticsearch")

    def update(self, doc_type):
        alias_name = 'frisc_{}'.format(doc_type)
        index_name = '{}_v1'.format(alias_name)
        try:
            if not self.esc.exists_alias(alias_name):
                self.create(doc_type)
                return
            version_number = 0
            old_index_name = ''
            old_indexes = self.esc.get_alias(name=alias_name)
            for index in old_indexes.keys():
                match = re.search(r'^({})_v(\d+)$'.format(alias_name), index)
                if match:
                    version = int(match.group(2))
                    if version > version_number:
                        version_number = version
                        old_index_name = match.group(0)
            version_number += 1
            index_name = '{}_v{}'.format(alias_name, version_number)
            if self.esc.exists(index_name):
                # raise something
                raise
            self.__create__(index_name, type=doc_type)
            reindex(self.es, old_index_name, index_name)
            self.esc.update_aliases(
                body={'actions': [
                    {'remove': {'alias': alias_name, 'index': old_index_name}},
                    {'add': {'alias': alias_name, 'index': index_name}}
                ]}
            )
        except es_exceptions.TransportError:
            print("unable to connect to Elasticsearch")
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch_dsl import Mapping, String, Search

es = Elasticsearch()
ies = IndicesClient(es)

ies.delete('test')
ies.create('test')
ies.close('test')
ies.put_settings(index='test', body={
    "analysis": {
        "analyzer": {
            "default": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["standard", "lowercase", "stop", "kstem"]
            }
        }
    }
})

m = Mapping('test')
m.field('f', String())
m.save(index='test', using=es)

ies.open(index='test')
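A possible follow-up to the snippet above, assuming the 'test' index has been reopened: index a document and query the 'f' field through elasticsearch_dsl's Search. With kstem and lowercase in the default analyzer chain, a query for 'run' should match 'Running'.

# Sketch: exercise the default analyzer configured above; assumes the
# 'test' index and mapping from the previous snippet are in place.
es.index(index='test', doc_type='test', id=1, body={'f': 'Running Foxes'})
ies.refresh(index='test')  # make the document visible to search

s = Search(using=es, index='test').query('match', f='run')
for hit in s.execute():
    print(hit.f)  # expected to print 'Running Foxes'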
def main(index_num):
    n_out = int(10e6)
    n_batch = int(4e3)
    n_batches = n_out // n_batch

    index = 'image_hashes_%02d' % index_num
    client = Elasticsearch('localhost:9200')
    index_client = IndicesClient(client)
    if index_client.exists(index):
        print('Not deleting %s!' % index)
        return
        sys.exit(1)
        index_client.delete(index)

    es_short = {
        'type': 'short',
    }
    field_name = lambda i: '%x' % i
    fields = {field_name(i): es_short for i in range(n_samples)}
    fields['raw'] = {
        'type': 'string',
        'store': True,
        'index': 'not_analyzed',
        'doc_values': True
    }
    index_client.create(index=index, body={
        'settings': {
            'number_of_shards': 4,
            'number_of_replicas': 0
        },
        'mappings': {
            'images': {
                '_source': {'enabled': False},
                'properties': fields
            }
        }
    })

    sampler, pow2 = get_sampler(n_samples, b_p_sample)
    start_time = time.time()
    for i_batch in range(1, n_batches + 1):
        data = np.random.randn(n_batch, dim_in)
        hash = (data.dot(proj) > 0).astype(np.uint64)
        hash_int = hash.dot(2**np.arange(dim_out).astype(np.uint64))
        #print('\n'.join(repr(i.astype(np.uint8)) for i in hash)); return
        sampled = np.vstack(
            hash.dot(sampler[:, :, j]).dot(pow2) for j in range(n_samples)
        ).astype(np.int16).T.tolist()
        #print(repr(sampled)); print(repr([len(sampled), len(sampled[0])])); return

        docs = []
        for i in range(n_batch):
            doc = {field_name(j): sampled[i][j] for j in range(n_samples)}
            doc['raw'] = '{0:064b}'.format(hash_int[i])
            doc_id = random.getrandbits(63)
            docs.append('{"index": {"_index": "%s", "_type": "images", "_id": "%d"}}' % (index, doc_id))
            docs.append(json.dumps(doc))
        #print(json.dumps(json.loads(docs[1]), indent=4)); return

        try:
            response = client.bulk(body='\n'.join(docs))
        except:
            # Even when an exception is thrown, typically the documents were stored in ES
            sleep_seconds = 10
            print('\rHTTP timed out, sleeping %d seconds...' % sleep_seconds)
            time.sleep(sleep_seconds)
        print('\rChunk %5d/%d, %5.2f%%' % (i_batch, n_batches, i_batch * 100.0 / n_batches), end='')

    index_time = time.time()
    print('\nCalling optimize, indexing took %.1f s...' % (index_time - start_time))
    sys.stdout.flush()
    index_client.optimize(index=index, max_num_segments=3, request_timeout=1e6)
    print('Optimization done in %.1f s' % (time.time() - index_time))
class ESIndexManager(object):

    def __init__(self, es_config=None):
        if not es_config:
            es_config = SMConfig.get_conf()['elasticsearch']
        self._es = init_es_conn(es_config)
        self._ind_client = IndicesClient(self._es)

    def internal_index_name(self, alias):
        yin, yang = '{}-yin'.format(alias), '{}-yang'.format(alias)
        assert not (self.exists_index(yin) and self.exists_index(yang)), \
            'Only one of {} and {} should exist'.format(yin, yang)
        if self.exists_index(yin):
            return yin
        elif self.exists_index(yang):
            return yang
        else:
            return yin

    def create_index(self, index):
        dynamic_templates = [{
            "strings": {
                "match_mapping_type": "string",
                "mapping": {
                    "type": "keyword",
                    "normalizer": "default"}}
        }]
        body = {
            "settings": {
                "index": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0,
                    "max_result_window": 2147483647,
                    "analysis": {
                        "normalizer": {
                            "default": {
                                "type": "custom",
                                "filter": ["lowercase", "asciifolding"]
                            }
                        }
                    }
                }
            },
            "mappings": {
                "dataset": {
                    "dynamic_templates": dynamic_templates,
                    "properties": {
                        "ds_id": {"type": "keyword"}
                    }
                },
                "annotation": {
                    "dynamic_templates": dynamic_templates,
                    "properties": {
                        "ds_id": {"type": "keyword"},
                        "chaos": {"type": "float"},
                        "image_corr": {"type": "float"},
                        "pattern_match": {"type": "float"},
                        "total_iso_ints": {"type": "float"},
                        "min_iso_ints": {"type": "float"},
                        "max_iso_ints": {"type": "float"},
                        "msm": {"type": "float"},
                        "fdr": {"type": "float"}
                    }
                }
            }
        }

        if not self._ind_client.exists(index):
            out = self._ind_client.create(index=index, body=body)
            logger.info('Index {} created\n{}'.format(index, out))
        else:
            logger.info('Index {} already exists'.format(index))

    def delete_index(self, index):
        if self._ind_client.exists(index):
            out = self._ind_client.delete(index)
            logger.info('Index {} deleted: {}'.format(index, out))

    def exists_index(self, index):
        return self._ind_client.exists(index)

    def another_index_name(self, index):
        assert index.endswith('yin') or index.endswith('yang')
        if index.endswith('yin'):
            return index.replace('yin', 'yang')
        else:
            return index.replace('yang', 'yin')

    def remap_alias(self, new_index, alias='sm'):
        old_index = self.another_index_name(new_index)
        logger.info('Remapping {} alias: {} -> {}'.format(alias, old_index, new_index))

        self._ind_client.update_aliases({
            "actions": [{"add": {"index": new_index, "alias": alias}}]
        })
        if self._ind_client.exists_alias(old_index, alias):
            self._ind_client.update_aliases({
                "actions": [{"remove": {"index": old_index, "alias": alias}}]
            })
            out = self._ind_client.delete(index=old_index)
            logger.info('Index {} deleted: {}'.format(old_index, out))
def criar_indice(nome_indice):
    config_settings = """
    {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer": {
                        "tokenizer": "standard",
                        "filter": ["standard", "pt_BR", "lowercase", "portuguese_stop", "asciifolding"]
                    }
                },
                "filter": {
                    "my_stemmer": {
                        "type": "stemmer",
                        "name": "brazilian"
                    },
                    "portuguese_stop": {
                        "type": "stop",
                        "stopwords": "_brazilian_"
                    },
                    "pt_BR": {
                        "type": "hunspell",
                        "language": "pt_BR"
                    }
                }
            }
        }
    }
    """
    config_mapping = """
    {
        "radar": {
            "_all": {"enabled": true, "analyzer": "my_analyzer"},
            "properties": {
                "casa_legilativa_local": {"type": "string"},
                "casa_legislativa_esfera": {"type": "string"},
                "casa_legislativa_id": {"type": "long"},
                "casa_legislativa_nome": {"type": "string"},
                "casa_legislativa_nome_curto": {"type": "string"},
                "proposicao_ano": {"type": "string"},
                "proposicao_data_apresentacao": {"type": "date", "format": "dateOptionalTime"},
                "proposicao_descricao": {"type": "string"},
                "proposicao_ementa": {"type": "string", "analyzer": "my_analyzer"},
                "proposicao_id": {"type": "long"},
                "proposicao_id_prop": {"type": "string"},
                "proposicao_indexacao": {"type": "string", "analyzer": "my_analyzer"},
                "proposicao_numero": {"type": "string"},
                "proposicao_sigla": {"type": "string"},
                "proposicao_situacao": {"type": "string"},
                "votacao_data": {"type": "date", "format": "dateOptionalTime"},
                "votacao_descricao": {"type": "string", "analyzer": "my_analyzer"},
                "votacao_id": {"type": "long"},
                "votacao_id_vot": {"type": "string"},
                "votacao_resultado": {"type": "string"}
            }
        }
    }
    """
    es = conectar_em_elastic_search()
    client_indice = IndicesClient(es)
    client_indice.create(index=nome_indice, body=config_settings)
    client_indice.put_mapping(index=nome_indice, doc_type="radar", body=config_mapping)
def main(argv):
    index = 'user_topics'
    client = Elasticsearch('localhost:9200')
    index_client = IndicesClient(client)
    if index_client.exists(index):
        index_client.delete(index)
    index_client.create(index=index, body={
        'settings': {
            'number_of_shards': 4,
            'number_of_replicas': 0
        },
        'mappings': {
            'user': {
                'properties': {
                    #'id': {
                    #    'type': 'long',
                    #    'doc_values': True
                    #},
                    'topics': {
                        'type': 'integer',
                        'doc_values': True
                    },
                    'n_topics': {
                        'type': 'integer',
                        'doc_values': True
                    }
                }
            }
        }
    })

    n_users = int(argv[1])
    n_topics = int(argv[2]) * 0.15
    n_topics_per_user = int(argv[3]) * 4.2
    docs_per_chunk = int(2e4)
    n_chunks = int(ceil(n_users / docs_per_chunk))

    start_time = time.time()
    for i_chunk in range(1, n_chunks + 1):
        docs = []
        for i in range(docs_per_chunk):
            n_user_topics = rand(n_topics_per_user)[0]
            topics = list(set(rand(n_topics, n_user_topics)))
            doc_id = str(random.getrandbits(63))
            docs.append('{"index": {"_index": "%s", "_type": "user", "_id": "%s"}}' % (index, doc_id))
            docs.append(json.dumps({
                #'id': doc_id,
                'topics': topics,
                'n_topics': len(topics)
            }))
        #print(json.dumps(json.loads(docs[1]), indent=4)); return

        try:
            response = client.bulk(body='\n'.join(docs))
        except:
            # Even when an exception is thrown, typically the documents were stored in ES
            sleep_seconds = 10
            print('\rHTTP timed out, sleeping %d seconds...' % sleep_seconds)
            time.sleep(sleep_seconds)
        print('\rChunk %5d/%d, %5.2f%%' % (i_chunk, n_chunks, i_chunk * 100.0 / n_chunks), end='')

    index_time = time.time()
    print('\nCalling optimize, indexing took %.1f s...' % (index_time - start_time))
    sys.stdout.flush()
    index_client.optimize(index=index, max_num_segments=3, request_timeout=1e6)
    print('Optimization done in %.1f s' % (time.time() - index_time))
            'store': 'yes',
            'index': 'not_analyzed'
        },
        'title': {'type': 'string'},
        'body': {'type': 'string'},
        'teaser': {'type': 'string'},
        'timestamp': {'type': 'date'}
    },
    '_id': {'path': 'path'}
    }
}

ic = IndicesClient(es)
if not ic.exists(index):
    ic.create(index)
if not ic.exists_type(index=index, doc_type='item'):
    ic.put_mapping(
        index=index,
        ignore_conflicts=True,
        doc_type='item',
        body=body
    )

while 1:
    try:
        main()
    except KeyboardInterrupt:
        raise SystemExit(0)
    except:
class ESExporter:
    def __init__(self, sm_config):
        self.es = Elasticsearch(hosts=[{"host": sm_config['elasticsearch']['host']}])
        self.ind_client = IndicesClient(self.es)

    def _index(self, annotations):
        to_index = []
        for r in annotations:
            d = dict(zip(COLUMNS, r))
            d['comp_names'] = u'|'.join(d['comp_names']).replace(u'"', u'')
            d['comp_ids'] = u'|'.join(d['comp_ids'])
            d['mz'] = '{:010.4f}'.format(d['mz']) if d['mz'] else ''

            to_index.append({
                '_index': 'sm',
                '_type': 'annotation',
                '_id': '{}_{}_{}_{}'.format(d['ds_name'], d['db_name'], d['sf'], d['adduct']),
                '_source': d
            })

        bulk(self.es, actions=to_index, timeout='60s')

    def _delete(self, annotations):
        to_delete = []
        for r in annotations:
            d = dict(zip(COLUMNS, r))
            to_delete.append({
                '_op_type': 'delete',
                '_index': 'sm',
                '_type': 'annotation',
                '_id': '{}_{}_{}_{}'.format(d['ds_name'], d['db_name'], d['sf'], d['adduct']),
            })
        try:
            bulk(self.es, to_delete)
        except BulkIndexError as e:
            logger.warn('{} - {}'.format(e.args[0], e.args[1][1]))

    def index_ds(self, db, ds_name, db_name):
        annotations = db.select(RESULTS_TABLE_SQL, ds_name, db_name)

        logger.info('Deleting documents from the index: {}-{}'.format(ds_name, db_name))
        self._delete(annotations)

        logger.info('Indexing documents: {}-{}'.format(ds_name, db_name))
        self._index(annotations)

    def create_index(self, name='sm'):
        body = {
            'settings': {
                "index": {
                    'max_result_window': 2147483647,
                    "analysis": {
                        "analyzer": {
                            "analyzer_keyword": {
                                "tokenizer": "keyword",
                                "filter": "lowercase"
                            }
                        }
                    }
                }
            },
            'mappings': {
                "annotation": {
                    "properties": {
                        "db_name": {"type": "string", "index": "not_analyzed"},
                        "ds_name": {"type": "string", "index": "not_analyzed"},
                        "sf": {"type": "string", "index": "not_analyzed"},
                        "comp_names": {
                            "type": "string",
                            "analyzer": "analyzer_keyword",
                        },
                        "comp_ids": {"type": "string", "index": "not_analyzed"},
                        "chaos": {"type": "float", "index": "not_analyzed"},
                        "image_corr": {"type": "float", "index": "not_analyzed"},
                        "pattern_match": {"type": "float", "index": "not_analyzed"},
                        "msm": {"type": "float", "index": "not_analyzed"},
                        "adduct": {"type": "string", "index": "not_analyzed"},
                        "fdr": {"type": "float", "index": "not_analyzed"},
                        "mz": {"type": "string", "index": "not_analyzed"}
                    }
                }
            }
        }
        if not self.ind_client.exists(name):
            out = self.ind_client.create(index=name, body=body)
            logger.info('Index {} created\n{}'.format(name, out))
        else:
            logger.info('Index {} already exists'.format(name))

    def delete_index(self, name='sm'):
        out = self.ind_client.delete(name)
        logger.info('Index {} deleted\n{}'.format(name, out))