def init():
    """Rebuild the "zhihu" index from scratch and load all answers into it.

    Drops any existing index, recreates it with an explicit mapping for the
    "answer" doc type, indexes every item produced by Data().getData(),
    refreshes the index so the documents are immediately searchable, then
    redirects to the listing page.
    """
    conn = ES('127.0.0.1:9200')
    # Best-effort delete: the index may not exist on a fresh server.  Catch
    # Exception rather than using a bare except clause so that
    # KeyboardInterrupt/SystemExit are not swallowed.
    try:
        conn.delete_index("zhihu")
    except Exception:
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {
            'store': 'yes',
            'type': u'integer'
        },
        u'link': {
            'store': 'yes',
            'type': u'string'
        },
        u'title': {
            'boost': 1.0,
            'index': 'analyzed',
            'store': 'yes',
            'type': u'string'
        },
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
def setUp(self):
    """Prepare an ElasticSearch-backed test run.

    Reads the ES host/port from the test input params, optionally resets
    the "default" index, then links the clusters and starts replication.
    """
    # Pre-declare attributes so they exist even if the superclass setUp
    # below fails partway through.
    self.es_host = None
    self.es_cluster_name = None
    self._state = []
    super(ElasticSearchSupport, self).setUp()
    self.es_host = self.input.param("es_host", "127.0.0.1")
    self.es_port = self.input.param("es_port", 9091)
    # NOTE(review): the connection hard-codes port 9200 and ignores
    # self.es_port (default 9091) -- confirm which port is intended.
    conn = ES(self.es_host + ":9200")
    # Recreate the index only on the first case, or when cleanup was not
    # skipped (skip_cleanup defaults to True here).
    if not self.input.param("skip_cleanup", True) or self.case_number == 1:
        conn.delete_index_if_exists("default")
        conn.create_index("default")
        self.log.warning("waiting for ES index to be ready to use")
        # Empirical delay: give ES time to allocate shards before use.
        time.sleep(30)
    self._link_es_cluster()
    self._start_es_replication()
    self.log.warning("after setUp es")
class DatabaseWrapper(NonrelDatabaseWrapper):
    """Django-nonrel database wrapper backed by an ElasticSearch server.

    The pyes connection is opened lazily: only the first request for a
    cursor (or the raw connection) actually connects.
    """

    def __init__(self, *args, **kwds):
        super(DatabaseWrapper, self).__init__(*args, **kwds)
        self.features = DatabaseFeatures(self)
        self.ops = DatabaseOperations(self)
        self.client = DatabaseClient(self)
        self.creation = DatabaseCreation(self)
        self.validation = DatabaseValidation(self)
        self.introspection = DatabaseIntrospection(self)
        # Connection is established on first use, not at construction.
        self._is_connected = False

    def _cursor(self):
        self._ensure_is_connected()
        return self._connection

    @property
    def db_connection(self):
        """Raw pyes connection, opening it on first access."""
        self._ensure_is_connected()
        return self._db_connection

    def _ensure_is_connected(self):
        """Open the ES connection once; subsequent calls are no-ops.

        Raises ImproperlyConfigured if the PORT setting is not an integer.
        """
        if self._is_connected:
            return
        try:
            port = int(self.settings_dict["PORT"])
        except ValueError:
            raise ImproperlyConfigured("PORT must be an integer")
        self.db_name = self.settings_dict["NAME"]
        self._connection = ES(
            "%s:%s" % (self.settings_dict["HOST"], port),
            decoder=Decoder,
            encoder=Encoder,
            autorefresh=True,
            default_indices=[self.db_name],
        )
        self._db_connection = self._connection
        # Auto index creation (check if to remove): best effort, since the
        # index usually already exists.  'except Exception' instead of a
        # bare except so system-exiting exceptions still propagate.
        try:
            self._connection.create_index(self.db_name)
        except Exception:
            pass
        # We're done!
        self._is_connected = True
def index(fname, index_name, keys_to_tag): fptr = open(fname, 'rb') line_count = 0 conn = ES(["localhost:9200"]) if not conn.exists_index(index_name): conn.create_index(index_name) start = time.clock() numb_exceptions = 0 for line in fptr: if ((line_count % 10000) == 0): end = time.clock() minutes = (end - start) / 60.0 print 'File: %s Done with %d took %f min. ' %(fname, line_count, minutes) print 'number of exceptions ', numb_exceptions line_count += 1 data = json.loads(line) if not data.get('tags'): continue post_id = int(data['post_id']) found_content = False for k in keys_to_tag: if data.get(k): found_content = True if not found_content: continue index_data = dict() for k in keys_to_tag: value = data.get(k) if (value and (k == 'content')): try: stripped_value = utils.strip_tags(value) except Exception: stripped_value = value index_data[k] = stripped_value if post_id and data: try: conn.index(index_data, index_name, "test-type", post_id) except Exception: numb_exceptions += 1 continue print 'number of exceptions ', numb_exceptions
class DatabaseWrapper(NonrelDatabaseWrapper):
    """Django-nonrel backend wrapper around a pyes ElasticSearch connection.

    Connects lazily on first cursor/connection access.
    """

    def _cursor(self):
        self._ensure_is_connected()
        return self._connection

    def __init__(self, *args, **kwds):
        super(DatabaseWrapper, self).__init__(*args, **kwds)
        self.features = DatabaseFeatures(self)
        self.ops = DatabaseOperations(self)
        self.client = DatabaseClient(self)
        self.creation = DatabaseCreation(self)
        self.validation = DatabaseValidation(self)
        self.introspection = DatabaseIntrospection(self)
        # Deferred: the ES connection is opened on first use.
        self._is_connected = False

    @property
    def db_connection(self):
        """Raw pyes connection; opens it on first access."""
        self._ensure_is_connected()
        return self._db_connection

    def _ensure_is_connected(self):
        """Open the ES connection on the first call; later calls no-op.

        Raises ImproperlyConfigured when the PORT setting is not an int.
        """
        if not self._is_connected:
            try:
                port = int(self.settings_dict['PORT'])
            except ValueError:
                raise ImproperlyConfigured("PORT must be an integer")
            self.db_name = self.settings_dict['NAME']
            self._connection = ES('%s:%s' % (self.settings_dict['HOST'], port),
                                  decoder=Decoder,
                                  encoder=Encoder,
                                  autorefresh=True,
                                  default_indexes=[self.db_name])
            self._db_connection = self._connection
            # auto index creation: check if to remove.  Best effort -- the
            # index normally already exists.  Catch Exception rather than a
            # bare except so KeyboardInterrupt/SystemExit escape.
            try:
                self._connection.create_index(self.db_name)
            except Exception:
                pass
            # We're done!
            self._is_connected = True
def init():
    """Recreate the "zhihu" index, load every answer from Data(), refresh
    the index, and redirect to the listing page."""
    conn = ES('127.0.0.1:9200')
    # The index may not exist yet on a fresh server; deletion is best
    # effort.  Catch Exception instead of a bare except so that
    # KeyboardInterrupt/SystemExit are not swallowed.
    try:
        conn.delete_index("zhihu")
    except Exception:
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {'store': 'yes', 'type': u'integer'},
        u'link': {'store': 'yes', 'type': u'string'},
        u'title': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string'},
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
def ext_process(listname, hostname, url, filepath, msg):
    """Here's where you put your code to deal with the just archived message.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message and
    the message object.

    These can be replaced or augmented as needed.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime

    # CHANGE this settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500']  # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        # Probe for the index; on the very first archived message it does
        # not exist yet, so create it and install the mapping.  Catch
        # Exception (not a bare except) so system-exiting exceptions
        # propagate out of the probe.
        try:
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except Exception:
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = {u'text': {'boost': 1.0,
                                  'index': 'analyzed',
                                  'store': 'yes',
                                  'type': u'string',
                                  "term_vector" : "with_positions_offsets"},
                        u'url': {'boost': 1.0,
                                 'index': 'not_analyzed',
                                 'store': 'yes',
                                 'type': u'string',
                                 "term_vector" : "no"},
                        u'title': {'boost': 1.0,
                                   'index': 'analyzed',
                                   'store': 'yes',
                                   'type': u'string',
                                   "term_vector" : "with_positions_offsets"},
                        u'date': {'store': 'yes',
                                  'type': u'date'}}
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)

        data = dict(url=url,
                    title=msg.get('subject'),
                    date=date,
                    text=str(msg))
        iconn.index(data, _indexname, _doctype)

        syslog('debug', 'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog('error', 'Cluster in revocery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog('error', 'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except Exception:
        # Last-resort handler: archive processing must never crash Mailman;
        # log the traceback instead.  'except Exception' replaces the
        # original bare except.
        import traceback
        syslog('error', 'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
               listname, hostname, url, filepath, msg, repr(traceback.format_exc()))

    return
def ext_process(listname, hostname, url, filepath, msg):
    """Here's where you put your code to deal with the just archived message.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message and
    the message object.

    These can be replaced or augmented as needed.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime

    # CHANGE this settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500']  # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        # Probe for the index; create it (with the mapping) the first time
        # around.  Catch Exception rather than a bare except so that
        # KeyboardInterrupt/SystemExit are not swallowed by the probe.
        try:
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except Exception:
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = {
                u'text': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'url': {
                    'store': 'true',
                    'type': u'keyword'
                },
                u'title': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'date': {
                    'store': 'true',
                    'type': u'date'
                }
            }
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)

        data = dict(url=url, title=msg.get('subject'), date=date, text=str(msg))
        iconn.index(data, _indexname, _doctype)

        syslog('debug', 'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog(
            'error',
            'Cluster in revocery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog(
            'error',
            'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except Exception:
        # Final catch-all: never let archiving crash the caller; log the
        # traceback instead.  'except Exception' replaces the bare except.
        import traceback
        syslog(
            'error',
            'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
            listname, hostname, url, filepath, msg, repr(traceback.format_exc()))
    return
dataset = shelve.open("samples.shelve") mapping = { u'description': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'name': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'age': {'store': 'yes', 'type': u'integer'}, } conn.create_index("test-index") conn.put_mapping("test-type", {'properties':mapping}, ["test-index"]) start = datetime.now() for k, userdata in dataset.items(): # conn.index(userdata, "test-index", "test-type", k) conn.index(userdata, "test-index", "test-type", k, bulk=True) conn.force_bulk() end = datetime.now() print "time:", end-start dataset.close()
class ProcessSpiderData(Task):
    """Celery task: post-process one spider's scraped data into a fresh
    ElasticSearch index, swap the alias to it, and store a compressed
    backup of the processed items."""

    def run(self, spider_name):
        """Process the named spider's source items end to end.

        Builds a new timestamped index, bulk-indexes every processed item,
        re-points the spider's alias, deletes the old indices, updates the
        City table, and saves a bz2/hex backup.  Returns True.
        """
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        # JVM bridge used for article text extraction (via jpype below).
        java = JavaInterface()
        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source
        if spider and len(source):
            backup_created_date = spider.created_date

            # Timestamped name so the alias can be swapped atomically later.
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            # NOTE(review): 'mapping' is a module-level name not visible in
            # this chunk -- confirm where it is defined.
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )
            # Compressed + hex-encoded so it fits in a text column.
            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))
            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        """Replace raw HTML content with extracted article sentences."""
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        """Best-effort geocoding: attach a lat/lon 'pin' for item['city'].

        Geocoding failures are deliberately swallowed (bare except) so one
        bad city name cannot abort the whole task.
        """
        if not item.has_key('city'):
            return item
        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)
            if places:
                # geocode may return a single result or a list of them.
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {
                        'location': {
                            'lat': lat,
                            'lon': lon
                        }
                    }
        except:
            pass
        return item

    def _create_index(self, index, item):
        """Queue the item for bulk indexing unless it already exists.

        The id key is removed from the document body and used as the ES
        document id instead.
        """
        id = item['id']
        del item['id']
        try:
            # Raises ElasticSearchException when the doc is absent -- that
            # is the path on which we actually index.
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        """Sync the City table with the set of cities seen in this run.

        Cities no longer referenced by this spider are deleted (if this
        spider was their only index) or unlinked (if shared).
        """
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) == 1
        ]
        cities_old_multi = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) > 1
        ]

        for city in cities:
            # Normalize: trim/lowercase, collapse whitespace, strip braces
            # and disallowed characters (module-level compiled regexes).
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)
            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name = city_clean[:255])
            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))
            city.save()

            # Still referenced: do not clean it up below.
            if city.name in cities_old_single:
                cities_old_single.remove(city.name)
            if city.name in cities_old_multi:
                cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
dataset = shelve.open("samples.shelve") mapping = { u'description': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'name': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'age': {'store': 'yes', 'type': u'integer'}, } conn.create_index("test-index") conn.put_mapping("test-type", {'properties':mapping}, ["test-index"]) start = datetime.now() for k, userdata in dataset.items(): # conn.index(userdata, "test-index", "test-type", k) conn.index(userdata, "test-index", "test-type", k, bulk=True) conn.force_bulk() end = datetime.now() print "time:", end - start dataset.close()
class ESIndexerBase(object):
    """Base class for building and querying the gene ElasticSearch index.

    Wraps a pyes connection bound to ES_INDEX_NAME and provides index
    lifecycle helpers plus a simple string query.
    """
    # Bound from module-level constants of the same name.
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        # Batch size used by callers when iterating documents.
        self.step = 10000

    def create_index(self):
        """Open the index, creating it first if it does not exist yet."""
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
        # index_type = self.ES_INDEX_TYPE
        # Check if index_type exists before prompting for deletion.
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        # Interactive confirmation guard -- deleting a mapping drops all
        # its documents.
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''add a doc to the index. If id is not None,
           the existing doc will be updated.
        '''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''delete a doc from the index based on passed id.'''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        """Merge index segments, waiting for the merge to finish."""
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        """Reload the dataload module and return its current field mapping."""
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        """Index every doc in the {doc_id: doc} dict doc_d.

        Optionally refreshes the type mapping first; the mapping is forced
        to update when none exists yet.  Returns -1 if the index is missing.
        """
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE
        # Test if index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1
        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            # if no existing mapping available for index_type
            # force update_mapping to True
            empty_mapping = True
            update_mapping = True
        # empty_mapping = not cur_mapping[index_name].get(index_type, {})
        # if empty_mapping:
        #     #if no existing mapping available for index_type
        #     #force update_mapping to True
        #     update_mapping = True
        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                # An old mapping exists: drop it before installing the new one.
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type, _mapping, [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        # Flush + refresh so the freshly indexed docs become searchable.
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        """Run a string query against the index, returning the pyes result."""
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
class ElasticCatalog(object):
    """ElasticSearch-backed catalog over a zelastic storage container.

    Mirrors the indexed fields of stored documents into one ES index
    (one doc type per container) and exposes search/facet helpers.
    """
    # Always-present field: the storage key of the document, kept
    # unanalyzed so it can be matched exactly.
    default_indexes = {
        'zelastic_doc_id': {
            'type': 'string',
            'index': 'not_analyzed'
        }
    }

    def __init__(self, connection_string, elastic_name, storage, bulk=False,
                 bulk_size=400):
        # bulk/bulk_size control client-side buffering of index/delete ops.
        self.conn = ES(connection_string, bulk_size=bulk_size)
        self.bulk_size = bulk_size
        self.name = elastic_name
        self.storage = storage
        self.bulk = bulk

    def update_mapping(self, name):
        """(Re)put the ES mapping for container 'name' from storage meta.

        Translates the storage index type names ('str', 'full', 'bool',
        'int', 'datetime'/'date', 'float') into ES field definitions;
        unknown types are skipped.
        """
        meta = self.storage.meta(name)
        indexes = meta['indexes']
        properties = self.default_indexes.copy()
        # Index creation is idempotent here: already-exists is fine.
        try:
            self.conn.create_index(self.name)
        except IndexAlreadyExistsException:
            pass
        for index_name, _type in indexes.items():
            index = None
            if _type == 'str':
                # Exact-match string.
                index = {
                    'type': 'string',
                    'index': 'not_analyzed',
                }
            elif _type == 'full':
                # Full-text searchable string.
                index = {
                    'type': 'string',
                    'index': 'analyzed',
                }
            elif _type == 'bool':
                index = {
                    'type': 'boolean'
                }
            elif _type == 'int':
                index = {
                    'type': 'integer',
                }
            elif _type in ('datetime', 'date'):
                index = {
                    'type': 'date',
                }
            elif _type == 'float':
                index = {
                    'type': 'float',
                }
            if index is not None:
                properties[index_name] = index
        self.conn.indices.put_mapping(
            doc_type=name,
            mapping={
                'ignore_conflicts': True,
                'properties': properties
            },
            indices=[self.name])

    def id(self, container_name, key):
        """ES document id: container name and storage key joined by '-'."""
        return '%s-%s' % (container_name, key)

    def index(self, container_name, doc, key):
        """Index the searchable fields of doc under its container doc type."""
        # need to add data to the index that isn't actually persisted
        data = {
            'zelastic_doc_id': key
        }
        meta = self.storage.meta(container_name)
        indexes = meta['indexes']
        # Copy only the declared-indexed fields present in the document.
        for index in indexes.keys():
            if index in doc:
                data[index] = doc[index]
        self.conn.index(
            data,
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete(self, container_name, key):
        """Remove one document from the catalog."""
        self.conn.delete(
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete_all(self, container_name):
        """Drop the whole doc type (and thus all docs) for a container."""
        self.conn.delete_mapping(
            self.name,
            container_name)

    def search(self, container_name, query, **kwargs):
        """Search within one container's doc type; returns pyes results."""
        return self.conn.search(
            query,
            indexes=[self.name],
            doc_types=[container_name],
            **kwargs)

    def getFacets(self, container_name, field, size=100):
        """Return term facets (value counts) for 'field', ordered by term."""
        return self.conn.search_raw({
            "facets": {
                field: {
                    "terms": {
                        "all_terms": True,
                        "field": field,
                        "size": size,
                        "order": "term"
                    }
                }
            }
        }, indexes=[self.name], doc_type=container_name)
from __future__ import unicode_literals

from pyes import ES

if __name__ == "__main__":
    # Make sure both search indices exist; creation is skipped for any
    # index that is already present on the server.
    conn = ES(["localhost:9200"])
    for index_name in ("content_index", "title_index"):
        if not conn.exists_index(index_name):
            conn.create_index(index_name)