import re
from fileinput import input  # `input` here is assumed to be fileinput.input (line iteration)
from time import sleep

from pyes import ES
# NoServerAvailable is raised by pyes when no node responds; its module path
# varies slightly between pyes versions.
from pyes.exceptions import NoServerAvailable


class Indexer:
    def __init__(self, es_host, batch_mode=True, batch_size=100):
        self.client = ES(es_host)
        self.batch_mode = batch_mode
        self.client.bulk_size = int(batch_size)

    def bulk_index(self, index, type, shapefile, sleep_time=0.1):
        print 'Indexing [%s] docs into [%s] from %s' % (type, index, shapefile)

        index_count = 0
        id_re = re.compile(r'^.*?"id"\s*:\s*"([^"]+)"')
        parens_re = re.compile(r'\(.*?\)')

        for line in input(shapefile):
            id = id_re.match(line).group(1)

            # strip any parenthetical fragments from the line
            line = parens_re.sub('', line).strip()

            # re-encode latin-1 input as utf-8
            line = line.decode('latin-1').encode('utf-8')
            id = id.decode('latin-1').encode('utf-8')

            try:
                self.client.index(line, index, type, id, bulk=self.batch_mode)
            except UnicodeDecodeError as e:
                print "Error processing line with id %s: %s" % (id, e.message)
            except NoServerAvailable as e:
                print "The server failed to respond while indexing %s: [%s]. Sleeping 5 seconds and retrying..." % (id, e.message)
                sleep(5)
                try:
                    print "Retrying indexing of %s" % id
                    self.client.index(line, index, type, id, bulk=self.batch_mode)
                except NoServerAvailable:
                    print "Retry failed as well. Skipping document %s" % id
            except Exception as e:
                print "Unexpected error while indexing %s: %s" % (id, e)

            index_count += 1
            if index_count % int(self.client.bulk_size) == 0:
                print 'Queued batch of %d docs, last id %s' % (self.client.bulk_size, id)
                sleep(sleep_time)

        # flush any remaining queued bulk entries
        self.client.force_bulk()
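# A minimal usage sketch (not part of the original example): the host, index,
# doc type, and input file below are placeholders for illustration only.
if __name__ == '__main__':
    indexer = Indexer('127.0.0.1:9200', batch_mode=True, batch_size=500)
    indexer.bulk_index('geodata', 'place', 'places.json', sleep_time=0.5)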
import shelve
from datetime import datetime

from pyes import ES

# The original excerpt assumes an existing connection; the host here is a placeholder.
conn = ES("127.0.0.1:9200")

dataset = shelve.open("samples.shelve")

mapping = {
    u'description': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'yes',
        'type': u'string',
        'term_vector': 'with_positions_offsets',
    },
    u'name': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'yes',
        'type': u'string',
        'term_vector': 'with_positions_offsets',
    },
    u'age': {'store': 'yes', 'type': u'integer'},
}

conn.create_index("test-index")
conn.put_mapping("test-type", {'properties': mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
    # non-bulk equivalent:
    # conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()

print "time:", end - start
dataset.close()
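# The flush threshold can also be set when the connection is created, as other
# examples in this collection do; a larger bulk_size means fewer, larger bulk
# requests. The host and value here are assumptions for illustration.
conn = ES("127.0.0.1:9200", bulk_size=1000)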
from pyes import ES

from utils_pyes import create_and_add_mapping

es = ES()
index_name = "my_index"
type_name = "my_type"

create_and_add_mapping(es, index_name, type_name)

es.index(doc={"name": "Joe Tester", "parsedtext": "Joe Testere nice guy",
              "uuid": "11111", "position": 1},
         index=index_name, doc_type=type_name, id=1)
es.index(doc={"name": "data1", "value": "value1"},
         index=index_name, doc_type=type_name + "2", id=1, parent=1)
es.index(doc={"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy",
              "uuid": "22222", "position": 2},
         index=index_name, doc_type=type_name, id=2, bulk=True)
es.index(doc={"name": "data2", "value": "value2"},
         index=index_name, doc_type=type_name + "2", id=2, parent=2, bulk=True)
es.index(doc={"name": "Bill Clinton", "parsedtext": "Bill is not nice guy",
              "uuid": "33333", "position": 3},
         index=index_name, doc_type=type_name, id=3, bulk=True)
es.force_bulk()

es.update(index=index_name, doc_type=type_name, id=2,
          script='ctx._source.position += 1')
es.update(index=index_name, doc_type=type_name, id=2,
          script='ctx._source.position += 1', bulk=True)

es.delete(index=index_name, doc_type=type_name, id=1, bulk=True)
es.delete(index=index_name, doc_type=type_name, id=3)
es.force_bulk()

es.indices.refresh(index_name)
es.indices.delete_index(index_name)
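# utils_pyes above is project-local and not included in this excerpt. Purely as
# a hypothetical sketch, create_and_add_mapping could create the index and
# register mappings along these lines (field list guessed from the documents
# indexed above; the _parent mapping supports the parent/child docs of
# type_name + "2"):
def create_and_add_mapping(es, index_name, type_name):
    mapping = {
        u'name': {'store': 'yes', 'type': u'string', 'index': 'analyzed'},
        u'parsedtext': {'store': 'yes', 'type': u'string', 'index': 'analyzed',
                        'term_vector': 'with_positions_offsets'},
        u'uuid': {'store': 'yes', 'type': u'string', 'index': 'not_analyzed'},
        u'position': {'store': 'yes', 'type': u'integer'},
    }
    es.create_index(index_name)
    es.put_mapping(type_name, {'properties': mapping}, [index_name])
    es.put_mapping(type_name + "2",
                   {'_parent': {'type': type_name},
                    'properties': {u'name': {'store': 'yes', 'type': u'string'},
                                   u'value': {'store': 'yes', 'type': u'string'}}},
                   [index_name])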
# Excerpt from a Celery task; project-specific imports (settings, Data, DataBackup,
# City, JavaInterface, mapping, the cleanup regexes, cache, dumps, encoders, etc.)
# are assumed to come from the surrounding project.
class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)

        java = JavaInterface()
        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date

            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create a new index (not yet connected to the alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job': {'properties': mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # flush the queued bulk operations into the new index
            self.elastic.force_bulk()

            # point the alias at the new index
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete the old indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )
            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))
            obj[0].save()

        # release references so Java & ES resources can be reclaimed
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if not item.has_key('city'):
            return item
        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)
            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {'location': {'lat': lat, 'lon': lon}}
        except:
            pass
        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']
        try:
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # cities already linked to this spider, split by how many spiders use them
        cities_old_single = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) == 1
        ]
        cities_old_multi = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) > 1
        ]

        for city in cities:
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name=city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single:
                cities_old_single.remove(city.name)
            if city.name in cities_old_multi:
                cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
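# A sketch of how the task might be invoked (an assumption; the project wiring
# around this Celery task is not part of the excerpt, and the spider name is a
# placeholder):
task = ProcessSpiderData()
task.run('example_spider')        # synchronous call, useful for debugging
# task.delay('example_spider')    # asynchronous dispatch via the Celery broker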