# -*- coding: utf-8 -*-
# Imports reconstructed for readability: standard library and well-known
# third-party packages only. Project-specific names (settings, mapping, Data,
# City, DataBackup, JavaInterface, normalize_spaces, remove_braces,
# remove_none_chars) are assumed to be defined elsewhere in the project.
import binascii
import bz2
import time
from json import dumps, JSONEncoder

import jpype
from celery.task import Task  # assumed: this is a Celery task (get_logger/run)
from django.core.cache import cache
from django.core.serializers.json import DjangoJSONEncoder
from geopy import geocoders
from pyes import ES
from pyes.exceptions import ElasticSearchException


class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)

        java = JavaInterface()
        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job': {'properties': mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if 'city' in item:
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # point the alias at the new index, then drop the old ones
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all old indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data) as hex-encoded, bz2-compressed JSON
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )
            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))
            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if 'city' not in item:
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)
            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {
                        'location': {'lat': lat, 'lon': lon}
                    }
        except Exception:
            # geocoding is best-effort; ignore lookup failures
            pass

        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']

        try:
            # only index the document if it is not already present
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of cities linked only to this spider vs. to several spiders
        cities_old_single = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) == 1
        ]
        cities_old_multi = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) > 1
        ]

        for city in cities:
            # normalize the city name before looking it up
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name=city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single:
                cities_old_single.remove(city.name)
            if city.name in cities_old_multi:
                cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
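
# The DataBackup encoding above (JSON -> bz2 -> hexlify) round-trips like this.
# A minimal, self-contained sketch; the sample record is made up and not part
# of the original data:
import bz2
import binascii
from json import dumps, loads

records = [{'id': 1, 'content': 'sample text', 'city': 'berlin'}]

# encode the way run() fills DataBackup.source
blob = binascii.hexlify(bz2.compress(dumps(records)))

# decode: unhexlify -> decompress -> parse JSON
restored = loads(bz2.decompress(binascii.unhexlify(blob)))
assert restored == records
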
# Imports reconstructed for readability: pyes and the standard library only.
# Project-level helpers (ask, timesofar, the dataload module) and the
# module-level ES_HOST / ES_INDEX_NAME settings are assumed to be defined
# elsewhere in the project.
import time

from pyes import ES, StringQuery
from pyes.exceptions import (ElasticSearchException, IndexMissingException,
                             NotFoundException)


class ESIndexerBase(object):
    ES_HOST = ES_HOST                # taken from module-level settings
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        self.step = 10000

    def create_index(self):
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all docs under the given index_type by dropping its mapping.'''
        index_name = self.ES_INDEX_NAME
        # index_type = self.ES_INDEX_TYPE
        # check that index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''Add a doc to the index. If id is not None, the existing doc will be updated.'''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''Delete a doc from the index based on the passed id.'''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE

        # test if the index exists
        try:
            print "Opening index...",
            self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1

        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            # no existing mapping for index_type: force update_mapping to True
            empty_mapping = True
            update_mapping = True

        # empty_mapping = not cur_mapping[index_name].get(index_type, {})
        # if empty_mapping:
        #     # if no existing mapping is available for index_type,
        #     # force update_mapping to True
        #     update_mapping = True

        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type, _mapping, [index_name])

        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
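
# A minimal usage sketch for the indexer above. The GeneIndexer subclass, the
# host/index names, and the sample document are illustrative assumptions (not
# part of the original code), and a running ElasticSearch node is required.
class GeneIndexer(ESIndexerBase):
    ES_HOST = '127.0.0.1:9200'
    ES_INDEX_NAME = 'genedoc'
    ES_INDEX_TYPE = 'gene'

indexer = GeneIndexer()
indexer.create_index()    # opens the index, creating it first if missing
indexer.index({'symbol': 'CDK2', 'name': 'cyclin dependent kinase 2'},
              'gene', id='1017')
indexer.conn.refresh()    # make the new doc searchable before querying
print indexer.query('cdk2')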