Example 1
# assumed imports for this excerpt: re, time.sleep, fileinput.input (as `input`), pyes.ES, pyes.exceptions.NoServerAvailable
class Indexer:
    def __init__(self, es_host, batch_mode=True, batch_size=100):
        self.client = ES(es_host)
        self.batch_mode = batch_mode
        self.client.bulk_size = int(batch_size)

    def bulk_index(self, index, type, shapefile, sleep_time=0.1):
        print 'Indexing [%s] docs into [%s] from %s' % (type, index, shapefile)

        index_count = 0

        id_re = re.compile(r'^.*?"id"\s*:\s*"([^"]+)"')
        parens_re = re.compile(r'\(.*?\)')

        for line in input(shapefile):
            id = id_re.match(line).group(1)

            # strip any parenthetical text from the line
            line = parens_re.sub('', line).strip()

            # normalize encoding: source data is latin-1, re-encode as utf-8
            line = line.decode('latin-1').encode('utf-8')
            id = id.decode('latin-1').encode('utf-8')

            try:
                self.client.index(line, index, type, id, bulk=self.batch_mode)
            except UnicodeDecodeError as e:
                print "Error processing line with id %s: %s" % (id, e.message)
            except NoServerAvailable as e:
                print "The server failed to respond while indexing %s: [%s]. Sleeping %d seconds and retrying..." % (id, e.message, sleep_time)
                sleep(5)
                try:
                    print "Retrying indexing of %s" % id
                    self.client.index(line, index, type, id, bulk=self.batch_mode)
                except NoServerAvailable as e:
                    print "Failed to reconnect again. Skipping indexing %s" % id
                except Exception as e:
                    print "This happened: %s" % e

            index_count += 1
            if index_count % int(self.client.bulk_size) == 0:
                print 'Indexing batch of %d, starting from %s' % (self.client.bulk_size, id)
                sleep(sleep_time)

        # index remaining bulk entries
        self.client.force_bulk()
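The class above wraps the basic pyes bulk pattern: pass bulk=True to index() so documents are buffered client-side, and call force_bulk() at the end to flush whatever is left in the buffer. A minimal sketch of that pattern on its own (host, index/type names and the docs iterable are placeholders, not taken from the example above):

from pyes import ES

conn = ES('127.0.0.1:9200')        # placeholder host
conn.bulk_size = 100               # auto-flush every 100 buffered docs

for n, doc in enumerate(docs):     # docs: any iterable of dicts
    conn.index(doc, 'my-index', 'my-type', n, bulk=True)

conn.force_bulk()                  # flush the final partial batch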
Example 2
# assumed setup for this excerpt: shelve, datetime.datetime and an existing pyes connection `conn`
dataset = shelve.open("samples.shelve")

mapping = {
    u'description': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'yes',
        'type': u'string',
        'term_vector': 'with_positions_offsets',
    },
    u'name': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'yes',
        'type': u'string',
        'term_vector': 'with_positions_offsets',
    },
    u'age': {
        'store': 'yes',
        'type': u'integer',
    },
}
conn.create_index("test-index")
conn.put_mapping("test-type", {'properties': mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
#    conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()

print "time:", end-start
dataset.close()

Example 3

from pyes import ES

es = ES()

index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping

create_and_add_mapping(es, index_name, type_name)

es.index(doc={"name": "Joe Tester", "parsedtext": "Joe Testere nice guy", "uuid": "11111", "position": 1},
         index=index_name, doc_type=type_name, id=1)
es.index(doc={"name": "data1", "value": "value1"}, index=index_name, doc_type=type_name + "2", id=1, parent=1)
es.index(doc={"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy", "uuid": "22222", "position": 2},
         index=index_name, doc_type=type_name, id=2, bulk=True)
es.index(doc={"name": "data2", "value": "value2"}, index=index_name, doc_type=type_name + "2", id=2, parent=2,
         bulk=True)
es.index(doc={"name": "Bill Clinton", "parsedtext": """Bill is not
        nice guy""", "uuid": "33333", "position": 3}, index=index_name, doc_type=type_name, id=3, bulk=True)

es.force_bulk()

es.update(index=index_name, doc_type=type_name, id=2, script='ctx._source.position += 1')
es.update(index=index_name, doc_type=type_name, id=2, script='ctx._source.position += 1', bulk=True)

es.delete(index=index_name, doc_type=type_name, id=1, bulk=True)
es.delete(index=index_name, doc_type=type_name, id=3)

es.force_bulk()
es.indices.refresh(index_name)

es.indices.delete_index(index_name)
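Example 3 mixes buffered (bulk=True) calls with immediate ones; the buffered index, update and delete operations only reach the cluster when force_bulk() runs. A minimal sketch of verifying the two scripted updates on document 2, which would slot in after the refresh and before the index is deleted (the get() call uses the same positional form seen in Example 4; the print is only illustrative):

doc = es.get(index_name, type_name, 2)   # 'position' has been incremented twice by the update scripts
print doc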
Example 4
class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()

        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )

            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))

            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if not item.has_key('city'):
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)

            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {
                        'location': {'lat': lat, 'lon': lon}
                    }
        except Exception:
            # geocoding is best effort; ignore lookup failures
            pass
        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']

        try:
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) == 1 ]
        cities_old_multi = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) > 1 ]

        for city in cities:
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name = city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single: cities_old_single.remove(city.name)
            if city.name in cities_old_multi: cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
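The task above follows a common zero-downtime reindex pattern: build a new timestamped index, bulk-load it, repoint the alias, then drop the old indices. A stripped-down sketch of that flow using the same pyes calls (host, alias name and the docs iterable are placeholders, not from the task above):

import time
from pyes import ES

es = ES('127.0.0.1:9200', bulk_size=1500)      # placeholder host
alias = 'jobs'                                 # placeholder alias name
index_new = '%s_%d' % (alias, int(time.time()))

es.create_index(index_new)
for i, doc in enumerate(docs):                 # docs: any iterable of dicts
    es.index(doc, index_new, 'job', i, bulk=True)
es.force_bulk()

# repoint the alias, then drop whichever indices it previously covered
old_indices = es.get_alias(alias)
es.set_alias(alias, [index_new])
for old in old_indices:
    es.delete_index_if_exists(old)
es.optimize(index_new, refresh=True)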
Example 6
# continues the setup from Example 3 (es, index_name and type_name are already defined)
es.index(doc={"name": "data2", "value": "value2"},
         index=index_name,
         doc_type=type_name + "2",
         id=2,
         parent=2,
         bulk=True)
es.index(doc={
    "name": "Bill Clinton",
    "parsedtext": """Bill is not
        nice guy""",
    "uuid": "33333",
    "position": 3
},
         index=index_name,
         doc_type=type_name,
         id=3,
         bulk=True)

es.force_bulk()

es.update(index=index_name,
          doc_type=type_name,
          id=2,
          script='ctx._source.position += 1')
es.update(index=index_name,
          doc_type=type_name,
          id=2,
          script='ctx._source.position += 1',
          bulk=True)

es.delete(index=index_name, doc_type=type_name, id=1, bulk=True)
es.delete(index=index_name, doc_type=type_name, id=3)

es.force_bulk()