Example #1
    def tearDown(self):
        self.log.warning("before tearDown es")
        self._unlink_es_cluster()
        self._stop_es_replication()
        if self.es_host is not None:
            conn = ES(self.es_host + ":9200")
            conn.delete_index_if_exists("default")
        super(ElasticSearchSupport, self).tearDown()
        self.log.warning("after tearDown es")
Example #2
    def setUp(self):
        self.es_host = None
        self.es_cluster_name = None
        self._state = []
        super(ElasticSearchSupport, self).setUp()
        self.es_host = self.input.param("es_host", "127.0.0.1")
        self.es_port = self.input.param("es_port", 9091)
        conn = ES(self.es_host + ":9200")
        if not self.input.param("skip_cleanup", True) or self.case_number == 1:
            conn.delete_index_if_exists("default")
        conn.create_index("default")
        self.log.warning("waiting for ES index to be ready to use")
        time.sleep(30)
        self._link_es_cluster()
        self._start_es_replication()
        self.log.warning("after setUp es")
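The reset works because delete_index_if_exists tolerates a missing index, so setUp does not fail on a fresh cluster the way a plain delete_index would. A minimal standalone sketch of the same reset, assuming a local node on the default HTTP port:

    from pyes import ES

    conn = ES("127.0.0.1:9200")
    conn.delete_index_if_exists("default")  # no-op if "default" is absent
    conn.create_index("default")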
Example #3
    def handle(self, *args, **kwargs):
        delete_all = kwargs.get('all')
        elastic = ES(settings.SEARCH_HOSTS)
        indices = []

        if delete_all:
            indices.extend(elastic.get_indices(True))
            indices.extend(elastic.get_closed_indices())

            for index in indices:
                elastic.delete_index_if_exists(index)
        else:
            for source_name in args:
                indices_aliased = [index for index in elastic.get_alias(source_name) if index == source_name]
                elastic.delete_index_if_exists(source_name)

                if indices_aliased:
                    elastic.delete_alias(source_name, indices_aliased)

                    for index in indices_aliased:
                        elastic.delete_index_if_exists(index)

        elastic.connection.close()

        if indices or args:
            self.stdout.write("Successfully deleted indices & aliases.\n")
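Since handle belongs to a Django management command, the usual entry point is manage.py. A hypothetical invocation, assuming the command module is registered as delete_indices and declares an --all option backing kwargs.get('all'):

    python manage.py delete_indices --all          # drop every open and closed index
    python manage.py delete_indices jobs events    # drop the named indices and their aliases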
Example #4
class SampleMaker(object):
    def __init__(self, name):
        log = open(name, "wb")
        self.log = log
        self.conn = ES(("http", "127.0.0.1", 9200), timeout=300.0, log_curl=True, dump_curl=log)
        self.index_name = "test-index"
        self.document_type = "test-type"
        self.conn.delete_index_if_exists(self.index_name)
        self.init_default_index()


    def init_default_index(self):
        from pyes.helpers import SettingsBuilder
        settings = SettingsBuilder()
        from pyes.mappings import DocumentObjectField
        from pyes.mappings import IntegerField
        from pyes.mappings import NestedObject
        from pyes.mappings import StringField, DateField, BooleanField, GeoPointField, FloatField

        docmapping = DocumentObjectField(name=self.document_type)
        docmapping.add_property(
            StringField(name="description", store=True, term_vector="with_positions_offsets", index="analyzed"))
        docmapping.add_property(
            StringField(name="name", store=True, term_vector="with_positions_offsets", index="analyzed"))
        docmapping.add_property(StringField(name="tag", store=True, index="not_analyzed"))
        docmapping.add_property(IntegerField(name="age", store=True))
        docmapping.add_property(FloatField(name="price"))
        docmapping.add_property(DateField(name="date", store=True))
        docmapping.add_property(BooleanField(name="in_stock", store=True, index="not_analyzed"))
        docmapping.add_property(GeoPointField(name="position"))
        nested_object = NestedObject(name="metadata")
        nested_object.add_property(StringField(name="name", store=True))
        nested_object.add_property(StringField(name="value", store=True))
        nested_object.add_property(IntegerField(name="num", store=True))
        docmapping.add_property(nested_object)
        settings.add_mapping(docmapping)

        self.conn.ensure_index(self.index_name, settings)

    def generate_datafile(self, number_items=1000):
        """
        Generate a dataset with number_items elements.
        """

        names = get_names()
        totalnames = len(names)
        #init random seeder
        random.seed()
        #calculate items
        #    names = random.sample(names, number_items)
        for i in xrange(number_items):
            data = {"name": names[random.randint(0, totalnames - 1)],
                   "age": random.randint(1, 100),
                   "price": random.random()*100.0,
                   "tag":[words(1, False) for r in xrange(random.randint(1, 5))],
                   "in_stock": random.choice([True, False]),
                   "date": datetime.now()+timedelta(days=random.choice([1, -1])*random.randint(0,1000)),
                   "position": {
                       "lat" : random.choice([1, -1])* random.random()*90.0,
                        "lon" : random.choice([1, -1])* random.random()*180.0

                   },
                   "description": words(random.randint(1, 100), False),
                   "metadata":[{"name":names[random.randint(0, totalnames - 1)],
                                "value":str(random.randint(1, 5)), "num":random.randint(1, 50) } for r in xrange(random.randint(1, 5))]
                   }
            self.conn.index(data, self.index_name, self.document_type, id=str(i+1))


    def close(self):
        self.conn.flush(self.index_name)
        self.log.close()
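A minimal driver for SampleMaker, assuming a node on 127.0.0.1:9200 and the get_names/words helpers the class relies on; the constructor argument is only the path of the curl dump log:

    maker = SampleMaker("curl_dump.log")
    maker.generate_datafile(number_items=100)  # index 100 random documents
    maker.close()                              # flush the index, close the log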
Example #5
class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()

        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if 'city' in item:
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )

            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))

            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if 'city' not in item:
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)

            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {'location': {'lat': lat, 'lon': lon}}
        except Exception:
            pass
        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']

        try:
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) == 1 ]
        cities_old_multi = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) > 1 ]

        for city in cities:
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name = city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single: cities_old_single.remove(city.name)
            if city.name in cities_old_multi: cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
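The run method follows the usual zero-downtime reindex pattern: build a fresh timestamped index, repoint the alias, then drop whatever the alias previously covered. A condensed sketch using only calls from the example (the "jobs" alias name is hypothetical):

    index_new = "jobs_%d" % int(time.time())
    elastic.create_index(index_new)
    # ... bulk-index documents into index_new, then force_bulk() ...
    indices_old = elastic.get_alias("jobs")   # indices currently behind the alias
    elastic.set_alias("jobs", [index_new])    # readers switch atomically
    for index in indices_old:
        elastic.delete_index_if_exists(index)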