Example #1
File: app.py Project: iamsk/es-demo
def init():
    conn = ES('127.0.0.1:9200')
    try:
        conn.delete_index("zhihu")
    except:
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {
            'store': 'yes',
            'type': u'integer'
        },
        u'link': {
            'store': 'yes',
            'type': u'string'
        },
        u'title': {
            'boost': 1.0,
            'index': 'analyzed',
            'store': 'yes',
            'type': u'string'
        },
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
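
The init() view above only builds the index and then redirects to /list, which is not shown. A minimal sketch of such a view, assuming the same local node and the "zhihu" index created above (the handler name, route, and result handling are assumptions, not part of the original project):

def list_answers(keyword="*"):
    # hypothetical /list handler for the init() view above
    from pyes import ES
    from pyes.query import StringQuery

    conn = ES('127.0.0.1:9200', default_indexes=["zhihu"])
    # StringQuery searches the analyzed title field defined in the mapping
    res = conn.search(StringQuery(keyword, default_field="title"))
    # older pyes versions return a plain dict; newer ones an iterable ResultSet
    return [hit['_source'] for hit in res['hits']['hits']]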
Example #2
def setup_store():
    connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)
    try:
        connection.create_index_if_missing(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX)
    except:
        pass
    try:
        connection.put_mapping(settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                               settings.THUMBNAIL_ELASTIC_SEARCH_MAPPING,
                               indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,])
    except:
        pass
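
setup_store() reads everything from Django settings, so the snippet only works once those are defined. An illustrative configuration (the setting names come from the code above; the values and the mapped field are assumptions):

# settings.py (example values only; field names are invented)
THUMBNAIL_ELASTIC_SEARCH_SERVERS = ['127.0.0.1:9200']
THUMBNAIL_ELASTIC_SEARCH_INDEX = 'thumbnails'
THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE = 'thumbnail'
THUMBNAIL_ELASTIC_SEARCH_MAPPING = {
    'properties': {
        u'url': {'type': u'string', 'index': 'not_analyzed', 'store': 'yes'},
    }
}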
Example #3
    def handle(self, *args, **kwargs):
        init_all = kwargs.get('all')
        
        elastic = ES(settings.SEARCH_HOSTS)
        aliases = settings.SEARCH_ALIASES if init_all else args
        indices_new = []

        for alias in aliases:
            index_new = '%s_%d' % (alias, int(time.time()))
            indices_new.append(index_new)
            
            elastic.create_index_if_missing(index_new)
            elastic.add_alias(alias, [index_new])

        if len(aliases):
            elastic.put_mapping('job', {'job':{'properties':mapping}}, indices_new)
            self.stdout.write("Successfully created indices mapping.\n")

        elastic.connection.close()
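
The command above creates a timestamped index per alias, so readers can keep querying the stable alias name while data is rebuilt behind it. A sketch of reading through such an alias (the 'jobs' alias name is invented; the ES and StringQuery calls mirror the other examples here):

from pyes import ES
from pyes.query import StringQuery

elastic = ES(settings.SEARCH_HOSTS, default_indexes=['jobs'])
# the alias transparently resolves to the current jobs_<timestamp> index
res = elastic.search(StringQuery('python developer'))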
Example #4
def ext_process(listname, hostname, url, filepath, msg):
    """Here's where you put your code to deal with the just archived message.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message and
    the message object.

    These can be replaced or augmented as needed.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime
    import time  # used by the sleep() calls below

    # CHANGE these settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500']  # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        try:
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except:
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = {
                u'text': {'boost': 1.0,
                          'index': 'analyzed',
                          'store': 'yes',
                          'type': u'string',
                          "term_vector": "with_positions_offsets"},
                u'url': {'boost': 1.0,
                         'index': 'not_analyzed',
                         'store': 'yes',
                         'type': u'string',
                         "term_vector": "no"},
                u'title': {'boost': 1.0,
                           'index': 'analyzed',
                           'store': 'yes',
                           'type': u'string',
                           "term_vector": "with_positions_offsets"},
                u'date': {'store': 'yes',
                          'type': u'date'}}
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)


        data = dict(url=url,
                    title=msg.get('subject'),
                    date=date,
                    text=str(msg)
                    )
        iconn.index(data, _indexname, _doctype)

        syslog('debug', 'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog('error', 'Cluster in recovery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog('error', 'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except:
        import traceback
        syslog('error', 'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
               listname, hostname, url, filepath, msg, repr(traceback.format_exc()))

    return
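
Since url is mapped not_analyzed, it can be matched exactly with a term query. A minimal lookup sketch against the index created above (the URL value is invented):

from pyes import ES
from pyes.query import TermQuery

conn = ES(['127.0.0.1:9500'], default_indexes=['mailman'])
# not_analyzed fields are indexed verbatim, so the whole URL must match
res = conn.search(TermQuery('url', 'http://lists.example.com/archives/000001.html'))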
Example #5
def ext_process(listname, hostname, url, filepath, msg):
    """Here's where you put your code to deal with the just archived message.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message and
    the message object.

    These can be replaced or augmented as needed.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime
    import time  # used by the sleep() calls below

    # CHANGE these settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500']  # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        try:
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except:
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = {
                u'text': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'url': {
                    'store': 'true',
                    'type': u'keyword'
                },
                u'title': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'date': {
                    'store': 'true',
                    'type': u'date'
                }
            }
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)

        data = dict(url=url,
                    title=msg.get('subject'),
                    date=date,
                    text=str(msg))
        iconn.index(data, _indexname, _doctype)

        syslog('debug',
               'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog(
            'error',
            'Cluster in recovery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog(
            'error',
            'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except:
        import traceback
        syslog(
            'error',
            'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
            listname, hostname, url, filepath, msg,
            repr(traceback.format_exc()))

    return
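
The with_positions_offsets term vectors stored for text and title are what Elasticsearch's fast vector highlighter relies on; searching the archive itself only takes a string query. A minimal sketch, assuming the same host and index names as above:

from pyes import ES
from pyes.query import StringQuery

conn = ES(['127.0.0.1:9500'], default_indexes=['mailman'])
q = StringQuery('elasticsearch', default_field='text')
res = conn.search(q)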
Example #6
import shelve
from datetime import datetime
from pyes import ES

conn = ES('127.0.0.1:9200')  # assumption: the original snippet never defines conn
dataset = shelve.open("samples.shelve")

mapping = {
    u'description': {'boost': 1.0,
                     'index': 'analyzed',
                     'store': 'yes',
                     'type': u'string',
                     "term_vector": "with_positions_offsets"},
    u'name': {'boost': 1.0,
              'index': 'analyzed',
              'store': 'yes',
              'type': u'string',
              "term_vector": "with_positions_offsets"},
    u'age': {'store': 'yes',
             'type': u'integer'},
}
conn.create_index("test-index")
conn.put_mapping("test-type", {'properties':mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
    # non-bulk alternative: conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()

print "time:", end-start
dataset.close()
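
Bulk-indexed documents only become searchable after a refresh. A hedged follow-up check (count() is a pyes call, though argument and return details vary a little between versions):

from pyes.query import MatchAllQuery

conn.refresh(["test-index"])
# older pyes returns a dict-like response with a 'count' key
print "indexed docs:", conn.count(MatchAllQuery())['count']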

Example #7
class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()

        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )

            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))

            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if not item.has_key('city'):
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)

            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {'location': {'lat': lat, 'lon': lon}}
        except:  # geocoding failures are non-fatal; leave the item without a pin
            pass
        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']

        try:
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [city.name for city in cities_current
                             if city.indices and spider_name in city.indices
                             and len(city.indices) == 1]
        cities_old_multi = [city.name for city in cities_current
                            if city.indices and spider_name in city.indices
                            and len(city.indices) > 1]

        for city in cities:
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name = city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single: cities_old_single.remove(city.name)
            if city.name in cities_old_multi: cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
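
run() refers to a module-level mapping that the snippet doesn't show. A plausible sketch, inferred from the fields the task actually touches (content, city, and the pin/location shape built in _get_location); any field not visible in the code above is a guess:

mapping = {
    u'content': {'index': 'analyzed', 'type': u'string'},
    u'city': {'index': 'not_analyzed', 'type': u'string'},
    u'pin': {'properties': {
        u'location': {'type': u'geo_point'},
    }},
}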
Example #8
class ESIndexerBase(object):
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        self.step = 10000

    def create_index(self):
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
#        index_type = self.ES_INDEX_TYPE
        #Check if index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''add a doc to the index. If id is not None, the existing doc will be
           updated.
        '''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''delete a doc from the index based on passed id.'''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE

        #Test if index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1

        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            #if no existing mapping available for index_type
            #force update_mapping to True
            empty_mapping = True
            update_mapping = True

#        empty_mapping = not cur_mapping[index_name].get(index_type, {})
#        if empty_mapping:
#            #if no existing mapping available for index_type
#            #force update_mapping to True
#            update_mapping = True

        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type,
                                        _mapping,
                                        [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
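
A minimal usage sketch for ESIndexerBase (ES_HOST and ES_INDEX_NAME are assumed to be defined at module level, as the class attributes imply; the sample gene document is invented):

indexer = ESIndexerBase()
indexer.create_index()
# build_index() pulls its field mapping from the project's dataload module
indexer.build_index({'1017': {'symbol': 'CDK2', 'name': 'cyclin-dependent kinase 2'}})
print indexer.query('cdk2')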