Code example #1
def index():
    """docstring for fname"""
    import time
    fptr = open(sys.argv[1], 'rb')
    line_count = 0
    conn = ES(["localhost:9200"])
    #conn.create_index('test-index')
    start = time.clock()
    numb_exceptions = 0

    for line in fptr:
        if ((line_count % 10000) == 0):
            end = time.clock()
            minutes = (end - start) / 60.0
            print 'Done with %d took %f min. ' %(line_count, minutes)
            print 'number of exceptions ', numb_exceptions
        line_count += 1
        data = json.loads(line)
        post_id = int(data['post_id'])
        if post_id and data:
            try:
                conn.index(data, "test-index", "test-type", post_id)
            except Exception:
                numb_exceptions += 1
                continue

    print 'number of exceptions ', numb_exceptions
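The loop in index() above expects the file named on the command line to contain one JSON object per line, each carrying a 'post_id' plus the fields to index; a hypothetical input line (illustrative only) would look like:

{"post_id": 42, "title": "Example post", "content": "<p>Example body</p>"}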
Code example #2
def init():
    conn = ES('127.0.0.1:9200')
    try:
        conn.delete_index("zhihu")
    except:
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {
            'store': 'yes',
            'type': u'integer'
        },
        u'link': {
            'store': 'yes',
            'type': u'string'
        },
        u'title': {
            'boost': 1.0,
            'index': 'analyzed',
            'store': 'yes',
            'type': u'string'
        },
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
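A small follow-up sketch (not part of the original project) of how the "zhihu" index populated by init() could be queried with pyes; the query string is an assumption, and depending on the pyes version the keyword may be indexes= rather than indices=:

from pyes import ES
from pyes.query import StringQuery

conn = ES('127.0.0.1:9200')
# query_string search against the analyzed 'title' field defined in the mapping
results = conn.search(StringQuery('title:python'), indices=['zhihu'], doc_types=['answer'])
for hit in results:
    print hit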
Code example #3
class ElasticSearchPipeline(object):
    def __init__(self):

        self.settings = get_project_settings()

        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'], 'password': self.settings['ELASTICSEARCH_PASSWORD']}

        if self.settings['ELASTICSEARCH_PORT']:

            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'], self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])

        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'], op_type='create',)
        else:
            log.msg("Generation SHA1")
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          hashlib.sha1(item[self.__get_uniq_key()]).hexdigest())
        log.msg("Item send to Elastic Search %s" %
                    (self.settings['ELASTICSEARCH_INDEX']),
                    level=log.DEBUG, spider=spider)
        return item

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
Code example #4
class ElasticSearchPipeline(object):

    def __init__(self):
        from pyes import ES
        self.settings = get_project_settings()
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'], self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri])

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'], op_type='create',)
        else:
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = uniq

        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
Code example #5
class ElasticSearchPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()

        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'], 'password': self.settings['ELASTICSEARCH_PASSWORD']}

        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'], self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])

        self.es = ES([uri], basic_auth=basic_auth)

    def index_item(self, item):
        if self.settings['ELASTICSEARCH_UNIQ_KEY']:
            local_id = hashlib.sha1(item[self.settings['ELASTICSEARCH_UNIQ_KEY']]).hexdigest()
            log.msg("Generated unique key %s" % local_id, level=self.settings['ELASTICSEARCH_LOG_LEVEL'])
            op_type = 'none'
        else:
            op_type = 'create'
            local_id = item['id']
        self.es.index(dict(item),
                      self.settings['ELASTICSEARCH_INDEX'],
                      self.settings['ELASTICSEARCH_TYPE'],
                      id=local_id,
                      op_type=op_type)
Code example #6
File: stash.py Project: hoffmann/stash
class StashHandler(logging.Handler):
    def __init__(self, constr, whitelist=None, blacklist=None):
        logging.Handler.__init__(self)
        self.conn = ES(constr)
        if blacklist is None:
            blacklist = set()
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.record_type = 'record'

    @property
    def index_name(self):
        return 'logstash-'+datetime.date.today().strftime('%Y.%m.%d')

    def emit(self, record):
        if self.whitelist is None:
            d = { k: record.__dict__[k] for k in record.__dict__ if k not in self.blacklist }
        else:
            d = { k: record.__dict__[k] for k in record.__dict__ if k in self.whitelist and k not in self.blacklist }
        entry = {
            "@fields": d,
            "@message": record.msg, 
            "@source": "gelf://localhost", 
            "@source_host": "gelf://localhost", 
            "@source_path": "/", 
            "@tags": [], 
            "@timestamp": datetime.datetime.utcnow().isoformat(), 
            "@type": self.record_type}
        self.conn.index(entry, self.index_name, self.record_type)
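One possible way (an assumption, not shown in stash.py) to attach the StashHandler above to the standard logging module:

import logging

handler = StashHandler('localhost:9200', whitelist={'name', 'levelname', 'msg'})
root = logging.getLogger()
root.addHandler(handler)
root.setLevel(logging.INFO)
root.info('hello logstash')  # ends up in today's logstash-* index via emit()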
Code example #7
File: search.py Project: FacturaVirtual/vosae-app
def es_index(self):
    conn = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
    conn.index(
        doc=self.get_search_kwargs(),
        index=self.tenant.slug,
        doc_type=self.Meta.document_type,
        id=unicode(self.id)
    )
Code example #8
File: pipelines.py Project: HackerEcology/qrator
class ElasticSearchPipeline(object):
    def __init__(self):
        self.conn = ES('localhost:9200')
        # self.file = open('urls.csv', 'wb')
        # self.file.write('spider,url' + '\n')

    def process_item(self, item, spider):
        #self.file.write(spider.name + ',' + spider.start_urls[0] + '\n')
        self.conn.index(dict(item), "qrator", spider.name)
        return item
Code example #9
class ElasticSearchPipeline(object):
    def __init__(self):

        self.settings = get_project_settings()

        basic_auth = {
            'username': self.settings['ELASTICSEARCH_USERNAME'],
            'password': self.settings['ELASTICSEARCH_PASSWORD']
        }

        if self.settings['ELASTICSEARCH_PORT']:

            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])

        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(
                dict(item),
                self.settings['ELASTICSEARCH_INDEX'],
                self.settings['ELASTICSEARCH_TYPE'],
                id=item['id'],
                op_type='create',
            )
        else:
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        log.msg("Item send to Elastic Search %s" %
                (self.settings['ELASTICSEARCH_INDEX']),
                level=log.DEBUG,
                spider=spider)
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()

        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = uniq

        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings[
                'ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
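The pipeline above reads its configuration through get_project_settings(); a hypothetical settings.py fragment (module path and all values are illustrative assumptions) could look like:

# settings.py
ITEM_PIPELINES = {'myproject.pipelines.ElasticSearchPipeline': 300}
ELASTICSEARCH_SERVER = 'localhost'
ELASTICSEARCH_PORT = 9200
ELASTICSEARCH_USERNAME = ''
ELASTICSEARCH_PASSWORD = ''
ELASTICSEARCH_INDEX = 'scrapy'
ELASTICSEARCH_TYPE = 'items'
ELASTICSEARCH_UNIQ_KEY = 'url'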
Code example #10
File: pipelines.py Project: HackerEcology/qrator
class ElasticSearchPipeline(object):

    def __init__(self):    
        self.conn = ES('localhost:9200') 
        # self.file = open('urls.csv', 'wb')
        # self.file.write('spider,url' + '\n')

    def process_item(self, item, spider):        
        #self.file.write(spider.name + ',' + spider.start_urls[0] + '\n')
        self.conn.index(dict(item), "qrator", spider.name)
        return item
Code example #11
def processData(esurl, esindex, estype, shpPath, simplify, tolerance, startfrom):

    # Open a file for reading
    try:
        with open(shpPath):
            pass
    except IOError:
        print "Unable to locate file: " + shpPath

    # open the es connection
    from pyes import ES

    conn = ES(esurl, timeout=60, bulk_size=10)

    # check that a tolerance is passed when simplifying.
    if simplify == True:
        if tolerance == None:
            raise ValueError("You must pass a valid tolerance if simplifying geometry")

    # use fiona to open the shapefile and read it
    try:
        with fiona.open(shpPath) as source:

            for f in source:

                featid = int(f["id"])
                if featid > startfrom:

                    # grab the geom
                    from shapely.geometry import shape

                    geom = shape(f["geometry"])

                    # simplify if required
                    if validateGeometry(geom):
                        if simplify == True:
                            geom = simplifyGeometry(geom, tolerance)

                    # if the geom is valid then push it into es
                    if validateGeometry(geom):
                        data = json.dumps(f)
                        key = f["id"]
                        conn.index(data, esindex, estype, key, bulk=True)

                    else:
                        logging.error("Invalid Geometry: " + f["id"])

    except:
        raise
Code example #12
def processData(esurl, esindex, estype, shpPath, simplify, tolerance,
                startfrom):

    # Open a file for reading
    try:
        with open(shpPath):
            pass
    except IOError:
        print 'Unable to locate file: ' + shpPath

    #open the es connection
    from pyes import ES
    conn = ES(esurl, timeout=60, bulk_size=10)

    #check that a tolerance is passed when simplifying.
    if (simplify == True):
        if (tolerance == None):
            raise ValueError(
                'You must pass a valid tolerance if simplifying geometry')

    #use fiona to open the shapefile and read it
    try:
        with fiona.open(shpPath) as source:

            for f in source:

                featid = int(f['id'])
                if (featid > startfrom):

                    #grab the geom
                    from shapely.geometry import shape
                    geom = shape(f['geometry'])

                    #simplify if required
                    if (validateGeometry(geom)):
                        if (simplify == True):
                            geom = simplifyGeometry(geom, tolerance)

                    #if the geom is valid then push it into es
                    if (validateGeometry(geom)):
                        data = json.dumps(f)
                        key = f['id']
                        conn.index(data, esindex, estype, key, bulk=True)

                    else:
                        logging.error('Invalid Geometry: ' + f['id'])

    except:
        raise
Code example #13
class ElasticSearchSink(object):
    def __init__(self, server, index, type):
        from pyes import ES
        self.cxn = ES(server)
        self.index = index
        self.type = type

    def __call__(self, event):
        if isinstance(event, list):
            self.cxn.bulk_size = len(event)
            for e in event:
                self.cxn.index(e, self.index, self.type, bulk=True)
            self.cxn.flush_bulk()
        else:
            self.cxn.index(event, self.index, self.type)
Code example #14
File: example.py Project: ftdysa/slurp
class ElasticSearchSink(object):
    def __init__(self, server, index, type):
        from pyes import ES
        self.cxn = ES(server)
        self.index = index
        self.type = type

    def __call__(self, event):
        if isinstance(event, list):
            self.cxn.bulk_size = len(event)
            for e in event:
                self.cxn.index(e, self.index, self.type, bulk=True)
            self.cxn.flush_bulk()
        else:
            self.cxn.index(event, self.index, self.type)
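A short usage sketch for the ElasticSearchSink above (server address, index/type names and event payloads are illustrative assumptions):

sink = ElasticSearchSink('localhost:9200', 'slurp-events', 'event')
sink({'msg': 'single event'})                     # indexed immediately
sink([{'msg': 'a'}, {'msg': 'b'}, {'msg': 'c'}])  # sent as one bulk batch, then flushed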
Code example #15
File: importers.py Project: chrisguiney/pyeodi
class Importer(object):
    base_filename = "TicketNetworkDataFeed"

    model_map = {
        "performers": {
            "file": "Performers.csv",
            "model": Performer,
        },
        "events": {
            "file": "Events.csv",
            "model": Event,
        },
        "venues": {
            "file": "Venues.csv",
            "model": Venue,
        }
    }

    def __init__(self,
                 data_type,
                 csv_path="/tmp/",
                 es_hosts=("http://localhost:9200", )):
        self.data_type = data_type
        self.doc_type = "ticketnetwork_%s" % self.data_type
        self.csv_path = csv_path
        self.es = ES(es_hosts)

    def model(self):
        return self.model_map[self.data_type]["model"]

    def filepath(self):
        return os.path.join(
            self.csv_path, '-'.join(
                [self.base_filename, self.model_map[self.data_type]["file"]]))

    def __call__(self, *args, **kwargs):
        with open(self.filepath()) as f:
            reader = DictReader(f)
            for entry in reader:
                sanitize(entry)
                model = self.model()(entry)
                d = model.dict()
                self.es.index(d,
                              "oedi_sources",
                              self.doc_type,
                              model.hash(),
                              bulk=True)
            self.es.flush_bulk(True)
Code example #16
File: bulk.py Project: softius/elasticsearch-helper
def main(argv):
    start = 1
    if len(sys.argv) > 1:
        if sys.argv[1]:
            start = int(sys.argv[1])

    bulksize = 1000

    es = ES(("http", "localhost", 9200), bulk_size=bulksize)

    c0 = 0
    t0 = time.time()
    c1 = 0
    t1 = time.time()
    for n in range(start, start + 1000000):
        result = es.index(
            {
                'a': random_string_generator(),
                'b': random_string_generator(),
                'c': random_string_generator(),
                'd': random_string_generator(),
                'e': random_string_generator(),
                'f': random_string_generator(),
                'g': random_string_generator(),
                'h': random_string_generator(),
                'i': random_string_generator(),
                'j': random_string_generator(),
                'k': random_string_generator(),
                'l': random_string_generator(),
                'm': random_string_generator(),
                'n': random_string_generator(),
                'o': random_string_generator(),
                'p': random_string_generator(),
                'q': random_string_generator(),
                'r': random_string_generator(),
                's': random_string_generator(),
                't': random_string_generator(),
                'u': random_string_generator(),
                'v': random_string_generator(),
                'w': random_string_generator(),
                'x': random_string_generator(),
                'y': random_string_generator(),
                'z': random_string_generator()
            },
            'pyindex',
            'pytype',
            n,
            bulk=True)
        c0 = c0 + bulksize
        c1 = c1 + bulksize
        if result:
            d0 = (time.time() - t0)
            d1 = (time.time() - t1)
            now = datetime.datetime.utcnow()
            print("{0},{1},{2},{3},{4},{5},{6},{7}".format(
                now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), result.took, c0, d0,
                c0 / (d0 * bulksize), c1, d1, c1 / (d1 * bulksize)))
            c1 = 0
            t1 = time.time()
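The benchmark above calls a random_string_generator() helper that is not shown; one plausible definition (length and alphabet are assumptions) is:

import random
import string

def random_string_generator(size=10, chars=string.ascii_lowercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))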
Code example #17
File: indexer.py Project: knockrentals/us-shapes
class Indexer:
    def __init__(self, es_host, batch_mode=True, batch_size=100):
        self.client = ES(es_host)
        self.batch_mode = batch_mode
        self.client.bulk_size = int(batch_size)

    def bulk_index(self, index, type, shapefile, sleep_time=0.1):
        print 'Indexing [%s] docs into [%s] from %s' % (type, index, shapefile)

        index_count = 0

        id_re = re.compile('^.*?"id"\s*:\s*"([^"]+)"')
        parens_re = re.compile('\(.*?\)')

        for line in input(shapefile):
            id = id_re.match(line).group(1)

            # cleanup any lines that contain parentheticals
            line = parens_re.sub('', line).strip()

            # sweet dec/encodings bro
            line = line.decode('latin-1').encode('utf-8')
            id = id.decode('latin-1').encode('utf-8')

            try:
                self.client.index(line, index, type, id, bulk=self.batch_mode)
            except UnicodeDecodeError as e:
                print "Error processing line with id %s: %s" % (id, e.message)
            except NoServerAvailable as e:
                print "The server failed to respond while indexing %s: [%s]. Sleeping %d seconds and retrying..." % (id, e.message, sleep_time)
                sleep(5)
                try:
                    print "Retrying indexing of %s" % id
                    self.client.index(line, index, type, id, bulk=self.batch_mode)
                except NoServerAvailable as e:
                    print "Failed to reconnect again. Skipping indexing %s" % id
                except Exception as e:
                    print "This happened: %s" % e

            index_count += 1
            if index_count % int(self.client.bulk_size) == 0:
                print 'Indexing batch of %d, starting from %s' % (self.client.bulk_size, id)
                sleep(sleep_time)

        # index remaining bulk entries
        self.client.force_bulk()
Code example #18
class ElasticSearchPipeline(object):
    def __init__(self, settings):
        basic_auth = {'username': settings.get('ELASTICSEARCH_USERNAME'),
                      'password': settings.get('ELASTICSEARCH_PASSWORD')}
        if settings.get('ELASTICSEARCH_PORT'):
            uri = "%s:%d" % (settings.get('ELASTICSEARCH_SERVER'), settings.get('ELASTICSEARCH_PORT'))
        else:
            uri = "%s" % (settings.get('ELASTICSEARCH_SERVER'))
        self.es = ES([uri], basic_auth=basic_auth)
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        pipe = cls(crawler.settings)
        return pipe

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.info("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item), self.settings.get('ELASTICSEARCH_INDEX'), self.settings.get('ELASTICSEARCH_TYPE'),
                          id=item['id'], op_type='create', )
        else:
            self.es.index(dict(item), self.settings.get('ELASTICSEARCH_INDEX'), self.settings.get('ELASTICSEARCH_TYPE'),
                          self._get_item_key(item))
        log.debug("Item send to Elastic Search %s" %
                  (self.settings.get('ELASTICSEARCH_INDEX')), spider=spider)
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()

        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = uniq

        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings.get('ELASTICSEARCH_UNIQ_KEY'):
            return None
        return self.settings.get('ELASTICSEARCH_UNIQ_KEY')
Code example #19
def index(fname, index_name, keys_to_tag):
    fptr = open(fname, 'rb')
    line_count = 0
    conn = ES(["localhost:9200"])
    if not conn.exists_index(index_name):
        conn.create_index(index_name)
    start = time.clock()
    numb_exceptions = 0

    for line in fptr:
        if ((line_count % 10000) == 0):
            end = time.clock()
            minutes = (end - start) / 60.0
            print 'File: %s Done with %d took %f min. ' %(fname, line_count, minutes)
            print 'number of exceptions ', numb_exceptions
        line_count += 1
        data = json.loads(line)
        if not data.get('tags'):
            continue
        post_id = int(data['post_id'])
        found_content = False
        for k in keys_to_tag:
            if data.get(k):
                found_content = True
        if not found_content:
            continue
        index_data = dict()
        for k in keys_to_tag:
            value = data.get(k)
            if (value and (k == 'content')):
                try:
                    stripped_value = utils.strip_tags(value)
                except Exception:
                    stripped_value = value
                index_data[k] = stripped_value
        if post_id and data:
            try:
                conn.index(index_data, index_name, "test-type", post_id)
            except Exception:
                numb_exceptions += 1
                continue

    print 'number of exceptions ', numb_exceptions
Code example #20
File: bulk_load.py Project: evethandar/habakkuk
def main(fn, args):
    conn = ES(args.host, bulk_size=10*args.bulksize)
    if fn.endswith(".gz"):
        fp = gzip.open(fn)
    else:
        fp = open(fn)

    count = 0
    total = 0

    try:
        for line in fp:
            doc = json.loads(line.strip())
            if doc.get("_id"):
                _id = doc["_id"]
                del doc["_id"]
            else:
                _id = None

            conn.index(doc=doc,
                       index=args.index,
                       doc_type=args.doctype,
                       id=_id,
                       bulk=True)
            count+=1
            total+=1
            if count % args.bulksize == 0:
                flush(conn, count)
                count = 0
    except:
        print "traceback", "".join(traceback.format_exception(*sys.exc_info()))
        raise
    finally:
        fp.close()

    try:
        flush(conn, count)
        conn.refresh(args.index)
    except:
        pass

    print "Indexed %s docs total"%total
Code example #21
class ElasticSearchPipeline(object):
    def __init__(self):
        uri = "%s:%d" % (settings['ELASTICSEARCH_SERVER'], settings['ELASTICSEARCH_PORT'])
        self.es = ES([uri])

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            self.es.index(dict(item), settings['ELASTICSEARCH_INDEX'], settings['ELASTICSEARCH_TYPE'])
        else:
            self.es.index(dict(item), settings['ELASTICSEARCH_INDEX'], settings['ELASTICSEARCH_TYPE'],
                          hashlib.sha1(item[self.__get_uniq_key()]).hexdigest())
        log.msg("Item send to Elastic Search %s" %
                    (settings['ELASTICSEARCH_INDEX']),
                    level=log.DEBUG, spider=spider)  
        return item

    def __get_uniq_key(self):
        if not settings['ELASTICSEARCH_UNIQ_KEY'] or settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return settings['ELASTICSEARCH_UNIQ_KEY']
Code example #22
def main(argv):
    start = 1
    if len(sys.argv) > 1:
        if sys.argv[1]:
            start = int(sys.argv[1])

    bulksize = 1000

    es = ES(("http", "localhost", 9200), bulk_size=bulksize)

    c0 = 0
    t0 = time.time()
    c1 = 0
    t1 = time.time()
    for n in range(start, start + 1000000):
        result = es.index({ 
                 'a' : random_string_generator(),
                 'b' : random_string_generator(),
                 'c' : random_string_generator(),
                 'd' : random_string_generator(),
                 'e' : random_string_generator(),
                 'f' : random_string_generator(),
                 'g' : random_string_generator(),
                 'h' : random_string_generator(),
                 'i' : random_string_generator(),
                 'j' : random_string_generator(),
                 'k' : random_string_generator(),
                 'l' : random_string_generator(),
                 'm' : random_string_generator(),
                 'n' : random_string_generator(),
                 'o' : random_string_generator(),
                 'p' : random_string_generator(),
                 'q' : random_string_generator(),
                 'r' : random_string_generator(),
                 's' : random_string_generator(),
                 't' : random_string_generator(),
                 'u' : random_string_generator(),
                 'v' : random_string_generator(),
                 'w' : random_string_generator(),
                 'x' : random_string_generator(),
                 'y' : random_string_generator(),
                 'z' : random_string_generator()
             }, 'pyindex', 'pytype', n, bulk=True)
        c0 = c0 + bulksize
        c1 = c1 + bulksize
        if result:
            d0 = (time.time() - t0) 
            d1 = (time.time() - t1) 
            now = datetime.datetime.utcnow()
            print("{0},{1},{2},{3},{4},{5},{6},{7}"
                .format(now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), result.took, c0, d0, c0/(d0*bulksize), c1, d1, c1/(d1*bulksize)))
            c1 = 0
            t1 = time.time()
Code example #23
File: importers.py Project: chrisguiney/pyeodi
class Importer(object):
    base_filename = "TicketNetworkDataFeed"

    model_map = {
        "performers": {
            "file": "Performers.csv",
            "model": Performer,
        },
        "events": {
            "file": "Events.csv",
            "model": Event,
        },
        "venues": {
            "file": "Venues.csv",
            "model": Venue,
        }
    }

    def __init__(self, data_type, csv_path="/tmp/", es_hosts=("http://localhost:9200",)):
        self.data_type = data_type
        self.doc_type = "ticketnetwork_%s" % self.data_type
        self.csv_path = csv_path
        self.es = ES(es_hosts)

    def model(self):
        return self.model_map[self.data_type]["model"]

    def filepath(self):
        return os.path.join(self.csv_path,
                            '-'.join([self.base_filename, self.model_map[self.data_type]["file"]]))

    def __call__(self, *args, **kwargs):
        with open(self.filepath()) as f:
            reader = DictReader(f)
            for entry in reader:
                sanitize(entry)
                model = self.model()(entry)
                d = model.dict()
                self.es.index(d, "oedi_sources", self.doc_type, model.hash(), bulk=True)
            self.es.flush_bulk(True)
Code example #24
File: app.py Project: iamsk/es-demo
def init():
    conn = ES('127.0.0.1:9200')
    try:
        conn.delete_index("zhihu")
    except:
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {'store': 'yes',
                'type': u'integer'},
        u'link': {'store': 'yes',
                  'type': u'string'},
        u'title': {'boost': 1.0,
                   'index': 'analyzed',
                   'store': 'yes',
                   'type': u'string'},
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
Code example #25
def callback(body, message):
    """Do actual work."""

    logger.info("body in callback() is %s" % body)

    # pull lat/lon, time
    path = body
    sd = SD(path)
    lat = N.array(sd.select('Latitude').get())
    lon = N.array(sd.select('Longitude').get())
    t = N.array(sd.select('Time').get())
    sd.end()
    #logger.info("lat: %s" % str(lat.shape))
    #logger.info("lon: %s" % str(lon.shape))
    #logger.info("time: %s" % str(t.shape))

    # build metadata json
    id = os.path.basename(path)
    md = {
        "id": id,
        "dataset": "AIRX2RET",
        "starttime": t[0,0],
        "endtime": t[44,29],
        "location": {
            "coordinates": [[
                [ lon[0,0], lat[0,0] ],
                [ lon[0,29], lat[0,29] ],
                [ lon[44,29], lat[44,29] ],
                [ lon[44,0], lat[44,0] ],
                [ lon[0,0], lat[0,0] ],
            ]], 
            "type": "polygon"
        }, 
        "urls": "http://mozart/data/public/products/%s" % id
    }

    # publish
    pub_dir = '/data/public/products'
    ensure_dir(pub_dir)
    shutil.move(path, os.path.join(pub_dir, id))

    # insert into ElasticSearch
    index = doctype = 'airs'
    conn = ES('http://localhost:9200')
    mapping = json.load(open('grq_mapping.json'))
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index, mapping)
    conn.indices.put_mapping(doctype, mapping, index)
    ret = conn.index(md, index, doctype, md['id'])

    message.ack()
Code example #26
class KVStore(KVStoreBase):
    def __init__(self, *args, **kwargs):
        super(KVStore, self).__init__(*args, **kwargs)
        self.connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)

    def _get_raw(self, key):
        try:
            #import pdb; pdb.set_trace()
            value = self.connection.get(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX, 
                                        settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                        key)
            return value['_source']['value']
        except:
            return None

    def _set_raw(self, key, value):
        ret = self.connection.index({"value": value}, 
                                    settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                    settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                    key)
        return ret['ok']
    
    def _delete_raw(self, *keys):
        rets = []
        for key in keys:
            try:
                ret = self.connection.delete(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                             settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                             key)
                rets.append(ret['ok'])
            except:
                rets.append(False)
        return rets

    def _find_keys_raw(self, prefix):
        search = Search(query=PrefixQuery("_id", prefix), size=1000, start=0, fields=[])
        results = self.connection.search(search, 
                                         indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,], 
                                         doc_types=[settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,])
        return [hit['_id'] for hit in results['hits']['hits']]
Code example #27
File: search.py Project: tcpr1/vosae-app
def es_index(self):
    conn = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
    conn.index(doc=self.get_search_kwargs(),
               index=self.tenant.slug,
               doc_type=self.Meta.document_type,
               id=unicode(self.id))
Code example #28
File: esbuilder.py Project: gkarthik/mygene.hub
class ESIndexerBase(object):
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        self.step = 10000

    def create_index(self):
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
#        index_type = self.ES_INDEX_TYPE
        #Check if index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''add a doc to the index. If id is not None, the existing doc will be
           updated.
        '''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''delete a doc from the index based on passed id.'''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE

        #Test if index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1

        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            #if no existing mapping available for index_type
            #force update_mapping to True
            empty_mapping = True
            update_mapping = True

#        empty_mapping = not cur_mapping[index_name].get(index_type, {})
#        if empty_mapping:
#            #if no existing mapping available for index_type
#            #force update_mapping to True
#            update_mapping = True

        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type,
                                   _mapping,
                                   [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
Code example #29
File: __init__.py Project: wildcardcorp/zelastic
class ElasticCatalog(object):
    default_indexes = {
        'zelastic_doc_id': {
            'type': 'string',
            'index': 'not_analyzed'
        }
    }

    def __init__(self, connection_string, elastic_name, storage, bulk=False,
                 bulk_size=400):
        self.conn = ES(connection_string, bulk_size=bulk_size)
        self.bulk_size = bulk_size
        self.name = elastic_name
        self.storage = storage
        self.bulk = bulk

    def update_mapping(self, name):
        meta = self.storage.meta(name)
        indexes = meta['indexes']
        properties = self.default_indexes.copy()
        try:
            self.conn.create_index(self.name)
        except IndexAlreadyExistsException:
            pass
        for index_name, _type in indexes.items():
            index = None
            if _type == 'str':
                index = {
                    'type': 'string',
                    'index': 'not_analyzed',
                }
            elif _type == 'full':
                index = {
                    'type': 'string',
                    'index': 'analyzed',
                }
            elif _type == 'bool':
                index = {
                    'type': 'boolean'
                }
            elif _type == 'int':
                index = {
                    'type': 'integer',
                }
            elif _type in ('datetime', 'date'):
                index = {
                    'type': 'date',
                }
            elif _type == 'float':
                index = {
                    'type': 'float',
                }
            if index is not None:
                properties[index_name] = index
        self.conn.indices.put_mapping(
            doc_type=name,
            mapping={
                'ignore_conflicts': True,
                'properties': properties
            },
            indices=[self.name])

    def id(self, container_name, key):
        return '%s-%s' % (container_name, key)

    def index(self, container_name, doc, key):
        # need to add data to the index that isn't actually persisted
        data = {
            'zelastic_doc_id': key
        }
        meta = self.storage.meta(container_name)
        indexes = meta['indexes']
        for index in indexes.keys():
            if index in doc:
                data[index] = doc[index]
        self.conn.index(
            data,
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete(self, container_name, key):
        self.conn.delete(
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete_all(self, container_name):
        self.conn.delete_mapping(
            self.name,
            container_name)

    def search(self, container_name, query, **kwargs):
        return self.conn.search(
            query,
            indexes=[self.name],
            doc_types=[container_name],
            **kwargs)

    def getFacets(self, container_name, field, size=100):
        return self.conn.search_raw({
            "facets": {
                field: {
                    "terms": {
                        "all_terms": True,
                        "field": field,
                        "size": size,
                        "order": "term"
                    }
                }
            }
        }, indexes=[self.name], doc_type=container_name)
Code example #30
File: elastic.py Project: shuxiang/pyweb-utils
class Elastic(object):

    def init_app(self, app):
        self.conn = ES(app.config['ELASTIC_URL'], timeout=2)
        #self.remote_conns = [ES(url) for url in app.config['REMOTE_ELASTIC_URL']]

    def search(self, start=0, size=20, doc_types='resource', indices='order_index', sort=None, **kwargs):
        # set filter
        filters = []
        for k,v in kwargs.items():
            if k and k!='complete_time':
                filters.append(TermFilter(k, v))
            elif k and v!='' and k=='complete_time':
                ct = kwargs['complete_time']
                if len(ct) == 2:
                    filters.append(RangeFilter(ESRange('complete_time', from_value=ct[0], to_value=ct[1])))
                else:
                    filters.append(RangeFilter(ESRange('complete_time', from_value=ct[0])))
        
        _filter = None
        if filters:
            _filter = ANDFilter(filters)

        bq = MatchAllQuery()
        # filtered
        q = FilteredQuery(bq, _filter)

        # sort
        if sort:
            sf = SortFactory()
            for s in sort:
                sf.add(s)
            s = Search(q, sort=sf)
        else:
            s = Search(q)

        # result
        return self.conn.search(s, indices=indices, doc_types=doc_types, start=start, size=size)

    def delete(self, index='order_index', doc_type='resource', id=''):
        return self.conn.delete(index=index, doc_type=doc_type, id=id)

    def create(self, index='order_index', doc_type='resource', doc=None):
        # try:
        #     self.delete(index, doc_type, doc['id'])
        # except NotFoundException:
        #     pass
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except:# not connection
            pass

    def multi_create(self, index='order_index', doc_type='resource', doc=None):
        """如果同步缓存到远程,要使用celery"""
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except:# not connection
            pass
            
        try:
            for rconn in self.remote_conns:
                rconn.index(doc, index, doc_type, id=doc['id'])
        except:
            print '--------sync cache to remote error------'
Code example #31
File: elasticsearch.py Project: jmf-mordis/tamarco
class ElasticSearchHandler(logging.Handler):
    """Logging handler that sends the logs to a Elasticsearch instance."""
    def __init__(
        self,
        conn_strs=None,
        record_type="record",
        level=logging.NOTSET,
        fqdn=False,
        service_name=None,
        deploy_name=None,
        version=0,
    ):
        """Initialize the handler.

        Args:
            conn_strs (list): List of Elasticsearch connections strings.
            record_type (str): The record type always will be 'record'.
            level (str): Logging level. Default: NOTSET
            fqdn (bool): If True, the host field in the log record will be the fully qualified domain. Otherwise,
                the system hostname.
            service_name (str): Service name.
            deploy_name (str): Deploy name.
            version (int): If 1 it is used the Logstash formatter version 1. Otherwise, the logstash formatter
                version 0.
        """
        logging.Handler.__init__(self, level=level)
        self.conn_strs = conn_strs if conn_strs else ["127.0.0.1:9200"]
        self.connected = False
        self.conn = None
        self.try_conn()
        self.record_type = record_type
        if version == 1:
            self.formatter = logstash.LogstashFormatterVersion1(
                record_type, fqdn, service_name, deploy_name)
        else:
            self.formatter = logstash.LogstashFormatterVersion0(
                record_type, fqdn, service_name, deploy_name)

    def try_conn(self):
        """Try a new connection to the Elasticsearch."""
        try:
            self.conn = ES(self.conn_strs, timeout=5)
            self.connected = True
        except NoServerAvailable:
            print("Error connecting to elasticsearch for logging")

    @property
    def index_name(self):
        """Construct the logs Elasticsearch index.

        Returns:
            string: Logstash index.
        """
        return "logstash-" + datetime.date.today().strftime("%Y.%m.%d")

    def emit(self, record):
        """Emit the specified log record.

        Args:
            record (LogRecord): Entry log to emit.
        """
        entry = self.formatter.format(record)
        if self.connected:
            self.conn.index(entry, self.index_name, self.record_type)
        else:
            self.try_conn()
Code example #32
class SampleMaker(object):
    def __init__(self, name):
        log = open(name, "wb")
        self.log = log
        self.conn = ES(("http", "127.0.0.1", 9200), timeout=300.0, log_curl=True, dump_curl=log)
        self.index_name = "test-index"
        self.document_type = "test-type"
        self.conn.delete_index_if_exists(self.index_name)
        self.init_default_index()


    def init_default_index(self):
        from pyes.helpers import SettingsBuilder
        settings = SettingsBuilder()
        from pyes.mappings import DocumentObjectField
        from pyes.mappings import IntegerField
        from pyes.mappings import NestedObject
        from pyes.mappings import StringField, DateField, BooleanField, GeoPointField, FloatField

        docmapping = DocumentObjectField(name=self.document_type)
        docmapping.add_property(
            StringField(name="description", store=True, term_vector="with_positions_offsets", index="analyzed"))
        docmapping.add_property(
            StringField(name="name", store=True, term_vector="with_positions_offsets", index="analyzed"))
        docmapping.add_property(StringField(name="tag", store=True, index="not_analyzed"))
        docmapping.add_property(IntegerField(name="age", store=True))
        docmapping.add_property(FloatField(name="price"))
        docmapping.add_property(DateField(name="date", store=True))
        docmapping.add_property(BooleanField(name="in_stock", store=True, index="not_analyzed"))
        docmapping.add_property(GeoPointField(name="position"))
        nested_object = NestedObject(name="metadata")
        nested_object.add_property(StringField(name="name", store=True))
        nested_object.add_property(StringField(name="value", store=True))
        nested_object.add_property(IntegerField(name="num", store=True))
        docmapping.add_property(nested_object)
        settings.add_mapping(docmapping)

        self.conn.ensure_index(self.index_name, settings)

    def generate_datafile(self, number_items=1000):
        """
        Generate a dataset with number_items elements.
        """

        names = get_names()
        totalnames = len(names)
        #init random seeder
        random.seed()
        #calculate items
        #    names = random.sample(names, number_items)
        for i in xrange(number_items):
            data = {"name": names[random.randint(0, totalnames - 1)],
                   "age": random.randint(1, 100),
                   "price": random.random()*100.0,
                   "tag":[words(1, False) for r in xrange(random.randint(1, 5))],
                   "in_stock": random.choice([True, False]),
                   "date": datetime.now()+timedelta(days=random.choice([1, -1])*random.randint(0,1000)),
                   "position": {
                       "lat" : random.choice([1, -1])* random.random()*90.0,
                        "lon" : random.choice([1, -1])* random.random()*180.0

                   },
                   "description": words(random.randint(1, 100), False),
                   "metadata":[{"name":names[random.randint(0, totalnames - 1)],
                                "value":str(random.randint(1, 5)), "num":random.randint(1, 50) } for r in xrange(random.randint(1, 5))]
                   }
            self.conn.index(data, self.index_name, self.document_type, id=str(i+1))


    def close(self):
        self.conn.flush(self.index_name)
        self.log.close()
Code example #33
File: process.py Project: mkramb/zaposlim.se
class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()

        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )

            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))

            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if not item.has_key('city'):
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)

            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place: item['pin'] = {
                    'location': { 'lat': lat, 'lon': lon }
                 }
        except: pass
        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']

        try:
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) == 1 ]
        cities_old_multi = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) > 1 ]

        for city in cities:
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name = city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single: cities_old_single.remove(city.name)
            if city.name in cities_old_multi: cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
Code example #34
class DocManager():
    """The DocManager class creates a connection to the backend engine and
        adds/removes documents, and in the case of rollback, searches for them.

        The reason for storing id/doc pairs as opposed to doc's is so that
        multiple updates to the same doc reflect the most up to date version as
        opposed to multiple, slightly different versions of a doc.

        We are using elastic native fields for _id and ns, but we also store
        them as fields in the document, due to compatibility issues.
        """

    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify Elastic URL and establish a connection.
        """

        if verify_url(url) is False:
            raise SystemError
        self.elastic = ES(server=url)
        self.auto_commit = auto_commit
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Elastic

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        """

        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc[self.unique_key])
        doc_id = doc[self.unique_key]
        id_query = TextQuery('_id', doc_id)
        elastic_cursor = self.elastic.search(query=id_query, indices=index)

        try:
            self.elastic.index(bsjson.dumps(doc), index, doc_type, doc_id)
        except ValueError:
            logging.info("Could not update %s" % (doc,))
        self.elastic.refresh()

    def remove(self, doc):
        """Removes documents from Elastic

        The input is a python dictionary that represents a mongo document.
        """
        try:
            self.elastic.delete(doc['ns'], 'string', str(doc[self.unique_key]))
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def _remove(self):
        """For test purposes only. Removes all documents in test.test
        """
        try:
            self.elastic.delete('test.test', 'string', '')
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def search(self, start_ts, end_ts):
        """Called to query Elastic for documents in a time range.
        """
        res = ESRange('_ts', from_value=start_ts, to_value=end_ts)
        results = self.elastic.search(RangeQuery(res))
        return results

    def _search(self):
        """For test purposes only. Performs search on Elastic with empty query.
        Does not have to be implemented.
        """
        results = self.elastic.search(MatchAllQuery())
        return results

    def commit(self):
        """This function is used to force a refresh/commit.
        """
        retry_until_ok(self.elastic.refresh)

    def run_auto_commit(self):
        """Periodically commits to the Elastic server.
        """
        self.elastic.refresh()

        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine.
        """

        result = self.elastic.search(MatchAllQuery(), size=1, sort='_ts:desc')
        for item in result:
            return item
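A minimal usage sketch for the DocManager above (the URL, namespace and document fields are illustrative assumptions, and verify_url() is assumed to accept the address):

dm = DocManager('localhost:9200', auto_commit=False)
dm.upsert({'_id': '1', 'ns': 'test.test', 'name': 'foo', '_ts': 1})
docs = dm.search(0, 10)  # documents whose _ts lies between 0 and 10
dm.remove({'_id': '1', 'ns': 'test.test'})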
Code example #35
import json

from tools.FileTools import FileTools
from tools.FormatTranslator import FormatTranslator
from pyes import ES
from pyes.query import MatchAllQuery, Search
from pyes.aggs import TermsAgg
  
  
ftool = FileTools()
ftrans = FormatTranslator() 
  
# 1. Create Connection
conn = ES()
  
# 2. Index Data
dataset_json = open("../dataset.json")
dataset = json.load(dataset_json)['data']
for data in dataset:
    conn.index(data, "example_index", "example_type", "example_id_"+str(dataset.index(data)))
      
# 3. Create Simple Query
query = MatchAllQuery()
  
# 4. Create Simple Aggregation
agg = TermsAgg('agg1', field="name",sub_aggs=[],size=100)
  
# 5. Get Result
search = Search(query,size=5)
search.agg.add(agg)
print search.serialize()
  
result = conn.search(search, "example_index", "example_type" )
  
for i in result:
    print i
Code Example #36
0
def import_instruments(instrs, es_url, index, alias):
    """Create JSON ES docs and import."""

    prefix = {
        "bibo": "http://purl.org/ontology/bibo/",
        "dcterms": "http://purl.org/dc/terms/",
        "eos": "http://nasa.gov/eos.owl#",
        "gcis": "http://data.globalchange.gov/gcis.owl#",
        "hysds": "http://hysds.jpl.nasa.gov/hysds/0.1#",
        "info": "http://info-uri.info/",
        "xlink": "http://www.w3.org/1999/xlink"
    }

    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)

    # track agencies/organizations
    orgs = {}

    for instr in instrs:
        identifier = "eos:%s" % instr['Instrument Name Short']
        id = hashlib.md5(identifier).hexdigest()
        if 'Instrument Technology' in instr and not EMPTY.search(instr['Instrument Technology']):
            sensor = "eos:%s" % instr['Instrument Technology']
        else:
            if 'Instrument Type' in instr and not EMPTY.search(instr['Instrument Type']):
                sensor = "eos:%s" % instr['Instrument Type']
            else:
                if 'Subtype' in instr and not EMPTY.search(instr['Subtype']):
                    sensor = "eos:%s" % instr['Subtype']
                else:
                    if 'Type' in instr and not EMPTY.search(instr['Type']):
                        sensor = "eos:%s" % instr['Type']
                    else:
                        if 'Class' in instr and not EMPTY.search(instr['Class']):
                            sensor = "eos:%s" % instr['Class']
                        else:
                            sensor = None
        #print(instr['Instrument Technology'], sensor)
        platform = None
        if 'Instrument Agencies' in instr and not EMPTY.search(instr['Instrument Agencies']):
            org = "eos:%s" % instr['Instrument Agencies']
            if org not in orgs:
               orgs[org] = {
                   "prov_es_json": {
                       "prefix": prefix,
                       "agent": {
                           org: {
                               "prov:type": {
                                   "type": "prov:QualifiedName",
                                   "$": "prov:Organization",
                               },
                           },
                       },
                   },
                   "identifier": org,
                   "prov:type": "prov:Organization",
               }
               if len(conn.search(query=TermQuery("_id", org),
                                  indices=[alias])) > 0: pass
               else: conn.index(orgs[org], index, 'agent', org)
        else: org = None
        doc = {
            "prov_es_json": {
                "prefix": prefix,
                "entity": {
                    identifier: {
                        "gcis:hasSensor": sensor,
                        "gcis:inPlatform": platform,
                        "prov:type": "eos:instrument",
                        "gcis:hasGoverningOrganization": org,
                    },
                },
            },
            "gcis:hasSensor": sensor,
            "gcis:inPlatform": platform,
            "prov:type": "eos:instrument",
            "gcis:hasGoverningOrganization": org,
            "identifier": identifier,
        }
        if len(conn.search(query=TermQuery("_id", identifier),
                           indices=[alias])) > 0: pass
        else: conn.index(doc, index, 'entity', identifier)
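
The nested fallbacks above simply take the first non-empty candidate field from a fixed priority list; a minimal equivalent sketch (EMPTY is the same compiled regex the original relies on):

def pick_sensor(instr):
    """Return the first usable candidate as an "eos:" sensor, or None."""
    # EMPTY is the compiled "empty value" regex the surrounding module already uses
    for field in ('Instrument Technology', 'Instrument Type', 'Subtype', 'Type', 'Class'):
        value = instr.get(field)
        if value and not EMPTY.search(value):
            return "eos:%s" % value
    return None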
Code Example #37
0
File: archive_and_index.py Project: ttimasdf/pyes
def ext_process(listname, hostname, url, filepath, msg):
    """Here's where you put your code to deal with the just archived message.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message and
    the message object.

    These can be replaced or augmented as needed.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime

    #CHANGE these settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500']  # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        try:
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except:
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = {
                u'text': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'url': {
                    'store': 'true',
                    'type': u'keyword'
                },
                u'title': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'date': {
                    'store': 'true',
                    'type': u'date'
                }
            }
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)

        data = dict(url=url,
                    title=msg.get('subject'),
                    date=date,
                    text=str(msg))
        iconn.index(data, _indexname, _doctype)

        syslog('debug',
               'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog(
            'error',
            'Cluster in recovery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog(
            'error',
            'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except:
        import traceback
        syslog(
            'error',
            'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
            listname, hostname, url, filepath, msg,
            repr(traceback.format_exc()))

    return
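
A minimal read-back sketch for the index created above (server list, index and type names follow the snippet; the loop is illustrative):

from pyes import ES, MatchAllQuery

# same thrift endpoint, index and type as the indexing code above
conn = ES(['127.0.0.1:9500'])
for hit in conn.search(MatchAllQuery(), indices=['mailman'], doc_types=['mail']):
    print hit['title']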
Code Example #38
0
def import_instruments(instrs, es_url, index, alias):
    """Create JSON ES docs and import."""

    prefix = {
        "bibo": "http://purl.org/ontology/bibo/",
        "dcterms": "http://purl.org/dc/terms/",
        "eos": "http://nasa.gov/eos.owl#",
        "gcis": "http://data.globalchange.gov/gcis.owl#",
        "hysds": "http://hysds.jpl.nasa.gov/hysds/0.1#",
        "info": "http://info-uri.info/",
        "xlink": "http://www.w3.org/1999/xlink"
    }

    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)

    # track agencies/organizations
    orgs = {}

    for instr in instrs:
        identifier = "eos:%s" % instr['Instrument Name Short']
        id = hashlib.md5(identifier).hexdigest()
        if 'Instrument Technology' in instr and not EMPTY.search(
                instr['Instrument Technology']):
            sensor = "eos:%s" % instr['Instrument Technology']
        else:
            if 'Instrument Type' in instr and not EMPTY.search(
                    instr['Instrument Type']):
                sensor = "eos:%s" % instr['Instrument Type']
            else:
                if 'Subtype' in instr and not EMPTY.search(instr['Subtype']):
                    sensor = "eos:%s" % instr['Subtype']
                else:
                    if 'Type' in instr and not EMPTY.search(instr['Type']):
                        sensor = "eos:%s" % instr['Type']
                    else:
                        if 'Class' in instr and not EMPTY.search(
                                instr['Class']):
                            sensor = "eos:%s" % instr['Class']
                        else:
                            sensor = None
        #print(instr['Instrument Technology'], sensor)
        platform = None
        if 'Instrument Agencies' in instr and not EMPTY.search(
                instr['Instrument Agencies']):
            org = "eos:%s" % instr['Instrument Agencies']
            if org not in orgs:
                orgs[org] = {
                    "prov_es_json": {
                        "prefix": prefix,
                        "agent": {
                            org: {
                                "prov:type": {
                                    "type": "prov:QualifiedName",
                                    "$": "prov:Organization",
                                },
                            },
                        },
                    },
                    "identifier": org,
                    "prov:type": "prov:Organization",
                }
                if len(
                        conn.search(query=TermQuery("_id", org),
                                    indices=[alias])) > 0:
                    pass
                else:
                    conn.index(orgs[org], index, 'agent', org)
        else:
            org = None
        doc = {
            "prov_es_json": {
                "prefix": prefix,
                "entity": {
                    identifier: {
                        "gcis:hasSensor": sensor,
                        "gcis:inPlatform": platform,
                        "prov:type": "eos:instrument",
                        "gcis:hasGoverningOrganization": org,
                    },
                },
            },
            "gcis:hasSensor": sensor,
            "gcis:inPlatform": platform,
            "prov:type": "eos:instrument",
            "gcis:hasGoverningOrganization": org,
            "identifier": identifier,
        }
        if len(conn.search(query=TermQuery("_id", identifier),
                           indices=[alias])) > 0:
            pass
        else:
            conn.index(doc, index, 'entity', identifier)
Code Example #39
0
from pyes import ES

es = ES()

index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping

create_and_add_mapping(es, index_name, type_name)

es.index(doc={
    "name": "Joe Tester",
    "parsedtext": "Joe Testere nice guy",
    "uuid": "11111",
    "position": 1
},
         index=index_name,
         doc_type=type_name,
         id=1)
es.index(doc={
    "name": "data1",
    "value": "value1"
},
         index=index_name,
         doc_type=type_name + "2",
         id=1,
         parent=1)
es.index(doc={
    "name": "Bill Baloney",
    "parsedtext": "Bill Testere nice guy",
    "uuid": "22222",
    "position": 2
},
         index=index_name,
         doc_type=type_name,
         id=2,
         bulk=True)
Code Example #40
0
def insertElasticsearch(_index, _type, data):
    # conn = ES('127.0.0.1:9200', timeout=3.5)
    # conn = ES('192.168.30.63:9200', timeout=3.5)
    # conn = ES('140.92.13.186:9200', timeout=3.5)
    conn = ES('localhost:9200', timeout=3.5)
    conn.index(data, _index, _type)
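
For example (the index, type, and payload here are illustrative):

# illustrative index, type and payload
insertElasticsearch("example_index", "example_type", {"name": "test", "value": 1})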
Code Example #41
0
import json

from pyes import ES, Search, MatchAllQuery
from tools.FileTools import FileTools
from tools.FormatTranslator import FormatTranslator
from pyes.aggs import TermsAgg

ftool = FileTools()
ftrans = FormatTranslator()

# 1. Create Connection
conn = ES()

# 2. Index Data
dataset_json = open("../dataset.json")
dataset = json.load(dataset_json)['data']
for data in dataset:
    conn.index(data, "example_index", "example_type",
               "example_id_" + str(dataset.index(data)))

# 3. Create Simple Query
query = MatchAllQuery()

# 4. Create Simple Aggregation
agg = TermsAgg('agg1', field="name", sub_aggs=[], size=100)

# 5. Get Result
search = Search(query, size=5)
search.agg.add(agg)
print search.serialize()

result = conn.search(search, "example_index", "example_type")

for i in result:
    print i
Code Example #42
0
File: performance.py Project: akheron/pyes
dataset = shelve.open("samples.shelve")

mapping = { u'description': {'boost': 1.0,
                 'index': 'analyzed',
                 'store': 'yes',
                 'type': u'string',
                 "term_vector" : "with_positions_offsets"
                 },
         u'name': {'boost': 1.0,
                    'index': 'analyzed',
                    'store': 'yes',
                    'type': u'string',
                    "term_vector" : "with_positions_offsets"
                    },
         u'age': {'store': 'yes',
                    'type': u'integer'},    
                    }
conn.create_index("test-index")
conn.put_mapping("test-type", {'properties':mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
#    conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()

print "time:", end-start
dataset.close()

Code Example #43
0
File: performance.py Project: glitchdotcom/pyes
dataset = shelve.open("samples.shelve")

mapping = { u'description': {'boost': 1.0,
                 'index': 'analyzed',
                 'store': 'yes',
                 'type': u'string',
                 "term_vector" : "with_positions_offsets"
                 },
         u'name': {'boost': 1.0,
                    'index': 'analyzed',
                    'store': 'yes',
                    'type': u'string',
                    "term_vector" : "with_positions_offsets"
                    },
         u'age': {'store': 'yes',
                    'type': u'integer'},
                    }
conn.create_index("test-index")
conn.put_mapping("test-type", {'properties':mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
#    conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()

print "time:", end - start
dataset.close()

Code Example #44
0
from pyes import ES

es = ES()

index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping

create_and_add_mapping(es, index_name, type_name)

es.index(doc={"name": "Joe Tester", "parsedtext": "Joe Testere nice guy", "uuid": "11111", "position": 1},
         index=index_name, doc_type=type_name, id=1)
es.index(doc={"name": "data1", "value": "value1"}, index=index_name, doc_type=type_name + "2", id=1, parent=1)
es.index(doc={"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy", "uuid": "22222", "position": 2},
         index=index_name, doc_type=type_name, id=2, bulk=True)
es.index(doc={"name": "data2", "value": "value2"}, index=index_name, doc_type=type_name + "2", id=2, parent=2,
         bulk=True)
es.index(doc={"name": "Bill Clinton", "parsedtext": """Bill is not
        nice guy""", "uuid": "33333", "position": 3}, index=index_name, doc_type=type_name, id=3, bulk=True)

es.force_bulk()

es.update(index=index_name, doc_type=type_name, id=2, script='ctx._source.position += 1')
es.update(index=index_name, doc_type=type_name, id=2, script='ctx._source.position += 1', bulk=True)

es.delete(index=index_name, doc_type=type_name, id=1, bulk=True)
es.delete(index=index_name, doc_type=type_name, id=3)

es.force_bulk()
es.indices.refresh(index_name)
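
A minimal read-back sketch for the surviving document (the id follows the demo above, where id=1 and id=3 were deleted):

# id=2 is the only document left after the deletes above
doc = es.get(index_name, type_name, 2)
print doc['name'], doc['position']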
Code Example #45
0
class SampleMaker(object):
    def __init__(self, name):
        log = open(name, "wb")
        self.log = log
        self.conn = ES(("http", "127.0.0.1", 9200),
                       timeout=300.0,
                       log_curl=True,
                       dump_curl=log)
        self.index_name = "test-index"
        self.document_type = "test-type"
        self.conn.delete_index_if_exists(self.index_name)
        self.init_default_index()

    def init_default_index(self):
        from pyes.helpers import SettingsBuilder
        settings = SettingsBuilder()
        from pyes.mappings import DocumentObjectField
        from pyes.mappings import IntegerField
        from pyes.mappings import NestedObject
        from pyes.mappings import StringField, DateField, BooleanField, GeoPointField, FloatField

        docmapping = DocumentObjectField(name=self.document_type)
        docmapping.add_property(
            StringField(name="description",
                        store=True,
                        term_vector="with_positions_offsets",
                        index="analyzed"))
        docmapping.add_property(
            StringField(name="name",
                        store=True,
                        term_vector="with_positions_offsets",
                        index="analyzed"))
        docmapping.add_property(
            StringField(name="tag", store=True, index="not_analyzed"))
        docmapping.add_property(IntegerField(name="age", store=True))
        docmapping.add_property(FloatField(name="price"))
        docmapping.add_property(DateField(name="date", store=True))
        docmapping.add_property(
            BooleanField(name="in_stock", store=True, index="not_analyzed"))
        docmapping.add_property(GeoPointField(name="position"))
        nested_object = NestedObject(name="metadata")
        nested_object.add_property(StringField(name="name", store=True))
        nested_object.add_property(StringField(name="value", store=True))
        nested_object.add_property(IntegerField(name="num", store=True))
        docmapping.add_property(nested_object)
        settings.add_mapping(docmapping)

        self.conn.ensure_index(self.index_name, settings)

    def generate_datafile(self, number_items=1000):
        """
        Generate a dataset with number_items elements.
        """

        names = get_names()
        totalnames = len(names)
        #init random seeder
        random.seed()
        #calculate items
        #    names = random.sample(names, number_items)
        for i in xrange(number_items):
            data = {
                "name":
                names[random.randint(0, totalnames - 1)],
                "age":
                random.randint(1, 100),
                "price":
                random.random() * 100.0,
                "tag": [words(1, False) for r in xrange(random.randint(1, 5))],
                "in_stock":
                random.choice([True, False]),
                "date":
                datetime.now() + timedelta(days=random.choice([1, -1]) *
                                           random.randint(0, 1000)),
                "position": {
                    "lat": random.choice([1, -1]) * random.random() * 90.0,
                    "lon": random.choice([1, -1]) * random.random() * 180.0
                },
                "description":
                words(random.randint(1, 100), False),
                "metadata": [{
                    "name": names[random.randint(0, totalnames - 1)],
                    "value": str(random.randint(1, 5)),
                    "num": random.randint(1, 50)
                } for r in xrange(random.randint(1, 5))]
            }
            self.conn.index(data,
                            self.index_name,
                            self.document_type,
                            id=str(i + 1))

    def close(self):
        self.conn.flush(self.index_name)
        self.log.close()
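
A minimal usage sketch of the class above (the curl-dump file name is illustrative):

# the curl-dump file name is illustrative
maker = SampleMaker("curl_dump.log")
maker.generate_datafile(number_items=100)
maker.close()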
Code Example #46
0
File: archive_and_index.py Project: 0x64746b/pyes
def ext_process(listname, hostname, url, filepath, msg):
    """Here's where you put your code to deal with the just archived message.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message and
    the message object.

    These can be replaced or augmented as needed.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime

    #CHANGE these settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500'] # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        try:
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except:
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = { u'text': {'boost': 1.0,
                                     'index': 'analyzed',
                                     'store': 'yes',
                                     'type': u'string',
                                     "term_vector" : "with_positions_offsets"},
                             u'url': {'boost': 1.0,
                                        'index': 'not_analyzed',
                                        'store': 'yes',
                                        'type': u'string',
                                        "term_vector" : "no"},
                             u'title': {'boost': 1.0,
                                        'index': 'analyzed',
                                        'store': 'yes',
                                        'type': u'string',
                                        "term_vector" : "with_positions_offsets"},
                             u'date': {'store': 'yes',
                                        'type': u'date'}}
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)


        data = dict(url=url,
                    title=msg.get('subject'),
                    date=date,
                    text=str(msg)
                    )
        iconn.index(data, _indexname, _doctype)

        syslog('debug', 'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog('error', 'Cluster in recovery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog('error', 'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except:
        import traceback
        syslog('error', 'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
               listname, hostname, url, filepath, msg, repr(traceback.format_exc()))

    return
Code Example #47
0
File: CMSElastic.py Project: elasticaso/WMCore
class ElasticSearchServer(ESDBRequests):
    """
    An object representing the CouchDB server, use it to list, create, delete
    and connect to databases.

    More info http://wiki.apache.org/couchdb/HTTP_database_API
    """

    def __init__(self, indices, types, dburl='http://localhost:9200', usePYCurl=False, ckey=None, cert=None, capath=None):
        """
        Set up a connection to the Elasticsearch server
        """
        check_server_url(dburl)
        # PYCurl TODO
        # Same with cert and key
        self.url = dburl
        self.ESconn = ES(dburl)
        self.ckey = ckey
        self.cert = cert
        check_name(indices)
        check_name(types)
        self.indices = indices
        self.types = types

    def listDatabases(self):
        "List all the databases the server hosts"
        # TODO
        return self.get('/_all_dbs')

    def createDatabase(self, schema):
        """
        A database must be named with all lowercase characters (a-z),
        digits (0-9), or any of the _$()+-/ characters and must end with a slash
        in the URL.
        """
        self.ESconn.indices.create_index_if_missing(self.indices)
        self.ESconn.indices.put_mapping(self.types, {'properties': schema}, [self.indices])

    def insertDoc(self, doc, _id):
        """ TODO """
        self.ESconn.index(doc, self.indices, self.types, _id)

    def deleteDoc(self,  _id):
        self.ESconn.delete(self.indices, self.types, _id)

    def termBoolQuery(self, query):
        """ query - dict
            must:
                key = key in the database
                value = searchable value
            should
                key = key in the database
                value = searchable value
            must_not
                key = key in the database
                value = searchable value
        """
        queryMust = []
        queryShould = []
        queryMustNot = []
        for item in ["must", "should", "must_not"]:
            if item in query:
                for dictVals in query[item]:
                    for dictKey in dictVals:
                        tempq = TermQuery(dictKey, dictVals[dictKey])
                        if item == "must":
                            queryMust.append(tempq)
                        elif item == "should":
                            queryShould.append(tempq)
                        elif item == "must_not":
                            queryMustNot.append(tempq)
        query = BoolQuery(must=None if not queryMust else queryMust,
                          should=None if not queryShould else queryShould,
                          must_not=None if not queryMustNot else queryMustNot)

        search = Search(query)
        results = self.ESconn.search(search, self.indices)
        response = {"status_code": 200, "message": "Successful", "content": []}
        response["content"] = [result for result in results]
        return response
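
A hedged usage sketch of the class above (index, type, schema, and document values are illustrative):

# index, type, schema and document values are illustrative
server = ElasticSearchServer(indices='jobs', types='job', dburl='http://localhost:9200')
server.createDatabase({'name': {'type': 'string'}})
server.insertDoc({'name': 'merge'}, 'job-1')
print server.termBoolQuery({'must': [{'name': 'merge'}]})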
Code Example #48
0

import collections
import json

import MySQLdb
import MySQLdb.cursors
from pyes import ES

conn = ES('localhost:9200')
type_name = 'shrimp'
es_index = 'shrimp'

db = MySQLdb.connect(
    host="localhost",  # your host, usually localhost
    user="******",  # your username
    passwd="password",  # your password
    db="shrimp",  # name of the data base
    cursorclass=MySQLdb.cursors.SSCursor)

# you must create a Cursor object. It will let
#  you execute all the queries you need
cur = db.cursor()

# Use all the SQL you like
cur.execute("select * from PERIODS;")
row = cur.fetchone()
while row is not None:
    MysqlToES = collections.OrderedDict()
    MysqlToES['period'] = row[4]
    MysqlToES['a'] = row[1]
    MysqlToES['b'] = row[2]
    MysqlToES['c'] = row[3]
    MyESJson = json.dumps(MysqlToES)
    thisid = id_generator()
    conn.index(MyESJson, es_index, type_name, id=thisid)
    row = cur.fetchone()
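
A minimal sketch of checking the import afterwards (uses the index name from the snippet; len() of a pyes result set is its hit count):

from pyes import MatchAllQuery

# refresh so the freshly indexed rows are visible, then count them
conn.indices.refresh(es_index)
print len(conn.search(MatchAllQuery(), indices=[es_index]))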
Code Example #49
0
import json
import sys

import requests
from pyes import ES

# get source and destination index
src = sys.argv[1]
dest = sys.argv[2]

# get connection and create destination index
conn = ES(es_url)
if not conn.indices.exists_index(dest):
    conn.indices.create_index(dest)

# index all docs from source index to destination index
query = {
  "fields": "_source",
  "query": {
    "match_all": {}
  }
}
r = requests.post('%s/%s/_search?search_type=scan&scroll=60m&size=100' % (es_url, src), data=json.dumps(query))
scan_result = r.json()
count = scan_result['hits']['total']
scroll_id = scan_result['_scroll_id']
results = []
while True:
    r = requests.post('%s/_search/scroll?scroll=60m' % es_url, data=scroll_id)
    res = r.json()
    scroll_id = res['_scroll_id']
    if len(res['hits']['hits']) == 0: break
    for hit in res['hits']['hits']:
        doc = hit['_source']
        conn.index(hit['_source'], dest, hit['_type'], hit['_id'])
        print "indexed %s" % hit['_id']