Example #1
def get_elasticsearch_connection():
    es_conn = env.get_service(label='elasticsearch-swarm-1.7.1')
    if es_conn:
        es = ElasticSearch(es_conn.get_url(url='uri'))
    else:
        es = ElasticSearch('http://localhost:9200')
    return es
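A minimal usage sketch for the helper above (assuming pyelasticsearch is importable and, when no bound service is found, a local node answers on localhost:9200):

from pyelasticsearch import ElasticSearch  # dependency used throughout these examples

es = get_elasticsearch_connection()
# Rough connectivity check; count('*') returns a dict with a 'count' key.
print(es.count('*')['count'])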
Example #2
    def set_in_index(self, documentList):
        """
        Store the list of documents in the Elasticsearch index via HTTP APIs

        @type  documentList: List
        @param documentList: List of image layer JSON documents
        """
        # Get the Elasticsearch address from the config file
        cfg = config.load()

        # Store the document list in Elasticsearch
        es = ElasticSearch(cfg.search_options.get("address"))
        try:
            es.bulk_index(cfg.search_options.get("index"),
                          cfg.search_options.get("type"),
                          documentList,
                          id_field='id')
        except InvalidJsonResponseError:
            logger.debug("InvalidJsonResponseError!")
        except Timeout:
            logger.debug("Timeout!")
        except ConnectionError:
            logger.debug("ConnectionError!")
        except ElasticHttpError:
            logger.debug("ElasticHttpError!")
        except ElasticHttpNotFoundError:
            logger.debug("ElasticHttpNotFoundError!")
Example #3
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print(query)
        res = es.search(query, index=es_index, doc_type=es_doc_type)
        hits = res['hits']
        print('Document found: %d' % hits['total'])
        return hits['hits']
Example #4
def init_es(
    hosts=_hosts,
    port=9200,
):
    urls = ['http://' + host for host in hosts]  # a real list, not a lazy map
    es = ElasticSearch(urls=urls, port=port, timeout=2 * 60)
    return es
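A usage sketch, assuming _hosts is a module-level default such as ['localhost']:

_hosts = ['localhost']  # hypothetical module-level default

es = init_es()
print(es.count('*')['count'])  # quick connectivity check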
Example #5
def init_schema():
    """Should be called at application startup. Makes sure the mappings and
    index exist."""
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    try:
        es.create_index(settings.ELASTIC_SEARCH_INDEX)
    except IndexAlreadyExistsError:
        pass

    #   Does not replace if exact mapping already exists
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                   {'reg_tree': {
                       'properties': NODE_SEARCH_SCHEMA
                   }})
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'layer',
                   {'layer': {
                       'properties': LAYER_SCHEMA
                   }})
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'notice',
                   {'notice': {
                       'properties': LAYER_SCHEMA
                   }})
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'diff',
                   {'diff': {
                       'properties': DIFF_SCHEMA
                   }})
Example #6
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print(query)
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #7
def get_image(url):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX']
                        if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE']
                        if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')

        hits = res['hits']['hits']
        if hits:
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print("No thumbnail found")
        else:
            print("No thumbnail found")
    return [None, None]
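A hedged usage sketch; the URL is illustrative and the [name, bytes] return shape comes from the function above:

img_name, img = get_image('http://example.com/some-page')
if img is not None:
    with open(img_name or 'thumbnail.png', 'wb') as f:
        f.write(img)  # already base64-decoded bytes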
Example #8
    def __init__(self,
                 es_url='http://localhost:9200/',
                 batch_size=10,
                 **kwargs):
        """
        Do what is necessary to create/open the index.
        """
        self.batch_size = batch_size
        self.batch_count = 0
        self.es_url = es_url
        self.fast = kwargs.get('fast', False)
        if kwargs.get('noisy', False):
            from logging import getLogger, StreamHandler, DEBUG
            import sys
            logger = getLogger('pyelasticsearch')
            logger.setLevel(DEBUG)
            logger.addHandler(StreamHandler(sys.stdout))

        self.es = ElasticSearch(self.es_url)
        try:
            self.es.count('*')
        except ConnectionError:
            print "Error connecting to ElasticSearch server!"
            raise
        # Track URLs to be deleted before committing new content
        self.urls = defaultdict(set)
        self.batches = defaultdict(list)  # site: [list of docs]
Example #9
    def __init__(self, config=None):
        """
        :param config: A dict containing at least es_endpoint field.

        .. code-block:: python

            {
                "es_endpoint": "http://localhost:9200",
                "es_namespace": "test1"
            }

        """
        config = config or {}  # avoid the shared mutable default argument
        self.log = logging.getLogger("%s.ElasticSearchHelper" % __name__)
        self.base_uri = config.get('es_endpoint', 'http://localhost:9200')
        self.log.info("Using ElasticSearch Endpoint '{0}'".format(
            self.base_uri))
        self.namespace = config.get('es_namespace', '')
        self.index = '{0}documents'.format(self.namespace)
        self.doc_type = 'fields'
        self.document_path = '/{0}/{1}'.format(self.index, self.doc_type)
        self.search_path = '{0}/_search'.format(self.document_path)
        self.document_uri = urljoin(self.base_uri, self.document_path)
        self.search_uri = urljoin(self.base_uri, self.search_path)
        self.log.info("document_uri is '{0}' search_uri is '{0}'".format(
            self.document_uri, self.search_uri))
        self.conn = ElasticSearch(self.base_uri)
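A construction sketch, assuming the enclosing class is named ElasticSearchHelper as its logger name suggests:

helper = ElasticSearchHelper({
    "es_endpoint": "http://localhost:9200",
    "es_namespace": "test1",
})
# helper.index == 'test1documents'
# helper.search_path == '/test1documents/fields/_search'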
Example #10
def get_context(terms):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(terms) > 0:

        query = {
            "query": { 
                "match": {
                    "text": {
                        "query": ' and  '.join(terms[0:]),
                        "operator" : "and"
                    }
                }
             },
            "highlight" : {
                "fields" : {
                    "text": {
                        "fragment_size" : 100, "number_of_fragments" : 1
                    }
                }
            }
        }
        print(query)
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']
        print('Document found: %d' % hits['total'])
        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'])
        return highlights
Example #11
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": queryStr,
                        "operator": "and"
                    }
                }
            },
            "fields": ["url"]
        }
        print(query)
        res = es.search(query, index='memex', doc_type='page', size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #12
def get_image(url, output_path=""):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if output_path:
        output_path = output_path+'/'

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail"]
        }
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']['hits']  # the actual hit list, not the outer dict
        if hits:
            img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
            # urllib2.quote on Python 2; urllib.parse.quote on Python 3
            name = output_path + urllib2.quote(url).replace("/", "%2F") + '.png'
            with open(name, 'wb') as f:
                f.write(img)
        else:
            print("No thumbnail found")
Example #13
def delete(config, tree_names, all, force):
    """Delete indices and their catalog entries.
    
    This deletes the indices that have the format version of the copy of DXR
    this runs under.
    
    """
    es = ElasticSearch(config.es_hosts)
    if all:
        echo('Deleting catalog...')
        es.delete_index(config.es_catalog_index)
        # TODO: Delete tree indices as well.
    else:
        for tree_name in tree_names:
            frozen_id = '%s/%s' % (FORMAT, tree_name)
            try:
                frozen = es.get(config.es_catalog_index, TREE, frozen_id)
            except ElasticHttpNotFoundError:
                raise ClickException('No tree "%s" in catalog.' % tree_name)
            # Delete the index first. That way, if that fails, we can still
            # try again; we won't have lost the catalog entry. Refresh is
            # infrequent enough that we wouldn't avoid a race around a
            # catalogued but deleted instance the other way around.
            try:
                es.delete_index(frozen['_source']['es_alias'])
            except ElasticHttpNotFoundError:
                # It's already gone. Fine. Just remove the catalog entry.
                pass
            es.delete(config.es_catalog_index, TREE, frozen_id)
Example #14
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match": "100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print(query)
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX']
                        if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE']
                        if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)

        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #15
def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']
        # Drop any stale index for this segment before re-creating it
        try:
            es.delete_index(index.replace(" ", ""))
        except ElasticHttpNotFoundError:
            pass
        es.create_index(index.replace(" ", ""))

        # Convert the dataframe to records and bulk-index them; these steps
        # belong inside the loop (in the original they ran only for the
        # last file)
        docs = json.loads(data_for_es.to_json(orient='records'))
        es.bulk((es.index_op(doc) for doc in docs),
                index=index.replace(" ", ""),
                doc_type=index)

        # Create the segment template
        file_names = []
        for raw in rawfiles:
            file_names.append(raw.name)

        segment = Segments(name=index,
                           files_added=",".join(file_names),
                           es_index=index.replace(" ", ""))
        segment.save()

    segment = Segments.objects.get(name=index)

    return render(request, 'analyse.html', {'segment': segment})
Example #16
def includeme(config):
    settings = config.registry.settings

    search_enabled = asbool(settings.get('search.enabled', False))
    logger.info('elastic_search_enabled=%s' % search_enabled)

    # Enable searching?
    if not search_enabled:
        config.add_request_method(lambda request: {'enabled': False},
                                  'search_settings',
                                  reify=True)
        return

    search_settings = _get_search_settings(settings)

    global _es_client

    if _es_client is None:
        _es_client = ElasticSearch('http://%(host)s:%(port)s/' %
                                   search_settings)

    config.add_request_method(get_search_settings,
                              'search_settings',
                              reify=True)

    config.add_request_method(lambda request: SafeEs(_es_client),
                              'es',
                              reify=True)
Example #17
def update_process_datetime(doc_id, timestamp):
    '''Update the last_update_date for the document id passed into the
    function. The document id will be the name of another index in the
    cluster.
    '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'
    _map = {
        _type: {
            'properties': {
                'last_update_date': {
                    'type': 'date',
                    'format': 'dateOptionalTime',
                },
            },
        },
    }

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except Exception:
        logging.fatal('Could not create the mapping')

    new_doc = {'last_update_date': timestamp}
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)
Example #18
def get_es(urls=None, timeout=DEFAULT_TIMEOUT, force_new=False, **settings):
    """Create a pyelasticsearch `ElasticSearch` object and return it.

    This will aggressively re-use `ElasticSearch` objects with the
    following rules:

    1. if you pass the same argument values to `get_es()`, then it
       will return the same `ElasticSearch` object
    2. if you pass different argument values to `get_es()`, then it
       will return a different `ElasticSearch` object
    3. it caches each `ElasticSearch` object that gets created
    4. if you pass in `force_new=True`, then you are guaranteed to get
       a fresh `ElasticSearch` object AND that object will not be
       cached

    :arg urls: list of uris; ElasticSearch hosts to connect to,
        defaults to ``['http://localhost:9200']``
    :arg timeout: int; the timeout in seconds, defaults to 5
    :arg force_new: Forces get_es() to generate a new ElasticSearch
        object rather than pulling it from cache.
    :arg settings: other settings to pass into the ElasticSearch
        constructor. See
        `<http://pyelasticsearch.readthedocs.org/en/latest/api/>`_ for
        more details.

    Examples::

        # Returns cached ElasticSearch object
        es = get_es()

        # Returns a new ElasticSearch object
        es = get_es(force_new=True)

        es = get_es(urls=['http://localhost:9200'])

        es = get_es(urls=['http://localhost:9200'], timeout=10,
                    max_retries=3)

    """
    # Cheap way of de-None-ifying things
    urls = urls or DEFAULT_URLS

    # v0.7: Check for 'hosts' instead of 'urls'. Take this out in v1.0.
    if 'hosts' in settings:
        raise DeprecationWarning('"hosts" is deprecated in favor of "urls".')

    if not force_new:
        key = _build_key(urls, timeout, **settings)
        if key in _cached_elasticsearch:
            return _cached_elasticsearch[key]

    es = ElasticSearch(urls, timeout=timeout, **settings)

    if not force_new:
        # We don't need to rebuild the key here since we built it in
        # the previous if block, so it's in the namespace. Having said
        # that, this is a little ew.
        _cached_elasticsearch[key] = es

    return es
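A short sketch of the caching contract documented above:

es1 = get_es()
es2 = get_es()
assert es1 is es2        # rule 1: same arguments return the same object

es3 = get_es(force_new=True)
assert es3 is not es1    # rule 4: force_new builds a fresh, uncached object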
Example #19
def _check_es_health(product, env):
    if product is not None:
        hosts = [_url_for_host(env)]
    else:
        logging.info(
            "No product specified; checking health of all Elasticsearch "
            "hosts for env '%s'\n" % env)
        all_hosts = set(product_host.values())
        hosts = []
        for host in all_hosts:
            hosts.append(_url_for_host(env, host))

    es = ElasticSearch(hosts, port=port)

    # Add check on elasticsearch health
    health = es.health()

    if health['status'] == 'red':
        logging.error(
            "Elasticsearch status is red. Search will hang. Exiting\n")
        sys.exit(-1)
    elif health['status'] == 'yellow':
        logging.warning(
            'Elasticsearch status is yellow. Search quality will be degraded\n'
        )
Example #20
    def testErrorHandling(self):
        # Wrong port.
        conn = ElasticSearch('http://example.com:1009200/')
        self.assertRaises(ElasticSearchError, conn.count, "*:*")

        # Test invalid JSON.
        self.assertRaises(ElasticSearchError, conn._prep_request, unittest.TestCase)
        self.assertRaises(ElasticSearchError, conn._prep_response, '{"busted" "json" "that": ["is] " wrong')
Example #21
def includeme(config):
    config.add_route('index', '/index')
    config.scan(__name__)

    if 'elasticsearch.server' in config.registry.settings:
        es = ElasticSearch(config.registry.settings['elasticsearch.server'])
        es.session.hooks['response'].append(requests_timing_hook('es'))
        config.registry[ELASTIC_SEARCH] = es
Example #22
    def __init__(self, config=None, es_instance=None):
        if es_instance:
            self.es = es_instance
        else:
            self.config = config
            self.excludes_fields = self.config['excludes_fields']
            self.es = ElasticSearch('http://{host}:{port}/'.format(
                host=self.config['host'], port=self.config['port']))
Example #23
def query(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    query = {"query": {"bool": {}}}

    # Building the query. request.POST['query'] is a list; its last element
    # is the stringified query dict, which the original loop effectively
    # selected.
    dict_value = dict(request.POST)
    key = dict_value['query'][-1]

    value = ast.literal_eval(key)
    AndQueries = []
    OrQueries = []

    for index, key in enumerate(value['exact_query']):
        if key['condition'] == 'is equal to':
            query_values = {"term": {key['column']: key['value']}}
        elif key['condition'] == 'is less than':
            query_values = {"range": {key['column']: {"lt": key['value']}}}
        elif key['condition'] == 'is greater than':
            query_values = {"range": {key['column']: {"gt": key['value']}}}
        elif key['condition'] == 'is less than or equal to':
            query_values = {"range": {key['column']: {"lte": key['value']}}}
        elif key['condition'] == 'is greater than or equal to':
            query_values = {"range": {key['column']: {"gte": key['value']}}}
        elif key['condition'] == 'is not equal to':
            # A bare "must_not" is not a valid query clause; wrap it in bool
            query_values = {
                "bool": {
                    "must_not": {
                        "term": {
                            key['column']: key['value']
                        }
                    }
                }
            }

        if key['operation'] == 'and':
            AndQueries.append(query_values)
        if key['operation'] == 'or':
            OrQueries.append(query_values)
        if key['operation'] == '':
            if index < (len(value['exact_query']) - 1):
                next_value = value['exact_query'][index + 1]
                if next_value['operation'] == 'and':
                    AndQueries.append(query_values)
                if next_value['operation'] == 'or':
                    OrQueries.append(query_values)
            else:
                query['query']['bool']['must'] = query_values

    if len(AndQueries) != 0:
        query['query']['bool']['must'] = AndQueries
    if len(OrQueries) != 0:
        query['query']['bool']['should'] = OrQueries

    results = es.search(query, index=dict_value['index'][0], size=10000)
    return HttpResponse(json.dumps({
        'success': "Added successfully",
        'results': results
    }), content_type="application/json")
Example #24
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    es.bulk([es.index_op(doc) for doc in entries],
            index='memex',
            doc_type='page')
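A hedged sketch of the entries argument: a list of JSON-serializable dicts, one per page (field names are illustrative, matching the 'page' doc type used elsewhere in these examples):

entries = [
    {'url': 'http://example.com/a', 'text': 'first crawled page'},
    {'url': 'http://example.com/b', 'text': 'second crawled page'},
]
add_document(entries)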
Example #25
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]

    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)

    applications = [result['term']
                    for result in es_results['facets']['applications']['terms']]
    return applications
Example #26
def search(request, doc_type):
    """Search Elasticsearch for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    is_root = request.GET.get('is_root')
    is_subpart = request.GET.get('is_subpart')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0

    if not term:
        return user_error('No query term')
    if not validate_boolean(is_root):
        return user_error('Parameter "is_root" must be "true" or "false"')
    if not validate_boolean(is_subpart):
        return user_error('Parameter "is_subpart" must be "true" or "false"')

    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': page * PAGE_SIZE,
        'size': PAGE_SIZE,
    }
    text_match = {'match': {'text': term, 'doc_type': doc_type}}
    if version or regulation:
        # Build the filter under its own name so it doesn't clobber the
        # search term parsed above
        filter_term = {}
        if version:
            filter_term['version'] = version
        if regulation:
            filter_term['regulation'] = regulation
        if is_root:
            filter_term['is_root'] = is_root
        if is_subpart:
            filter_term['is_subpart'] = is_subpart
        query['query'] = {
            'filtered': {
                'query': text_match,
                'filter': {
                    'term': filter_term
                }
            }
        }
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results(
            [h['fields'] for h in results['hits']['hits']])
    })
Example #27
    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch
        if self.es_connection is None:
            es_server = self.server.rstrip('/') + ':9200'
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict, index='encoded',
                                            doc_type=self.search_name,
                                            size=maxhits)
        return results
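A usage sketch; the enclosing class, with its server, search_name, and es_connection attributes, is inferred from the method body:

results = searcher.query({'query': {'match_all': {}}}, maxhits=10)
print(results['hits']['total'])  # 'searcher' is a hypothetical instance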
Example #28
def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):

    with open(config_file, "rb") as f:
        con = json.loads(f.read())
    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0],
                                user,
                                passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:  # delete the existing index first
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)

    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:
        # Load the parser function
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)

    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
    print "end:" + time.strftime(
        ISOTIMEFORMAT, time.localtime()) + '/n all records import complete.'
Example #29
def main(argv):
    index = argv[1]
    doc_type = 'log'
    urls = argv[2].strip().split(',')  # comma-separated list of ES URLs

    es = ElasticSearch(urls=urls, timeout=60, max_retries=0)
    create_mapping(es, index, doc_type)
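create_mapping is defined elsewhere in the original module; a minimal sketch of what such a helper might look like, assuming a trivial 'log' mapping:

from pyelasticsearch.exceptions import IndexAlreadyExistsError

def create_mapping(es, index, doc_type):
    # Hypothetical helper: ensure the index exists, then register a mapping.
    try:
        es.create_index(index)
    except IndexAlreadyExistsError:
        pass
    mapping = {doc_type: {'properties': {'message': {'type': 'string'}}}}
    es.put_mapping(index, doc_type, mapping)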
Example #30
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name,
                            settings={
                                'number_of_shards': 1,
                                'number_of_replicas': 0,
                                'analysis': {
                                    'analyzer': {
                                        'default': {
                                            'type': 'custom',
                                            'tokenizer': 'keyword'
                                        }
                                    }
                                },
                                'store': {
                                    'compress': {
                                        'stored': 'true'
                                    }
                                },
                            })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {
                'date': date,
                'os': random.choice(platforms),
                'downloads_count': random.randint(1000, 1500),
                'users_count': random.randint(10000, 15000),
                'add_on': add_on + 1
            }
            docs[date.month].append(data)
        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)
            sys.stdout.write('.')
            sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
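A hedged follow-up query against the data just fed (index, doc type, and field names are taken from feed() above):

client = ElasticSearch('http://0.0.0.0:9200/')
res = client.search({'query': {'term': {'add_on': 1}}},
                    index='time_2012-01', doc_type='downloads')
print('hits for add_on 1 in January: %d' % res['hits']['total'])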