Example 1
 def refresh(self, index=None):
     index = index or Index.objects.get_current().prefixed_name
     # Any time we're doing a refresh, we're making sure that the
     # index is ready to be queried.  Given that, it's almost
     # always the case that we want to run all the generated tasks,
     # then refresh.
     connections.get_connection().indices.refresh(index=index)
Example 2
    def destroy(self):
        """Destroy an index."""
        self._refresh_connection()

        self.push_queue = []
        index_name = self.document_class()._get_index()  # pylint: disable=protected-access
        connections.get_connection().indices.delete(index_name, ignore=404)

        self._mapping_created = False
Example 3
def store():

    with Store(store_type='elasticsearch', nodes='192.168.99.100:9200') as s:
        s._load_plugin(nodes='192.168.99.100:9200')
        try:
            connections.get_connection().indices.delete(index='indicators-*')
            connections.get_connection().indices.delete(index='tokens')
        except Exception:
            # the indices may not exist yet; ignore cleanup errors
            pass
        yield s
Example 4
    def create(self, data):
        logger.debug(data)
        for v in ['admin', 'read', 'write']:
            if data.get(v):
                data[v] = True

        if data.get('token') is None:
            data['token'] = self._generate()

        t = Token(**data)

        if t.save():
            connections.get_connection().indices.flush(index='tokens')
            return t.to_dict()
Example 5
def reindex_tokens():
    TokenBackup.init()
    connections.create_connection(hosts=ES_NODES)
    backup_results = connections.get_connection().reindex(body={"source": {"index": INDEX_NAME}, "dest": {"index": BACKUP_INDEX_NAME}}, request_timeout=3600)
    if backup_results.get('created') + backup_results.get('updated') == backup_results.get('total'):
        Index(INDEX_NAME).delete()
    else:
        return ('Tokens did not backup properly')
    time.sleep(1)
    Token.init()
    reindex_results = connections.get_connection().reindex(body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}}, request_timeout=3600)
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return ('Tokens reindexed successfully!')
    else:
        return ('Tokens did not reindex from backup properly')
Example 6
    def delete(self, data):
        if not (data.get('token') or data.get('username')):
            return 'username or token required'

        rv = list(self.search(data, raw=True))

        if not rv:
            return 0

        for t in rv:
            t = Token.get(t['_id'])
            t.delete()

        connections.get_connection().indices.flush(index='tokens')
        return len(rv)
Example 7
    def token_edit(self, data):
        if not data.get('token'):
            return 'token required for updating'

        s = Token.search()

        s = s.filter('term', token=data['token'])
        rv = s.execute()

        if not rv.hits.total > 0:
            return 'token not found'

        d = rv.hits.hits[0]
        d.update(fields=data)
        connections.get_connection().indices.flush(index='tokens')
Example 8
    def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
        es_doc = es_doc_cls()

        elasticsearch_conn = connections.get_connection()

        sync_timestamp = current_server_timestamp()

        pending_insertions = self._compute_dirty_documents(
            sql_table_cls, es_doc.doc_type)

        bulk_op = self._synchronisation_op(es_doc, pending_insertions)

        self._logging(logging.INFO, 'Performing synchronization.')

        for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
            obj_id = info['index']['_id'] \
                if 'index' in info else info['update']['_id']

            if ok:
                # Mark the task as handled so we don't re-process it next time
                self._logging(logging.INFO,
                              'Document %s has been synced successfully.'
                              % obj_id)

                sql_table_cls.update_last_sync(obj_id, sync_timestamp)
            else:
                id_logger(obj_id, logging.ERROR,
                          'Error while syncing document %s index.' % obj_id)

        # Refresh indices to speed up subsequent searches
        elasticsearch_dsl.Index(es_doc.index).refresh()
Example 9
    def handle(self, *args, **options):
        usings = options.get("using") or settings.ELASTICSEARCH_CONNECTIONS.keys()

        for using in usings:
            # figure out if there is a conflict with the analysis defined in ES
            # and the analysis defined in Python land for this connection
            index_name = settings.ELASTICSEARCH_CONNECTIONS[using]['index_name']
            es = connections.get_connection(using)
            result = is_analysis_compatible(using)
            if result is False:
                if options.get("clopen"):
                    # get the existing analysis setting in ES, and combine
                    # those with the ones defined in Python. Close the index,
                    # update the settings, and re-open it
                    analysis = combined_analysis(using)
                    es.indices.close(index=index_name, ignore=[404])
                    es.indices.put_settings(index=index_name, body={'analysis': analysis}, ignore=[404])
                    es.indices.open(index=index_name, ignore=[404])
                else:
                    self.stderr.write(
                        "The analysis defined in ES and the analysis defined by your Indexes are not compatible. Aborting. "
                        "Use --clopen to close the index, update the analysis, and open the index again."
                    )
                    self.stderr.write(diff_analysis(using))
                    exit(1)

        super().handle(*args, **options)
        if self.confirmed:
            call_command("update_index", *args, **options)
Example 10
def main():

    es = connections.get_connection()

    dry = '--dry' in sys.argv
    if not dry:
        utils.add_file_logger(logger, __file__)
    preprints = Preprint.objects.filter(primary_file__isnull=False).select_related('primary_file', 'provider')
    total_preprints = preprints.count()
    logger.info('Collecting data on {} preprints...'.format(total_preprints))

    batch_to_update = []
    for i, preprint in enumerate(preprints, 1):
        preprint_id = preprint._id
        provider_id = preprint.provider._id
        file_id = preprint.primary_file._id
        page_counters = (
            PageCounter.objects
            .filter(
                _id__startswith='download:{preprint_id}:{file_id}:'.format(
                    preprint_id=preprint_id,
                    file_id=file_id
                )
            ).values_list('_id', 'date')
        )
        for page_counter in page_counters:
            page_counter__id, date_totals = page_counter
            version_num = page_counter__id.split(':')[-1]
            for date, totals in date_totals.items():
                timestamp = datetime.datetime.strptime(date, '%Y/%m/%d').replace(tzinfo=pytz.utc)
                batch_to_update.append({
                    '_index': 'osf_preprintdownload_{}'.format(timestamp.strftime(settings.ELASTICSEARCH_METRICS_DATE_FORMAT)),
                    '_source': {
                        'count': totals['total'],
                        'path': '/{}'.format(file_id),
                        'preprint_id': preprint_id,
                        'provider_id': provider_id,
                        'timestamp': timestamp,
                        'user_id': None,  # Pagecounter never tracked this
                        'version': int(version_num) + 1
                    },
                    '_type': 'doc'
                })

                if len(batch_to_update) >= MAX_BATCH_SIZE:
                    logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
                    if not dry:
                        bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE, request_timeout=REQUEST_TIMEOUT)
                    batch_to_update = []
                    # Allow elasticsearch to catch up
                    print('{}/{} preprints completed ({:.2f}%)'.format(i, total_preprints, i / total_preprints * 100))
                    sleep(THROTTLE_PERIOD)

    # Index final batch
    if len(batch_to_update):
        logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
        if not dry:
            bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE, request_timeout=REQUEST_TIMEOUT)

    logger.info('This will migrate {} Pagecounter entries to Elasticsearch'.format(len(batch_to_update)))
Example 11
def index_model(label):
    logger.debug('index_model')
    Model = None
    SerializerClass = None
    try:
        Model = apps.get_model(label)
    except LookupError as e:
        logger.error(e)
        raise e
    try:
        SerializerClass = search_config.get_serializer_for_model(label)
    except LookupError as e:
        logger.error(e)
        raise e

    if Model and SerializerClass:
        serializer = SerializerClass()
        conn = connections.get_connection()  # Get default connection

        queryset = Model.objects.all()
        if hasattr(queryset, 'published'):
            queryset = queryset.published()
        if serializer.related_object_fields:
            queryset = queryset.prefetch_related(*serializer.related_object_fields)

        model_docs = (serializer.create_document(item) for item in queryset)
        doc_dicts = (doc.to_dict(include_meta=True) for doc in model_docs)

        return es_bulk(conn, doc_dicts)
Example 12
 def query_articles(self, query, prefs):
     client = connections.get_connection()
     search = Search(using=client, index='articles')
     q = Q('bool', must=[Q('exists', field='watson_analyzed'),
                         Q('match', watson_success=True),
                         Q('match', body=query)])
     search = search.query(q)
     documents = []
     for hit in search[:100]:
         if '#' not in hit.url and '?' not in hit.url:
             documents.append({
                 'id': hit.meta.id,
                 'title': hit.title,
                 'body': hit.body,
                 'url': hit.url,
                 'score': hit.meta.score,
                 'tone': dict(
                     joy=hit.tone.joy,
                     fear=hit.tone.fear,
                     sadness=hit.tone.sadness,
                     disgust=hit.tone.disgust,
                     anger=hit.tone.anger
                 ),
                 'top_image': hit.top_image
             })
     if len(documents) < 10:
         return documents
     else:
         return select_k_and_sort(documents, prefs)
Example 13
 def update_sentiments(self):
     from watson_developer_cloud import ToneAnalyzerV3Beta
     tone_analyzer = ToneAnalyzerV3Beta(username='******',
                                        password='******',
                                        version='2016-02-11')
     client = connections.get_connection()
     search = Search(using=client, index='articles', doc_type='article')
     q = Q('bool', must=[Q('missing', field='watson_analyzed')])
     search = search.query(q)
     counter = 0
     for result in search.scan():
         doc = Article.get(result.meta.id)
         try:
             analysis = tone_analyzer.tone(text=doc.body)
             tone_categories = analysis['document_tone']['tone_categories']
             emotion_tones = list(filter(lambda x: x['category_id'] == 'emotion_tone', tone_categories))[0]
             doc.tone = {}
             for tone in emotion_tones['tones']:
                 doc.tone[tone['tone_id']] = tone['score']
             doc.watson_success = True
         except WatsonException:
             continue
         finally:
             doc.watson_analyzed = True
             doc.save()
             counter += 1
         print(counter)
     if counter == 0:
         raise RealError()
Example 14
    def handle(self, *args, **options):
        Index(ElasticAddress._doc_type.index).delete(ignore=404)
        ElasticAddress.init()

        es = connections.get_connection('default')
        es.indices.put_settings(
            index=ElasticAddress._doc_type.index,
            body={
                "number_of_replicas": 0,
                'index.max_result_window': 50000
            }
        )

        Address.objects.reindex()

        self.stdout.write(
            'Loaded {} addresses to persistence storage'.format(
                Address.objects.count()))

        ownership_idx.delete(ignore=404)
        ownership_idx.create()
        ElasticOwnership.init()
        Ownership.objects.select_related("prop__address").reindex()

        self.stdout.write(
            'Loaded {} ownerships to persistence storage'.format(
                Ownership.objects.count()))
Example 15
    def _index_all_blogitems(self):
        iterator = BlogItem.objects.all()
        category_names = dict((x.id, x.name) for x in Category.objects.all())
        categories = defaultdict(list)
        for e in BlogItem.categories.through.objects.all():
            categories[e.blogitem_id].append(category_names[e.category_id])

        es = connections.get_connection()
        report_every = 100
        count = 0
        doc_type_name = _get_doc_type_name(BlogItem)
        t0 = time.time()
        for success, doc in streaming_bulk(
            es,
            (m.to_search(all_categories=categories).to_dict(True) for m in iterator),
            index=settings.ES_BLOG_ITEM_INDEX,
            doc_type=doc_type_name,
        ):
            if not success:
                print("NOT SUCCESS!", doc)
            count += 1
            if not count % report_every:
                print(count)
        t1 = time.time()

        self.out("DONE Indexing {} blogitems in {} seconds".format(count, t1 - t0))
Example 16
def restore_tokens():
    connections.create_connection(hosts=ES_NODES)
    Index(INDEX_NAME).delete()

    class Token(DocType):
        username = String()
        token = String()
        expires = Date()
        read = Boolean()
        write = Boolean()
        revoked = Boolean()
        acl = String()
        groups = String()
        admin = Boolean()
        last_activity_at = Date()

        class Meta:
            index = INDEX_NAME

    Token.init()
    reindex_results = connections.get_connection().reindex(body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}}, request_timeout=3600)
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return ('Tokens restored to previous schema successfully!')
    else:
        return ('Tokens did not restore from backup properly')
Example 17
def get_es(alias='default'):
    """Retrieve Elasticsearch instance

    :arg alias: the alias in ES_URLS for this Elasticsearch connection

    """
    return connections.get_connection(alias=alias)
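
A hedged usage sketch for a helper like get_es above; the health-check call and the assumption that a 'default' alias was registered at startup are illustrative, not taken from the snippet itself.

# Hypothetical call site: assumes connections.create_connection() or
# connections.configure() registered the 'default' alias from ES_URLS earlier.
es = get_es()                         # same object as get_es(alias='default')
print(es.cluster.health()['status'])  # e.g. 'green', 'yellow' or 'red'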
Example 18
    def handle(self, *args, **options):
        es = connections.get_connection()
        self.stdout.write('Deleting all the indices')
        es.indices.delete('blog-search')

        es.indices.create('blog-search', settings.ES_INDICES_SETTINGS)
        for post in Post.objects.all():
            PostES.index_post(post)
Example 19
    def setUpClass(cls):
        super(ElasticTestCase, cls).setUpClass()

        if not getattr(settings, 'ES_URLS', None):
            cls.skipme = True
            return

        try:
            connections.get_connection().cluster.health()
        except ConnectionError:
            cls.skipme = True
            return

        cls._old_es_index_prefix = settings.ES_INDEX_PREFIX
        settings.ES_INDEX_PREFIX = 'test-%s' % settings.ES_INDEX_PREFIX
        cls._old_es_live_index = settings.ES_LIVE_INDEX
        settings.ES_LIVE_INDEX = True
Example 20
    def token_last_activity_at(self, token, timestamp=None):
        s = Token.search()
        s = s.filter('term', token=token.decode('utf-8'))
        rv = s.execute()
        if rv.hits.total > 0:
            rv = rv.hits.hits[0]
            rv = Token.get(rv['_id'])

            if timestamp:
                self.logger.debug('updating timestamp to: {}'.format(timestamp))
                rv.update(last_activity_at=timestamp)
                connections.get_connection().indices.flush(index='tokens')
                return timestamp
            else:
                return rv.last_activity_at
        else:
            return timestamp
Example 21
def search_results_page(request):
    es_conn = connections.get_connection()
    term = request.GET.get('q', '').strip().lower()
    days_ago = request.GET.get('days_ago', 30)
    size = request.GET.get('size', 10)
    offset = request.GET.get('offset', 0)
    start_date, end_date = get_date_range_from_args(request)
    prev_start_date = start_date - timedelta(days=int(days_ago))
    prev_end_date = end_date - timedelta(days=int(days_ago))
    current_histogram = get_term_counts_histogram(
        es_conn, term, start_date, end_date
    )
    prev_histogram = get_term_counts_histogram(
        es_conn, term, prev_start_date, prev_end_date
    )
    current_total = sum(current_histogram.values())
    prev_total = sum(prev_histogram.values())
    docs = get_text_search_results(
        start_date, end_date, {'content': term}, size=size, offset=offset
    )
    for doc in docs:
        doc['mentions'] = doc['content'].lower().count(term.lower())
        doc['search_phrase'] = term
        date_issued = datetime.strptime(doc['date_issued'], '%Y-%m-%d')
        doc['human_date'] = date_issued.strftime('%b %d, %Y')
        i = doc['content'].lower().find(term.lower())
        if i >= 0:
            start = max(0, i - 100)
            end = min(len(doc['content']), i + 200)
            doc['snippet'] = doc['content'][start:end]
        speakers = doc.get('speakers', '').split(',')
        doc['speakers'] = []
        for s in speakers:
            matched_bioguide_data = match_speaker_to_bioguide(s)
            if matched_bioguide_data:
                doc['speakers'].append(matched_bioguide_data)
    return JsonResponse(
        {
            'delta': int(100 * ((current_total - prev_total) / float(max(prev_total, 1)))),
            'docs': docs,
            'term': term,
            'current_period': {
                'daily_breakdown': [
                    {'date': k, 'count': v} for k, v in current_histogram.items()
                ],
                'total_count': current_total
            },
            'previous_period': {
                'daily_breakdown': [
                    {'date': k, 'count': v} for k, v in prev_histogram.items()
                ],
                'total_count': prev_total
            },
            'start_date': start_date,
            'end_date': end_date,
        }
    )
Example 22
 def __init__(self, config='cdr', size=2000):
     """
     :param config: str
         Alias of the Elasticsearch connection to use
     :param size: int
         Size limit to set on elasticsearch query
     """
     self.conn = connections.get_connection(config)
     self.elastic = Search('cdr', extra={'size': size})
Example 23
def existing_analysis(using):
    """
    Get the existing analysis for the `using` Elasticsearch connection
    """
    es = connections.get_connection(using)
    index_name = settings.ELASTICSEARCH_CONNECTIONS[using]['index_name']
    if es.indices.exists(index=index_name):
        return stringer(es.indices.get_settings(index=index_name)[index_name]['settings']['index'].get('analysis', {}))
    return DOES_NOT_EXIST
Example 24
    def search(cls, **kwargs):
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq
Example 25
    def search(cls, **kwargs):
        options = {
            "using": connections.get_connection(),
            "index": cls.get_index(),
            "doc_type": {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq
Example 26
 def handle(self, *args, **options):
     doc_types = ','.join(args) or None
     output = self.stdout
     output.write('[')
     es = connections.get_connection()
     for idx, doc in enumerate(scan(es, index=options['index'], doc_type=doc_types)):
         if idx > 0:
             output.write(',')
         output.write(json.dumps(doc, indent=options['indent']), ending='')
     output.write(']')
Example 27
 def bulk_update(cls, dicts, client=None):
     def upsert(doc):
         d = doc.to_dict(True)
         d['_op_type'] = 'update'
         d['doc'] = d['_source']
         d['doc_as_upsert'] = True
         del d['_source']
         return d
     client = client or connections.get_connection()
     return bulk(client, (upsert(d) for d in dicts))
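
A hypothetical call for a helper like bulk_update above; MyDoc and its title field are assumptions used only for illustration.

# Sketch: assumes MyDoc is an elasticsearch_dsl document class with an index
# configured, so to_dict(True) carries the _index/_id metadata upsert() rewrites.
docs = [MyDoc(meta={'id': i}, title='doc-%d' % i) for i in range(3)]
success, errors = MyDoc.bulk_update(docs)  # bulk() returns (success_count, errors)
print(success, errors)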
Example 28
def bulk_load(docs_to_index):
    conn = connections.get_connection()
    index = NameVariant._doc_type.index

    for response in streaming_bulk(
            conn,
            docs_to_index,
            index=index,
            doc_type=NameVariant._doc_type.name):
        pass
Example 29
 def bulk_save(cls, dicts):
     objects = (
         dict(
             d.to_dict(include_meta=True),
             **{'_index': cls.set_index_name(int(d.ano_eleicao))}
         )
         for d in dicts
     )
     client = connections.get_connection()
     return bulk(client, objects)
Example 30
    def test_delete_index(self):
        # first create and populate the index
        index = Index.objects.create()
        index.populate()

        # then delete it and check if recreating works without blowing up
        index.delete()

        es = connections.get_connection()
        es.indices.create(index.prefixed_name)
        es.indices.delete(index.prefixed_name)
Example 31
def create_app(config_name='default'):
    """
    Create Flask app
    :param config_name:
    :return: Flask
    """

    from .api import blueprint as api_blueprint

    app = Flask(__name__)
    CORS(app, resources={
        r"/api/*": {"origins": "*"}
    })

    app.config.from_object(config[config_name])
    config[config_name].init_app(app)

    connections.create_connection(
        hosts=app.config['ELASTICSEARCH_HOST'],
        http_auth=(app.config['ELASTICSEARCH_USER'], app.config['ELASTICSEARCH_SECRET']),
        timeout=20
    )
    connections.get_connection()

    app.register_blueprint(api_blueprint)

    extensions(app)

    @app.after_request
    def after_request(response):
        response.headers.add('Access-Control-Allow-Origin', '*')
        if request.method == 'OPTIONS':
            response.headers['Access-Control-Allow-Methods'] = 'DELETE, GET, POST, PUT'
            headers = request.headers.get('Access-Control-Request-Headers')
            if headers:
                response.headers['Access-Control-Allow-Headers'] = headers

        return response

    return app
Example 32
    def process_batch(self, jobs):
        # query redis in a pipeline
        pipe = listinghash_db.pipeline()
        for job in jobs:
            pipe.get(job['job_data']['listingHash'])
        job_info_xs = pipe.execute()
        # deserializing jobs
        job_info_xs = [json.loads(job) if job else job for job in job_info_xs]
        # dup detect by listinghash cache
        _to_update = []
        _to_norm = []
        for job_info, job in zip(job_info_xs, jobs):
            if job_info:
                _to_update.append((job['job_data'], job_info))
            else:
                _to_norm.append(job)
        # bulk update old jobs
        # TODO: log old jobs to a topic for destination reasoning
        #       logger slows down the processor, don't use logger
        process_seq = r_db.get(KEY_PROCESS_SEQ)
        if not process_seq:
            self.logger.error("process_seq is empty")
            is_ok = False
        else:
            _better_job = []
            for job, job_info in _to_update:
                if better_job(job, job_info) >= 0:
                    # job = job.copy()
                    job['_id'] = job_info['_id']
                    _better_job.append(job)
            es_actions = bulk_update_actions(_better_job, process_seq)
            conn = connections.get_connection()
            is_ok = bulk_execute(self.logger, conn, es_actions)

        # if failed, norm old jobs as well, so
        # [_to_update] ++ [_to_norm] <- jobs
        # update: just discard them
        if not is_ok:
            # _to_norm = jobs
            self.logger.warn("Failed to bulk update existing jobs")

        # reporting
        count_all_job = len(jobs)
        count_old_job = len(_to_update)
        count_new_job = len(_to_norm)
        r_db.incr(r_total_job_key, count_all_job)
        r_db.incr(r_new_job_key, count_new_job)
        r_db.incr(r_old_job_key, count_old_job)

        # send to normalizer
        for job in _to_norm:
            self.produce_msg(**job)
Example 33
    def save(self, using=None, index=None, **kwargs):
        es = connections.get_connection()

        doc_meta = dict(
            (k, self.meta[k]) for k in DOC_META_FIELDS if k in self.meta)
        doc_meta.update(kwargs)

        meta = es.index(index=self._get_index(),
                        doc_type=self._doc_type.name,
                        body=self.serializer.data,
                        **doc_meta)

        return meta
Example 34
 def setUp(self):
     self.es_conn = connections.get_connection()
     self.test_crecs = []
     for i in range(20):
         self.test_crecs.append(
             CRECDoc(title=str(i),
                     content='foo bar baz Foo',
                     date_issued=datetime(2017, 1, i % 5 + 1)))
     self.index = Index(settings.ES_CW_INDEX)
     CRECDoc.init()
     for c in self.test_crecs:
         c.save(refresh=True)
     self.client = Client()
Example 35
    def tokens_create(self, data):
        self.logger.debug(data)
        if data.get('admin'):
            data['admin'] = True

        if data.get('read'):
            data['read'] = True

        if data.get('write'):
            data['write'] = True

        if not data.get('token'):
            data['token'] = self._token_generate()


        self.logger.debug(data)
        t = Token(**data)

        if t.save():
            connections.get_connection().indices.flush(index='tokens')
            return t.__dict__['_d_']
Example 36
    def setUpClass(cls):
        try:
            super(ElasticTestCase, cls).setUpClass()
        except AttributeError:
            # python 2.6 has no setUpClass, but that's okay
            pass

        if not getattr(settings, 'ES_URLS', None):
            cls.skipme = True
            return

        try:
            connections.get_connection().cluster.health()
        except ConnectionError:
            cls.skipme = True
            return

        cls._old_es_index_prefix = settings.ES_INDEX_PREFIX
        settings.ES_INDEX_PREFIX = 'test-%s' % settings.ES_INDEX_PREFIX
        # TODO: cleanup after upgrading test-utils (also in tearDownClass)
        cls._old_es_live_index = settings.ES_LIVE_INDEX
        settings.ES_LIVE_INDEX = True
Example 37
def es_conn(server=settings.ES_SERVER):
    """Standardized connection to the ES cluster.

    :param server: a server definition of the form [host:port, ...].  See
    https://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch
    for alternate host specification options.
    :return: an Elasticsearch connection instance
    """

    connections.configure(default=server,
                          max_retries=1,
                          sniff_on_start=False)
    return connections.get_connection()
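
A minimal sketch of registering named connections up front and retrieving them later, assuming a local cluster; the alias names, hosts and timeout are illustrative.

from elasticsearch_dsl import connections

# Each keyword passed to configure() becomes a connection alias; its dict is
# forwarded as keyword arguments to the underlying Elasticsearch client.
connections.configure(
    default={'hosts': ['localhost:9200']},
    slow={'hosts': ['localhost:9200'], 'timeout': 60},
)

es = connections.get_connection()            # the 'default' alias
es_slow = connections.get_connection('slow')
print(es.info()['cluster_name'])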
Example 38
def index_video():
    """
    Index all Video objects into the search server.
    Called by the pustakalaya_search app's index_pustakalaya management command.
    """
    from .models import Video
    # Create an index and populate the mappings
    VideoDoc.init()
    # Get elastic search client
    es = connections.get_connection()
    # Index all published videos
    print("Indexing videos...")
    bulk(client=es, actions=(b.bulk_index() for b in Video.objects.all().iterator() if b.published == "yes"))
Example 39
    def setUpClass(cls):
        try:
            super(ElasticTestCase, cls).setUpClass()
        except AttributeError:
            # python 2.6 has no setUpClass, but that's okay
            pass

        if not getattr(settings, 'ES_URLS', None):
            cls.skipme = True
            return

        try:
            connections.get_connection().cluster.health()
        except ConnectionError:
            cls.skipme = True
            return

        cls._old_es_index_prefix = settings.ES_INDEX_PREFIX
        settings.ES_INDEX_PREFIX = 'test-{0!s}'.format(
            settings.ES_INDEX_PREFIX)
        cls._old_es_live_index = settings.ES_LIVE_INDEX
        settings.ES_LIVE_INDEX = True
Example 40
    def setUp(self):
        super(TestsWithData, self).setUp()

        self.docs = [
            self.TestDoc(title='doc-' + str(i))
            for i in range(1000)
        ]

        actions = [d.to_dict(include_meta=True) for d in self.docs]

        inserted, errors = bulk(connections.get_connection(), actions=actions, refresh=True)
        self.assertEqual(inserted, len(actions))
        self.assertEqual(len(errors), 0)
Example 41
 def build_index(self, document_parquet):
     if self.awsauth is not None:
         connections.create_connection(
             hosts=self.hosts,
             http_auth=self.awsauth,
             use_ssl=True,
             verify_certs=True,
             connection_class=RequestsHttpConnection)
     else:
         connections.create_connection(hosts=self.hosts)
     logger.info('Building elastic index')
     Page.init()
     # This is a parquet file to load from
     df = pd.read_parquet(document_parquet)
     unique_pages = df.groupby(
         ['pdf_name', 'page_num', 'dataset_id',
          'img_pth']).agg(lambda x: list(x))
     to_add = []
     for i, row in unique_pages.iterrows():
         to_add.append(
             Page(pdf_name=i[0],
                  page_num=i[1],
                  dataset_id=i[2],
                  img_pth=i[3],
                  pdf_dims=row['pdf_dims'][0].tolist(),
                  bbox=[j.tolist() for j in row['bounding_box']],
                  classes=[j.tolist() for j in row['classes']],
                  scores=[j.tolist() for j in row['scores']],
                  postprocess_cls=row['postprocess_cls'],
                  postprocess_score=row['postprocess_score'],
                  detect_cls=row['detect_cls'],
                  detect_score=row['detect_score']))
         if len(to_add) == 1000:
             bulk(connections.get_connection(), (upsert(d) for d in to_add))
             to_add = []
     bulk(connections.get_connection(), (upsert(d) for d in to_add))
     logger.info('Done building page index')
Example 42
    def index_all(cls, using=None, delete=False, **kwargs):
        def actions_generator():
            for obj in cls.index_queryset().iterator():
                yield cls.from_django(obj).to_dict(include_meta=True)

        client = connections.get_connection(using or cls._doc_type.using)
        if delete:
            client.indices.delete(index=cls._doc_type.index, ignore=[400, 404])
        cls._doc_type.init()
        for ok, item in streaming_bulk(client,
                                       actions_generator(),
                                       refresh=True,
                                       **kwargs):
            yield ok, item
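
A hypothetical call site for a generator such as index_all above; ArticleDoc is an assumed document class used only for illustration.

# The generator must be drained for streaming_bulk to do any work; assumes
# ArticleDoc defines index_queryset() and from_django() as in the snippet above.
failed = [item for ok, item in ArticleDoc.index_all(delete=True) if not ok]
print('{} documents failed to index'.format(len(failed)))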
Example 43
    def index_all(cls, index_name, using=None, **kwargs):
        def actions_generator():
            for obj in cls.index_queryset().iterator():
                elastic_data = cls.from_django(obj).to_dict(include_meta=True)
                elastic_data['_index'] = index_name
                yield elastic_data

        client = connections.get_connection(using or cls._doc_type.using)
        cls.init(index_name)
        for ok, item in streaming_bulk(client,
                                       actions_generator(),
                                       chunk_size=90,
                                       **kwargs):
            yield ok, item
Example 44
    def search(self,
               token,
               filters,
               sort='reporttime',
               raw=False,
               timeout=TIMEOUT):
        limit = filters.get('limit', LIMIT)

        s = Indicator.search(index='{}-*'.format(self.indicators_prefix))
        s = s.params(size=limit, timeout=timeout)
        s = s.sort('-reporttime', '-lasttime')

        s = filter_build(s, filters, token=token)

        logger.debug(s.to_dict())

        start = time.time()
        try:
            es = connections.get_connection(s._using)
            old_serializer = es.transport.deserializer

            if raw:
                rv = es.search(index=s._index,
                               doc_type=s._doc_type,
                               body=s.to_dict(),
                               **s._params)
            else:
                es.transport.deserializer = self.Deserializer()

                rv = es.search(index=s._index,
                               doc_type=s._doc_type,
                               body=s.to_dict(),
                               filter_path=['hits.hits._source'],
                               **s._params)
                # transport caches this, so the tokens mis-fire
                es.transport.deserializer = old_serializer

        except elasticsearch.exceptions.RequestError as e:
            logger.error(e)
            es.transport.deserializer = old_serializer
            return
        # catch all other es errors
        except elasticsearch.ElasticsearchException as e:
            logger.error(e)
            es.transport.deserializer = old_serializer
            raise CIFException

        logger.debug('query took: %0.2f' % (time.time() - start))

        return rv
Example 45
def sync_orders():
    highest_id = None
    try:
        r = Search(index='py-orders').sort('-_id')[0].execute()
        highest_id = int(r.hits[0].meta.id)
    except TransportError as e:
        if e.status_code == 404:
            highest_id = 0
    order_docs = []
    for order in models.Order.objects.filter(
            id__gt=highest_id).prefetch_related('customer'):
        order_docs.append(
            documents.Order(**order.to_search()).to_dict(include_meta=True))
    bulk(connections.get_connection(), order_docs)
Example 46
def init_app(app, db):
    email_service.sender.init_app(app)

    connections.create_connection(
        hosts=[{'host': app.config['ES_HOST'], 'port': app.config['ES_PORT']}],
        use_ssl=app.config['ES_USE_SSL'],
        connection_class=RequestsHttpConnection,
        timeout=120
    )

    app.elasticsearch = connections.get_connection()

    db.event.listen(db.session, 'before_commit', search.before_commit)
    db.event.listen(db.session, 'after_commit', search.after_commit)
Example 47
    def __init__(self, *args, **kwargs):
        assert self.document is not None

        self.client = connections.get_connection(
            self.document._get_using()
        )
        self.index = self.document._index._name
        self.mapping = self.document._doc_type.mapping.properties.name
        self.search = Search(
            using=self.client,
            index=self.index,
            doc_type=self.document._doc_type.name
        )
        super(BaseDocumentViewSet, self).__init__(*args, **kwargs)
Example 48
def do_multi_search(queries: List[ElasticSearchMultiSearchQuery], connection_type=DATA_CONNECTION):
    try:
        conn = connections.get_connection(alias=connection_type)
        multi_search_body = []
        for query_i in queries:
            multi_search_body.append({'index': query_i.index})
            if query_i.body is None:
                query_i.body = {}
            query_i.body['track_total_hits'] = True
            multi_search_body.append(query_i.body)
        return conn.msearch(body=multi_search_body)
    except Exception as e:
        traceback.print_exc()
        raise Exception('ERROR: can\'t retrieve elastic search data!')
Example 49
    def tokens_delete(self, data):
        if not (data.get('token') or data.get('username')):
            return 'username or token required'

        s = Token.search()

        if data.get('username'):
            s = s.filter('term', username=data['username'])

        if data.get('token'):
            s = s.filter('term', token=data['token'])

        rv = s.execute()

        if rv.hits.total > 0:
            for t in rv.hits.hits:
                t = Token.get(t['_id'])
                t.delete()

            connections.get_connection().indices.flush(index='tokens')
            return rv.hits.total
        else:
            return 0
Example 50
    def initiate_es_specific_state_if_is_enabled(self):
        """
        Initiates elasticsearch specific state if elasticsearch is enabled.

        Should be called in the class `__init__` method.
        """
        if not settings.ES_DISABLED:
            self.client = connections.get_connection(
                self.document._get_using())
            self.index = self.document._index._name
            self.mapping = self.document._doc_type.mapping.properties.name
            self.search = Search(using=self.client,
                                 index=self.index,
                                 doc_type=self.document._doc_type.name)
Example 51
def test_smrt_elastcisearch():
    with Smrt(remote=REMOTE, client='elasticsearch') as s:
        assert type(s) is Smrt

        x = s.process('test/smrt/rules/csirtg.yml', feed='port-scanners')
        assert len(x) > 0

        x = s.process('test/smrt/rules/csirtg.yml', feed='port-scanners')
        assert len(x) > 0

        # cleanup
        es = connections.get_connection()
        cli = elasticsearch.client.IndicesClient(es)
        cli.delete(index='indicators-*')
Example 52
    def _health_check(self):
        try:
            x = connections.get_connection().cluster.health()
        except ConnectionError as e:
            logger.warn('elasticsearch connection error')
            logger.error(e)
            return

        except Exception as e:
            logger.error(traceback.format_exc())
            return

        logger.info('ES cluster is: %s' % x['status'])
        return x
Example 53
    def test_delete_index(self):
        # first create and populate the index
        index = Index.objects.create()
        index.populate()

        # then delete it and check if recreating works without blowing up
        index.delete()

        es = connections.get_connection()
        try:
            es.indices.create(index.prefixed_name)
        except RequestError:
            assert False
        es.indices.delete(index.prefixed_name)
Example 54
def community_detail(request, community_name):
    """Community detail page"""

    client = connections.get_connection()

    # s = Search(using=client, index=settings.ES_INDEX).query("match", )

    # Query the total number of items in elastic search having the name of this community
    # es_count = Search(index="pustakalaya").using(client).query("match", communities=community_name).count()

    # Context data
    context = {}
    # print("community name =",community_name)

    community_name = " ".join(community_name.split("-"))

    # Query all the collection that contains this community_name from ORM
    collections = Collection.objects.filter(community_name=community_name)

    collection_list = []

    all_total = 0

    for collection in collections:
        # Get the total no of items having this collection name in elastic search
        # item_count_per_collection = Search(index="pustakalaya").using(client).query("match", communities=collection).count()
        item_count_per_collection = Search(
            index="pustakalaya").using(client).query(
                "match", collections=collection.collection_name).count()

        all_total += item_count_per_collection

        pk = collection.pk

        # Create a list to that contain collection_name and total count
        collection_list.append({
            "collection_name": collection.collection_name,
            "total_count": all_total,
            "es_count": item_count_per_collection,
            "pk": pk,
        })

    # Implement total count

    # Sort list to display in alphabetical order.
    context["collection_list"] = collection_list
    context["community_name"] = community_name
    # print("colllist=",collection_list)
    return render(request, "collection/community_detail.html", context)
Example 55
    def handle(self, *args, **options):
        if "datasource" not in options:
            self.stderr.write("You need to specify datasource to reindex")
            return

        config = django_apps.app_configs[options["datasource"]]

        ElasticModel = config.elastic_model
        Model = config.data_model
        idx = config.elastic_index
        conn = connections.get_connection("default")

        if options["drop_indices"]:
            idx.delete(ignore=404)
            idx.create()
            ElasticModel.init()

            conn.indices.put_settings(
                index=ElasticModel._doc_type.index,
                body={"index.max_result_window": int(Model.objects.count() * 2. + 1)},
            )

        Model.setup_indexing()
        qs = Model.objects.all()

        if options["only_last_n_days"] is not None:
            qs = qs.filter(
                last_updated_from_dataset__gte=date.today()
                - relativedelta(days=options["only_last_n_days"])
            )

        docs_to_index = []
        with tqdm(total=qs.count()) as pbar:
            for p in qs.iterator():
                pbar.update(1)
                doc = p.to_dict()
                if doc is None:
                    self.stderr.write("Cannot parse {} document".format(p))
                    continue

                docs_to_index.append(ElasticModel(**doc))
                if len(docs_to_index) > options["batch_size"]:
                    self.bulk_write(conn, docs_to_index)
                    docs_to_index = []

        self.bulk_write(conn, docs_to_index)
        self.stdout.write(
            "{} of {} records indexed into ES".format(qs.count(), config.name)
        )
Example 56
    def search(cls, **kwargs):
        kwargs.update({
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {
                cls._doc_type.name: cls.from_es
            },
        })
        sq = Search(**kwargs)

        # Add highlighting.
        sq = sq.highlight(*cls.excerpt_fields)
        sq = sq.highlight_options(order='score')

        return sq
Example 57
    def push(self):
        """Push built documents to ElasticSearch."""
        self._refresh_connection()
        self.create_mapping()

        if not self.push_queue:
            logger.debug("No documents to push, skipping push.")
            return

        logger.debug("Found %s documents to push to Elasticsearch.", len(self.push_queue))

        bulk(connections.get_connection(), (doc.to_dict(True) for doc in self.push_queue), refresh=True)
        self.push_queue = []

        logger.debug("Finished pushing built documents to the Elasticsearch server.")
Example 58
    def handle(self, *args, **options):
        for index in options['indices']:
            if index == 'declarations_v2':
                doc_type = Declaration
            elif index == 'nacp_declarations':
                doc_type = NACPDeclaration

            es = connections.get_connection('default')
            if es.indices.exists(index=index):
                self.stdout.write('Index "{}" already exists, not creating.'.format(index))
                return

            doc_type.init()
            es.indices.put_settings(index=index, body=CATALOG_INDEX_SETTINGS)
            self.stdout.write('Created index "{}".'.format(index))
Example 59
def run():
    # create the mappings in elasticsearch
    DocumentIndex.init()

    # create and save a document
    document = DocumentIndex(meta={'id': 42})
    document.ocr_json = {'hello':'ok'}
    document.save()

    document = DocumentIndex.get(id=42)

    #print(document.ocr_json)

    # Display cluster health
    print(connections.get_connection().cluster.health())
Example 60
    def _create_index(self):
        dt = datetime.utcnow()
        dt = dt.strftime('%Y.%m')
        es = connections.get_connection()
        if not es.indices.exists('indicators-{}'.format(dt)):
            index = Index('indicators-{}'.format(dt))
            index.aliases(live={})
            index.doc_type(Indicator)
            index.create()

            m = Mapping('indicator')
            m.field('indicator_ipv4', 'ip')
            m.field('indicator_ipv4_mask', 'integer')
            m.save('indicators-{}'.format(dt))
        return 'indicators-{}'.format(dt)