Beispiel #1
0
from django.conf import settings

# Elasticsearch integration is active only when the Django settings define
# ELASTICSEARCH_DSL.
ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL')

if ELASTICSEARCH_ENABLED:
    from elasticsearch import Elasticsearch
    from elasticsearch.client import IngestClient
    # Import the exception by name: the imports above bind only
    # ``Elasticsearch`` and ``IngestClient``, so spelling the except clause
    # as ``elasticsearch.exceptions.NotFoundError`` would raise NameError
    # instead of catching the missing-pipeline error.
    from elasticsearch.exceptions import NotFoundError

    # NOTE(review): ``connections`` is not imported in the visible part of
    # this file -- confirm it is brought into scope elsewhere.
    connections.create_connection(
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])

    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])

    c = IngestClient(es)
    try:
        # Probe for the pipeline; it is created only when the lookup fails
        # with NotFoundError.
        c.get_pipeline('geoip')
    except NotFoundError:
        c.put_pipeline('geoip',
                       body='''{
              "description" : "Add geoip info",
              "processors" : [
                {
                  "geoip" : {
                    "field" : "ip"
                  }
                }
              ]
            }''')


class GeoIp(InnerDoc):
Beispiel #2
0
class IngestConnector:
    """Manage one Elasticsearch ingest pipeline and the index fed through it.

    The index name is derived from the pipeline id (``<pipeline_id>_index``).
    Documents are indexed through the pipeline, which runs a single
    ``attachment`` processor on ``field``; searches match against
    ``attachment.content``.
    """

    def __init__(
            self,
            pipeline_id: str = "pdf_content",
            field: str = "data",
            pipeline_description: str = "Extracting info from pdf content"):
        """Store the pipeline configuration and build the ingest client.

        Args:
            pipeline_id: Id of the ingest pipeline; also used as the prefix
                of the index name.
            field: Name of the document field the ``attachment`` processor
                reads from.
            pipeline_description: Description stored with the pipeline.
        """
        self.pipeline_id: str = pipeline_id
        self.index_name: str = pipeline_id + "_index"
        self.field: str = field
        self.pipeline_description: str = pipeline_description

        self.ingest_client = IngestClient(current_app.elasticsearch)

    def create_pipeline(self):
        """Register the pipeline with one ``attachment`` processor."""
        self.ingest_client.put_pipeline(
            id=self.pipeline_id,
            body={
                'description': self.pipeline_description,
                'processors': [{
                    "attachment": {
                        "field": self.field
                    }
                }]
            })

    def delete_pipeline(self):
        """Delete the pipeline identified by ``self.pipeline_id``."""
        self.ingest_client.delete_pipeline(id=self.pipeline_id)

    def get_pipeline(self):
        """Return the client's response for ``self.pipeline_id``."""
        return self.ingest_client.get_pipeline(id=self.pipeline_id)

    def add_to_index(self, id_: int, content: str, content_page: int,
                     content_paragraph: int):
        """Index one piece of content through the pipeline.

        Args:
            id_: Document id to index under.
            content: Text to index; it is UTF-8 encoded and then base64
                encoded before being stored in ``self.field``.
            content_page: Value stored as ``content_page``.
            content_paragraph: Value stored as ``content_paragraph``.
        """
        encoded = base64.b64encode(content.encode("utf-8")).decode("utf-8")
        current_app.elasticsearch.index(
            index=self.index_name,
            id=id_,
            pipeline=self.pipeline_id,
            body={
                self.field: encoded,
                "content_page": content_page,
                "content_paragraph": content_paragraph,
            })

    def remove_from_index(self, id_: int):
        """Delete the document with id ``id_`` from the index."""
        current_app.elasticsearch.delete(index=self.index_name, id=id_)

    def api_search(self, query: str):
        """Run a ``match`` query on ``attachment.content``.

        Returns:
            The raw search response from the Elasticsearch client.
        """
        return current_app.elasticsearch.search(
            index=self.index_name,
            body={"query": {
                "match": {
                    "attachment.content": query
                }
            }})

    def search(self, query: str):
        """Return the best-placed ``KnowledgePdfContent`` row for ``query``.

        Hit ids are read from the search response in the order they were
        returned; the database rows are ordered by that same position and
        the first one is returned.

        Returns:
            The first matching ``KnowledgePdfContent`` row, or ``None`` when
            the search has no hits or none of the hit ids exist in the
            database.
        """
        hits = self.api_search(query)['hits']['hits']
        ids = [int(hit['_id']) for hit in hits]

        if not ids:
            return None

        # (id, position) pairs: a CASE expression mapping every id to its
        # position in the search response, so the ORDER BY reproduces the
        # order of the hits.
        when = [(doc_id, position) for position, doc_id in enumerate(ids)]

        # Only the top row is needed, so fetch a single record rather than
        # loading every match and discarding all but the first.
        return KnowledgePdfContent.query.filter(
            KnowledgePdfContent.id.in_(ids)).order_by(
                db.case(when, value=KnowledgePdfContent.id)).first()
Beispiel #3
0
# Simulate the ingest pipeline described by ``body`` before storing it.
IngestClient.simulate(es, body)

# In[ ]:

# Store the pipeline for use in prod; its id is derived from the model id.
pipeline_name = model_id + '_ingest_pipeline'
body = {'description': 'predict flower type', 'processors': processors}

IngestClient.put_pipeline(es, id=pipeline_name, body=body)

# In[ ]:

# Verify the pipeline was stored.
IngestClient.get_pipeline(es, pipeline_name)

# In[ ]:

# Create an index template with our new pipeline as the default pipeline.
# The template reuses ``pipeline_name`` rather than a hard-coded id so that
# it always points at the pipeline stored above, whatever ``model_id`` is.

settings = {
    "index_patterns": ["flower_measurements-*"],
    "settings": {
        "default_pipeline": pipeline_name
    }
}

template_name = 'flowers_measurement'
IndicesClient.put_template(es, name=template_name, body=settings)