class FacetScannerUpdateHandler(UpdateHandler):
    """
    Update handler which extracts facets from deposited files and writes
    them to the configured files index in Elasticsearch.
    """

    def __init__(self, conf: 'YamlConfig', **kwargs):
        """
        Declare the attributes filled in by ``setup_extra`` and then hand
        over to the parent constructor.

        :param conf:    Configuration object, passed on to the parent class
        :param kwargs:  Passed on to the parent class unchanged
        """
        # Declared before the parent constructor runs so the attributes
        # always exist on the instance.
        self.facet_scanner = self.es = None

        super().__init__(conf, **kwargs)

    def setup_extra(self, **kwargs) -> None:
        """
        Create the facet scanner and the Elasticsearch client.

        :param kwargs: Accepted but not used by this method
        """
        self.logger.info('Loading facet scanner')
        self.facet_scanner = FacetScanner()

        # The API key from the configuration is sent as the x-api-key header
        request_headers = {
            'x-api-key': self.conf.get('elasticsearch', 'es_api_key')
        }
        self.es = CEDAElasticsearchClient(headers=request_headers)

    def process_event(self, message: 'IngestMessage'):
        """
        Scan the file for facets. Only DEPOSIT actions are handled, every
        other action is ignored.

        :param message: The message to process
        :return: None
        """
        if message.action != 'DEPOSIT':
            return

        self._process_deposits(message)

    def _process_deposits(self, message: 'IngestMessage'):
        """
        Extract the facets for the deposited file and upsert them into the
        files index.

        :param message: Message whose ``filepath`` is scanned
        """
        # The file has to be accessible on the filesystem before scanning
        self._wait_for_file(message)

        filepath = message.filepath

        # The handler for this path extracts the facets and names the project
        scanner_handler = self.facet_scanner.get_handler(filepath)
        extracted_facets = scanner_handler.get_facets(filepath)

        # Facets are nested under the handler's project name
        partial_document = {
            'projects': {scanner_handler.project_name: extracted_facets}
        }

        target_index = self.conf.get('files_index', 'name')

        # Upsert so the document is created when it does not exist yet
        self.es.update(
            index=target_index,
            id=PathTools.generate_id(filepath),
            body={'doc': partial_document, 'doc_as_upsert': True},
        )
# Example no. 2
# 0
class IndexUpdaterBase(object):
    """
    Base class for index updaters. Contains common methods.
    """

    def __init__(self, index, **kwargs):
        """
        Store the target index and create the Elasticsearch client.

        :param index:   Index to update
        :param kwargs:  Passed unchanged to CEDAElasticsearchClient
        """
        self.index = index
        self.es = CEDAElasticsearchClient(**kwargs)

    @staticmethod
    def _get_action_key(es_response_item):
        """
        Get the action key for processing the response

        :param es_response_item: Single entry from the "items" list of a bulk
                                 response, keyed by the action performed
        :return: key
        :raises IndexError: When the item contains none of the known actions
        """
        # Checked in a fixed order so the result is deterministic
        for action in ("update", "index", "delete", "create"):
            if action in es_response_item:
                return action

        # IndexError is what the previous implementation raised here
        raise IndexError(
            "No known bulk action found in response item keys: {}".format(
                sorted(es_response_item)))

    def _scroll_search(self, query, size=1000):
        """
        Perform a scroll search query

        :param query:   The query to perform
        :param size:    Size to return in each scroll. (default: 1000)
        :return:        Generator of results
        """

        return scan(self.es,
                    query=query,
                    scroll='1m',
                    index=self.index,
                    size=size)

    def _bulk_action(self, action_list, api="bulk", process_results=True):
        """
        Perform bulk action to elasticsearch. This is either bulk|msearch. Default: bulk

        :param action_list:     List of request bodies, one request is sent per entry.
        :param api:             Which api to call, bulk|msearch. (default: bulk)
        :param process_results: True: return consolidated report. False: return the raw responses
        :return Consolidated report.
                    when api == bulk    returns {"success": int, "failed": int, "failed_items": list}
                    when api == msearch returns list with three levels as described below
                    [           # Container for the response
                        [       # Collection of all the responses in a block as submitted to elasticsearch
                            []  # Individual query responses
                        ]
                    ]
        :raises ValueError: When api is not one of bulk|msearch
        """

        response_list = []
        for action in tqdm(action_list,
                           desc="Processing queries",
                           file=sys.stdout):

            if api == "bulk":
                response = self.es.bulk(index=self.index, body=action)
            elif api == "msearch":
                response = self.es.msearch(body=action)
            else:
                raise ValueError(
                    "Invalid api selected. Must be of either bulk|msearch")

            response_list.append(response)

        return self._process_bulk_action_response(response_list,
                                                  api,
                                                  process=process_results)

    def _generate_bulk_operation_body(self, content_list, action="index",
                                      batch_size=800):
        """
        Generate the query body for the bulk operation

        :param content_list:    List of dictionaries containing the content to be actioned upon.
                                index|update items need "id" and "document", delete items
                                need "id" and search items need "query".
        :param action:          The elasticsearch action to perform. (index|update|delete|search) (default: index)
        :param batch_size:      Number of items placed in each request body. (default: 800)
        :return:                List of newline delimited JSON request bodies, one per batch.
        :raises ValueError:     When the action is not supported or batch_size is below 1
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        bulk_action_list = []
        # Lines of the batch being built, joined once the batch is complete
        lines = []

        for count, item in enumerate(content_list, 1):

            if action == "index":
                lines.append(json.dumps(
                    {"index": {
                        "_index": self.index,
                        "_id": item["id"]
                    }}))
                lines.append(json.dumps(item["document"]))

            elif action == "update":
                lines.append(json.dumps(
                    {"update": {
                        "_index": self.index,
                        "_id": item["id"]
                    }}))
                lines.append(json.dumps({
                    "doc": item["document"],
                    "doc_as_upsert": True
                }))

            elif action == "delete":
                # Delete operations have a header line only
                lines.append(json.dumps(
                    {"delete": {
                        "_index": self.index,
                        "_id": item["id"]
                    }}))

            elif action == "search":
                # Searches are not addressed by id, so none is required
                lines.append(json.dumps({"index": self.index}))
                lines.append(json.dumps(item["query"]))

            else:
                raise ValueError(
                    "Incorrect action supplied. Must be of either index|update|delete|search"
                )

            # Start a new request body every batch_size items
            if count % batch_size == 0:
                bulk_action_list.append("\n".join(lines) + "\n")
                lines = []

        # Clean up any remaining jobs
        if lines:
            bulk_action_list.append("\n".join(lines) + "\n")

        return bulk_action_list

    def _process_bulk_action_response(self,
                                      action_response,
                                      api,
                                      process=True):
        """
        Process the bulk action response and generate a consolidated report of actions

        :param action_response: List of responses from the elasticsearch bulk|msearch api calls
        :param api:             Whether api used was bulk or msearch
        :param process:         True: return consolidated response. False: Return raw response
        :return: Consolidated report | Raw response based on process flag.
        :raises ValueError: When process is True and api is not one of bulk|msearch
        """

        # Return raw response
        if not process:
            return action_response

        if api == "bulk":
            success = 0
            failed = 0
            items_failed = []

            for action in action_response:
                # If there are no errors in the high level json. All items succeeded
                if not action["errors"]:
                    success += len(action["items"])
                    continue

                # Some or all items failed
                for item in action["items"]:
                    result = item[self._get_action_key(item)]

                    # If 2xx HTTP response. Successful
                    if 200 <= result["status"] < 300:
                        success += 1
                        continue

                    failed += 1
                    items_failed.append({
                        "id": result["_id"],
                        "status": result["status"],
                        # Not every non-2xx item carries an error object
                        # (e.g. a delete of a missing document reports 404
                        # without one), so a missing error is reported as None
                        "error": result.get("error")
                    })

            return {
                "success": success,
                "failed": failed,
                "failed_items": items_failed
            }

        elif api == "msearch":
            # One list per request, holding the hits of each query in it
            return [
                [response["hits"]["hits"] for response in action["responses"]]
                for action in action_response
            ]

        else:
            raise ValueError(
                "Invalid api selected. Must be of either bulk|msearch")

    def _create_id(self, string):
        """
        Create the sha1 hex digest of the given value

        :param string: Text or bytes to hash. Text is encoded as UTF-8 first
        :return: sha1 hex digest
        """
        # hashlib only accepts bytes-like input on Python 3
        if isinstance(string, str):
            string = string.encode("utf-8")

        return hashlib.sha1(string).hexdigest()

    def _add_item(self, id, doc):
        """
        Update a single document, creating it if it does not exist

        :param id:  Id of the document to update
        :param doc: Dictionary containing the document body
        """
        document = {'doc': doc, 'doc_as_upsert': True}

        self.es.update(index=self.index, id=id, body=document)