Exemple #1
0
    def create_env(self, id: str = None, name: str = None):
        """
        Create a new env document and all the lists that belong to it.

        TODO: Verify the name correctly

        :param id: Alphanumeric id for the env. When omitted, one is
                   generated with gen_uuid().
        :param name: Name stored in the env document. Its uniqueness is
                     not checked.
        :return: Env built from the resulting id and the given name.
        :raises EnvAlreadyExistWithSameIdException: when a document with
                the supplied id is already present in the env index.
        """
        self.authorization.can_create_env()
        conn = get_es_conn()
        # TODO validate that you can not create 2 envs with the same name.
        if not id:
            envId = gen_uuid()
        else:
            envId = id
            try:
                conn.get(index=self.envIndex, doc_type="default", id=envId)
            except exceptions.NotFoundError:
                # Nothing stored under this id yet, so it is free to use.
                pass
            else:
                raise EnvAlreadyExistWithSameIdException(
                    "Id: {0}".format(envId))

        conn.create(index=self.envIndex,
                    doc_type="default",
                    id=envId,
                    body={"name": name})

        # create corpuses
        create_all_lists_for_env(envId, self.authorization)

        return Env(envId, name)
Exemple #2
0
    def _create_data_index_if_not_exist(self,
                                        docType="default",
                                        allowDynamicFields=True) -> str:
        """
        Return the data index registered for docType, creating it if needed.
        This method should be considered private.

        The registration is a "directory_type" document in self.typeIndex
        whose id is docType and whose "indexName" field names the data index.
        When no registration exists, one is written and the index itself is
        created with the shard, replica, result-window and analysis settings.

        :param docType:             Document type whose data index is wanted.
                                    The "default" type always uses
                                    self.defaultDataIndex as index name.
        :param allowDynamicFields:  If false, the mapping of docType is
                                    created with "dynamic" set to "strict".
        :return:    name of the index
        """
        conn = get_es_conn()
        try:
            es_wait_ready()
            registration = conn.get(index=self.typeIndex,
                                    doc_type="directory_type",
                                    id=docType)
            return registration["_source"]["indexName"]
        except exceptions.NotFoundError:
            generatedName = self.dataIndexPrefix + "_" + gen_uuid()
            # adding standard name generation for default indexes
            targetIndex = (self.defaultDataIndex
                           if docType == "default" else generatedName)

            conn.create(index=self.typeIndex,
                        doc_type="directory_type",
                        id=docType,
                        body={"indexName": targetIndex})

            indexBody = {
                "settings": {
                    "index": {
                        "max_result_window": MAX_RESULT_WINDOW,
                        "number_of_shards": get_number_of_shards(),
                        "number_of_replicas": get_number_of_replicas()
                    },
                    "analysis": ANALYSIS
                }
            }
            if not allowDynamicFields:
                # Reject documents carrying fields absent from the mapping.
                indexBody["mappings"] = {docType: {"dynamic": "strict"}}
            conn.indices.create(targetIndex, body=indexBody)
            es_wait_ready()
            # artificial pause to make sure index is created and ready
            time.sleep(1)
            return targetIndex
Exemple #3
0
    def add_annotations(self, bucketId: str, zipFilePath: str):
        """
        Bulk-index the annotations found in a zip archive into a bucket.

        Every member of the archive is decoded as UTF-8 JSON and its "data"
        entry is iterated as the list of annotations. An annotation is
        indexed under its "schemaType", in the index the bucket associates
        with that type, using its "annotationId" as document id (a generated
        id when the field is absent; the field is removed from the indexed
        source). Problems are collected and returned instead of being
        raised, so one bad annotation or file does not stop the others.

        :param bucketId:        Id of the bucket (inside self.corpusId) that
                                receives the annotations.
        :param zipFilePath:     Local path of the zip archive to read.
        :return:    None when no error was collected, otherwise
                    {"data": [error1, ... ,errorN], "totalErrorsCount": n}
                    where "data" holds at most 10000 errors and
                    "totalErrorsCount" counts the errors of every file.
        """
        logger = logging.getLogger(__name__)
        ANNOTATION_ID_FIELD = "annotationId"
        DOCUMENT_ID_FIELD = "_documentID"
        ERROR_DOCUMENT_ID_FIELD = "_documentID"
        ERROR_ANNOTATION_FIELD = "annotation"
        ERROR_MESSAGE_FIELD = "message"
        ERROR_TYPE_FIELD = "errorType"
        ANNOTATION_ERROR_TYPE = "annotation"
        OTHER_ERROR_TYPE = "other"
        MAX_ERRORS_TO_RETURN = 10000

        def build_annotation_error(annotation, documentId, message):
            # Error record tied to one specific annotation.
            return {
                ERROR_ANNOTATION_FIELD: annotation,
                ERROR_DOCUMENT_ID_FIELD: documentId,
                ERROR_TYPE_FIELD: ANNOTATION_ERROR_TYPE,
                ERROR_MESSAGE_FIELD: message
            }

        def build_other_error(message):
            # Error record that cannot be tied to a specific annotation.
            return {ERROR_TYPE_FIELD: OTHER_ERROR_TYPE, "message": message}

        def build_bulk_error(error, annotationById):
            # Translate one bulk failure into an annotation error, or return
            # None when it cannot be matched to a submitted annotation.
            indexResult = error.get("index") if isinstance(error,
                                                           dict) else None
            if not isinstance(indexResult, dict):
                return None
            if "_id" not in indexResult:
                return None
            if indexResult["_id"] not in annotationById:
                return None

            annotation = annotationById[indexResult["_id"]]
            documentId = None
            if DOCUMENT_ID_FIELD in annotation:
                documentId = annotation[DOCUMENT_ID_FIELD]

            if "error" not in indexResult:
                message = indexResult
            else:
                details = indexResult["error"]
                # Only a dict payload carries separate type/reason entries;
                # anything else is reported as-is.
                if (isinstance(details, dict) and "type" in details
                        and "reason" in details):
                    message = details["reason"]
                    if details["type"] == "strict_dynamic_mapping_exception":
                        message = ("Annotation fields dont respect schema. "
                                   + message)
                else:
                    message = details
            return build_annotation_error(annotation, documentId, message)

        errorsToReturn = []
        totalErrorsCount = 0
        # Context managers guarantee the file and the archive are closed,
        # whatever happens while processing.
        with open(zipFilePath, 'rb') as fh, zipfile.ZipFile(fh) as z:
            es = get_es_conn()
            bucket = get_master_bucket_list(self.envId,
                                            self.authorization).get_bucket(
                                                self.corpusId, bucketId)
            indicesPerType = bucket.get_es_index_per_schema_type()

            for name in z.namelist():
                errorAnnotations = []
                try:
                    bulkData = []
                    jsonData = json.loads(z.read(name).decode('utf-8'))
                    # Keeps the submitted form of each queued annotation so
                    # bulk failures can be reported against it.
                    annotationById = {}
                    for annotation in jsonData["data"]:
                        documentId = None
                        try:
                            # TODO make real annotation validation
                            if ANNOTATION_ID_FIELD in annotation:
                                annotationId = annotation[ANNOTATION_ID_FIELD]
                            else:
                                annotationId = gen_uuid()

                            if DOCUMENT_ID_FIELD in annotation:
                                documentId = annotation[DOCUMENT_ID_FIELD]

                            if "schemaType" not in annotation:
                                errorAnnotations.append(
                                    build_annotation_error(
                                        annotation, documentId,
                                        "Missing schemaType"))
                            elif annotation["schemaType"] not in indicesPerType:
                                errorAnnotations.append(
                                    build_annotation_error(
                                        annotation, documentId,
                                        "Bucket doesnt contain a schema for schemaType:{0}"
                                        .format(annotation["schemaType"])))
                            elif annotationId in annotationById:
                                errorAnnotations.append(
                                    build_annotation_error(
                                        annotation, documentId,
                                        "Duplicate annotation ID."))
                            else:
                                if ANNOTATION_ID_FIELD in annotation:
                                    # Keep a copy with the id for error
                                    # reports; the indexed source drops it
                                    # since it becomes the document _id.
                                    annotationById[
                                        annotationId] = annotation.copy()
                                    del annotation[ANNOTATION_ID_FIELD]
                                else:
                                    annotationById[annotationId] = annotation

                                bulkData.append({
                                    "_index":
                                    indicesPerType[annotation["schemaType"]],
                                    "_type":
                                    annotation["schemaType"],
                                    "_id":
                                    annotationId,
                                    "_source":
                                    annotation
                                })
                        except Exception as e:
                            failure = build_annotation_error(
                                annotation, documentId, str(e))
                            errorAnnotations.append(failure)
                            logger.info(failure)
                            logger.info(
                                {"ERROR TRACEBACK": traceback.format_exc()})

                    try:
                        helpers.bulk(es, bulkData)
                    except helpers.BulkIndexError as bie:
                        for error in bie.errors:
                            failure = build_bulk_error(error, annotationById)
                            if failure is None:
                                failure = build_other_error(str(error))
                            errorAnnotations.append(failure)
                            logger.info(failure)
                    except Exception as e:
                        failure = build_other_error(
                            "Bulk Insert Failed File: {0}\n{1}".format(
                                name, str(e)))
                        errorAnnotations.append(failure)
                        logger.error(failure)
                        logger.error(
                            {"ERROR TRACEBACK": traceback.format_exc()})
                except Exception as e:
                    failure = build_other_error(
                        "Failed to process file: {0}\n{1}".format(
                            name, str(e)))
                    errorAnnotations.append(failure)
                    logger.error(failure)
                    logger.error({"ERROR TRACEBACK": traceback.format_exc()})

                # Count every error, but only return up to the cap.
                totalErrorsCount += len(errorAnnotations)
                remaining = MAX_ERRORS_TO_RETURN - len(errorsToReturn)
                errorsToReturn.extend(errorAnnotations[:remaining])

        if errorsToReturn:
            return {
                "data": errorsToReturn,
                "totalErrorsCount": totalErrorsCount
            }
        return None
Exemple #4
0
    def add_or_update_document(self,
                               jsonDocument,
                               id: str = None,
                               docType: str = "default",
                               esProperties: dict = None,
                               update: bool = False,
                               createDataIndexIfNotExist: bool = True):
        """
        Add a jsonDocument, or partially update an existing one.

        The document goes into the data index registered for docType. When
        esProperties is supplied, the schema of docType is first extended
        through add_or_update_schema (with only the delta against the
        current mapping when that mapping can be read).

        Precondition: esProperties is a valid ElasticSearch mapping, containing a property field.

        :param jsonDocument:        Python object representing the json document to index. Without schema the whole document is indexed.
        :param id:                  Id which uniquely identifies this document among documents of the same type. Generated when omitted.
        :param docType:             Type of the document. All documents with the same type have the same structure.
        :param esProperties:        Properties section of elastic search mappings. Allows to dynamically change the schema when adding the document.
        :param update:              If true, the existing document with this id is partially updated with jsonDocument.
                                    If false, the document is created and an already existing id is an error.
        :param createDataIndexIfNotExist:   If true, creates the data index of docType when it does not exist.
                                            If false, a missing index registration is an error.
        :return:                    Id of the created or updated document
        :raises DocumentDirectoryNoSchemaFoundException: no index is registered for docType and createDataIndexIfNotExist is false.
        :raises DocumentAlreadyExistsException: update is false and a document with this id already exists.
        :raises DocumentDoesNotRespectSchemaException: update is false and the document was rejected with a strict_dynamic_mapping_exception.
        :raises exceptions.TransportError: any other transport failure while creating the document.
        """

        # TODO, since it will create one index per document type
        # Eventually we will need to merge the different indexes.

        logger = logging.getLogger(__name__)
        es = get_es_conn()

        # get or create index
        if createDataIndexIfNotExist:
            indexName = self._create_data_index_if_not_exist(docType, True)
        else:
            try:
                es_wait_ready()
                res = es.get(index=self.typeIndex,
                             doc_type="directory_type",
                             id=docType)
                indexName = res["_source"]["indexName"]
            except exceptions.NotFoundError:
                raise DocumentDirectoryNoSchemaFoundException(
                    "No schema found for type:{0}".format(docType))

        # update schema if supplied
        if esProperties:
            esPropertiesFrom = None
            try:
                esMappingFrom = es.indices.get_mapping(index=indexName,
                                                       doc_type=docType)
                esPropertiesFrom = esMappingFrom[indexName]["mappings"][
                    docType]
            except Exception:
                # Deliberate best effort: when the current mapping cannot be
                # read, the supplied properties are applied as they are.
                pass

            if esPropertiesFrom:
                self.add_or_update_schema(
                    SchemaList.calculate_properties_delta(
                        esPropertiesFrom, esProperties), docType)
            else:
                self.add_or_update_schema(esProperties, docType)

        if not id:
            id = gen_uuid()

        if update:
            es.update(index=indexName,
                      doc_type=docType,
                      body={"doc": jsonDocument},
                      id=id)
            # The update path must report the id just like the create path.
            return id

        try:
            es.create(index=indexName,
                      doc_type=docType,
                      body=jsonDocument,
                      id=id)
            return id
        except exceptions.ConflictError:
            logger.info(
                "Adding existing document {0} for type {1} in index {2}".
                format(id, docType, indexName))
            raise DocumentAlreadyExistsException()
        except exceptions.TransportError as te:
            # The error payload is only inspected when it is a dict.
            info = te.info
            errorInfo = info.get("error") if isinstance(info, dict) else None
            if isinstance(errorInfo, dict) and errorInfo.get(
                    "type") == "strict_dynamic_mapping_exception":
                raise DocumentDoesNotRespectSchemaException(
                    "Document id: {0}".format(id))
            # Any other failure means nothing was created: let the caller
            # see it instead of silently returning None.
            raise
Exemple #5
0
    def add_json_schema(self,
                        jsonSchema: dict,
                        name: str = None,
                        description: str = None,
                        id: str = None,
                        shouldValidate: bool = False,
                        nestedFields=None):
        """
        This function is used by the user to add his json schema, and be able to search it.

        The schema is converted to elastic search properties which are
        registered through add_es_schema, then an entry holding the schema,
        its hash and the es hash is created in self.masterJsonSchemaIndex.

        :param jsonSchema:          Json Schema
        :param name:                Optional name used by the user to search for the schema
        :param description:         Optional description used by the user to search for the schema
        :param id:                  Id to uniquely identify this JsonSchema. Generated when omitted.
        :param shouldValidate:      Validation is not implemented: a true value raises NotImplementedError.
        :param nestedFields:        Lists arrays which should be considered as nested properties in es.
                                    Defaults to an empty list.
        :return:                    jsonSchemaInfoId: used to get the information about json schema
        :raises JsonSchemaAlreadyExistsException: a schema entry with the given id already exists.
        :raises NotImplementedError: shouldValidate is true.
        """
        # A fresh list per call: a shared default list would leak state
        # between calls if it were ever mutated downstream.
        if nestedFields is None:
            nestedFields = []

        self.authorization.can_add_json_schema()
        jsonSchemaStr = json.dumps(jsonSchema)
        jsonSchemaHash = SchemaList.hash_json_schema(jsonSchemaStr)
        esProperties = self.convert_json_schema_to_es_properties(
            jsonSchema, nestedFields)
        esHash = self.add_es_schema(esProperties)

        # Optional descriptive fields are only stored when provided.
        entry = {}
        if name:
            entry["name"] = name
        if description:
            entry["description"] = description

        es = get_es_conn()
        if id:
            try:
                es.get(index=self.masterJsonSchemaIndex,
                       doc_type="default",
                       id=id)
                raise JsonSchemaAlreadyExistsException(
                    "Schema with same id already exist: {0}".format(id))
            except exceptions.NotFoundError:
                # No entry under this id yet, so it is free to use.
                pass
        else:
            id = gen_uuid()

        if shouldValidate:
            raise NotImplementedError()

        entry["jsonSchemaHash"] = jsonSchemaHash
        entry["esHash"] = esHash
        entry["jsonSchema"] = jsonSchemaStr
        es.create(index=self.masterJsonSchemaIndex,
                  doc_type="default",
                  id=id,
                  body=entry)

        return id