def create_env(self, id: str = None, name: str = None):
    """
    Create a new environment document and its associated lists.

    :param id: Alphanumeric. If not presented the system will generate one.
    :param name: Name to give to the env. Not used ATM.
    :return: the created ``Env``.
    :raises EnvAlreadyExistWithSameIdException: if an env with ``id``
        already exists.
    """
    self.authorization.can_create_env()
    es = get_es_conn()
    # TODO validate that you can not create 2 envs with the same name.
    if id:
        idName = id
        try:
            # Existence probe only — the fetched document itself is unused.
            es.get(index=self.envIndex, doc_type="default", id=idName)
            # get() succeeded, so the id is already taken.
            raise EnvAlreadyExistWithSameIdException(
                "Id: {0}".format(idName))
        except exceptions.NotFoundError:
            # Id is free; proceed with creation.
            pass
    else:
        idName = gen_uuid()
    es.create(index=self.envIndex,
              doc_type="default",
              id=idName,
              body={"name": name})
    # create corpuses
    create_all_lists_for_env(idName, self.authorization)
    return Env(idName, name)
def _create_data_index_if_not_exist(self,
                                    docType="default",
                                    allowDynamicFields=True) -> str:
    """
    Create the data index backing ``docType`` if it does not exist.

    This method should be considered private.

    :param docType: Document type whose backing index is looked up in the
        directory index and created on a miss.
    :param allowDynamicFields: If False dynamic fields will return an error
        when the user attempts to add a field not in schema.
    :return: name of the index
    """
    es = get_es_conn()
    try:
        es_wait_ready()
        res = es.get(index=self.typeIndex,
                     doc_type="directory_type",
                     id=docType)
        indexName = res["_source"]["indexName"]
    except exceptions.NotFoundError:
        indexName = self.dataIndexPrefix + "_" + gen_uuid()
        # adding standard name generation for default indexes
        if docType == "default":
            indexName = self.defaultDataIndex
        # Register the type -> index mapping in the directory index.
        es.create(index=self.typeIndex,
                  doc_type="directory_type",
                  id=docType,
                  body={"indexName": indexName})
        body = {
            "settings": {
                "index": {
                    "max_result_window": MAX_RESULT_WINDOW,
                    "number_of_shards": get_number_of_shards(),
                    "number_of_replicas": get_number_of_replicas()
                },
                "analysis": ANALYSIS
            }
        }
        if not allowDynamicFields:
            # strict mapping: indexing a field absent from the schema fails.
            body["mappings"] = {docType: {"dynamic": "strict"}}
        es.indices.create(indexName, body=body)
        es_wait_ready()
        # artificial pause to make sure index is created and ready
        time.sleep(1)
    return indexName
def add_annotations(self, bucketId: str, zipFilePath: str):
    """
    Bulk-index annotations read from a local zip of JSON files into the
    bucket's per-schema-type ES indices, collecting per-annotation errors.

    :param bucketId: Id of the bucket whose schema-type indices receive
        the annotations.
    :param zipFilePath: Local location to where to where to save zip file.
    :param errorUrl: Storage url to where to save errors.
    :return: {"data" :[error1, ... ,errorN]
    """
    logger = logging.getLogger(__name__)
    # NOTE(review): fh and z are never closed — consider a with-statement;
    # confirm no caller relies on the handle staying open.
    fh = open(zipFilePath, 'rb')
    z = zipfile.ZipFile(fh)
    es = get_es_conn()
    bucket = get_master_bucket_list(self.envId,
                                    self.authorization).get_bucket(
                                        self.corpusId, bucketId)
    # schemaType -> ES index name; annotations of unknown types are rejected.
    indicesPerType = bucket.get_es_index_per_schema_type()
    errorsToReturn = []
    # Field names read from input annotations.
    ANNOTATION_ID_FIELD = "annotationId"
    DOCUMENT_ID_FIELD = "_documentID"
    # Field names written into produced error records.
    ERROR_DOCUMENT_ID_FIELD = "_documentID"
    ERROR_ANNOTATION_FIELD = "annotation"
    ERROR_MESSAGE_FIELD = "message"
    ERROR_TYPE_FIELD = "errorType"
    ANNOTATION_ERROR_TYPE = "annotation"
    OTHER_ERROR_TYPE = "other"
    # Cap on how many errors are returned to the caller (all are logged).
    MAX_ERRORS_TO_RETURN = 10000
    for name in z.namelist():
        try:
            bulkData = []
            errorAnnotations = []
            data = z.read(name).decode('utf-8')
            jsonData = json.loads(data)
            annotations = jsonData["data"]
            # id -> original annotation, used to map bulk-index errors
            # back to their source annotation.
            annotationById = {}
            for annotation in annotations:
                try:
                    # TODO make real annotation validation
                    annotationId = None
                    documentId = None
                    if ANNOTATION_ID_FIELD in annotation:
                        annotationId = annotation[ANNOTATION_ID_FIELD]
                    else:
                        annotationId = gen_uuid()
                    if DOCUMENT_ID_FIELD in annotation:
                        documentId = annotation[DOCUMENT_ID_FIELD]
                    if not "schemaType" in annotation:
                        errorAnnotations.append({
                            ERROR_ANNOTATION_FIELD: annotation,
                            ERROR_DOCUMENT_ID_FIELD: documentId,
                            ERROR_TYPE_FIELD: ANNOTATION_ERROR_TYPE,
                            ERROR_MESSAGE_FIELD: "Missing schemaType"
                        })
                    elif not annotation["schemaType"] in indicesPerType:
                        errorAnnotations.append({
                            ERROR_ANNOTATION_FIELD: annotation,
                            ERROR_DOCUMENT_ID_FIELD: documentId,
                            ERROR_TYPE_FIELD: ANNOTATION_ERROR_TYPE,
                            ERROR_MESSAGE_FIELD:
                            "Bucket doesnt contain a schema for schemaType:{0}"
                            .format(annotation["schemaType"])
                        })
                    elif annotationId in annotationById:
                        errorAnnotations.append({
                            ERROR_ANNOTATION_FIELD: annotation,
                            ERROR_DOCUMENT_ID_FIELD: documentId,
                            ERROR_TYPE_FIELD: ANNOTATION_ERROR_TYPE,
                            ERROR_MESSAGE_FIELD: "Duplicate annotation ID."
                        })
                    else:
                        if ANNOTATION_ID_FIELD in annotation:
                            # Keep a copy with the id; the indexed document
                            # carries the id only as the ES _id.
                            annotationById[annotationId] = annotation.copy()
                            del annotation[ANNOTATION_ID_FIELD]
                        else:
                            annotationById[annotationId] = annotation
                        recordToAdd = {}
                        recordToAdd["_index"] = indicesPerType[
                            annotation["schemaType"]]
                        recordToAdd["_type"] = annotation["schemaType"]
                        recordToAdd["_id"] = annotationId
                        recordToAdd["_source"] = annotation
                        bulkData.append(recordToAdd)
                except Exception as e:
                    # Per-annotation failure: record and log, keep going.
                    errorAnnotations.append({
                        ERROR_ANNOTATION_FIELD: annotation,
                        ERROR_DOCUMENT_ID_FIELD: documentId,
                        ERROR_TYPE_FIELD: ANNOTATION_ERROR_TYPE,
                        ERROR_MESSAGE_FIELD: str(e)
                    })
                    logger.info({
                        ERROR_ANNOTATION_FIELD: annotation,
                        ERROR_DOCUMENT_ID_FIELD: documentId,
                        ERROR_TYPE_FIELD: ANNOTATION_ERROR_TYPE,
                        ERROR_MESSAGE_FIELD: str(e)
                    })
                    logger.info(
                        {"ERROR TRACEBACK": traceback.format_exc()})
            try:
                helpers.bulk(es, bulkData)
            except helpers.BulkIndexError as bie:
                # Map each per-item bulk error back to its annotation.
                for error in bie.errors:
                    annotationError = {}
                    if "index" in error:
                        if "_id" in error["index"]:
                            if error["index"]["_id"] in annotationById:
                                annotation = annotationById[error["index"]
                                                            ["_id"]]
                                documentId = None
                                if DOCUMENT_ID_FIELD in annotation:
                                    documentId = annotation[
                                        DOCUMENT_ID_FIELD]
                                if "error" in error["index"]:
                                    if "type" in error["index"][
                                            "error"] and "reason" in error[
                                                "index"]["error"]:
                                        # strict mapping rejection gets a
                                        # friendlier message prefix.
                                        if error["index"]["error"][
                                                "type"] == "strict_dynamic_mapping_exception":
                                            annotationError = {
                                                ERROR_ANNOTATION_FIELD:
                                                annotation,
                                                ERROR_DOCUMENT_ID_FIELD:
                                                documentId,
                                                ERROR_TYPE_FIELD:
                                                ANNOTATION_ERROR_TYPE,
                                                ERROR_MESSAGE_FIELD:
                                                "Annotation fields dont respect schema. "
                                                + error["index"]["error"]
                                                ["reason"]
                                            }
                                        else:
                                            annotationError = {
                                                ERROR_ANNOTATION_FIELD:
                                                annotation,
                                                ERROR_DOCUMENT_ID_FIELD:
                                                documentId,
                                                ERROR_TYPE_FIELD:
                                                ANNOTATION_ERROR_TYPE,
                                                ERROR_MESSAGE_FIELD:
                                                error["index"]["error"]
                                                ["reason"]
                                            }
                                    else:
                                        annotationError = {
                                            ERROR_ANNOTATION_FIELD:
                                            annotation,
                                            ERROR_DOCUMENT_ID_FIELD:
                                            documentId,
                                            ERROR_TYPE_FIELD:
                                            ANNOTATION_ERROR_TYPE,
                                            ERROR_MESSAGE_FIELD:
                                            error["index"]["error"]
                                        }
                                else:
                                    annotationError = {
                                        ERROR_ANNOTATION_FIELD: annotation,
                                        ERROR_DOCUMENT_ID_FIELD: documentId,
                                        ERROR_TYPE_FIELD:
                                        ANNOTATION_ERROR_TYPE,
                                        ERROR_MESSAGE_FIELD: error["index"]
                                    }
                    if annotationError:
                        errorAnnotations.append(annotationError)
                        logger.info(annotationError)
                    else:
                        # Bulk error that couldn't be mapped to an annotation.
                        errorAnnotations.append({
                            ERROR_TYPE_FIELD: OTHER_ERROR_TYPE,
                            "message": str(error)
                        })
                        logger.info({
                            ERROR_TYPE_FIELD: OTHER_ERROR_TYPE,
                            "message": str(error)
                        })
            except Exception as e:
                # Whole bulk call failed (not an item-level error).
                errorAnnotations.append({
                    ERROR_TYPE_FIELD: OTHER_ERROR_TYPE,
                    "message":
                    "Bulk Insert Failed File: {0}\n{1}".format(
                        name, str(e))
                })
                logger.error({
                    ERROR_TYPE_FIELD: OTHER_ERROR_TYPE,
                    "message":
                    "Bulk Insert Failed File: {0}\n{1}".format(
                        name, str(e))
                })
                logger.error({"ERROR TRACEBACK": traceback.format_exc()})
        except Exception as e:
            # File-level failure (unreadable entry, bad JSON, ...).
            errorAnnotations.append({
                ERROR_TYPE_FIELD: OTHER_ERROR_TYPE,
                "message":
                "Failed to process file: {0}\n{1}".format(name, str(e))
            })
            logger.error({
                ERROR_TYPE_FIELD: OTHER_ERROR_TYPE,
                "message":
                "Failed to process file: {0}\n{1}".format(name, str(e))
            })
            logger.error({"ERROR TRACEBACK": traceback.format_exc()})
        # Accumulate this file's errors up to the return cap.
        for error in errorAnnotations:
            if len(errorsToReturn) < MAX_ERRORS_TO_RETURN:
                errorsToReturn.append(error)
            else:
                break
    if errorsToReturn:
        # NOTE(review): totalErrorsCount reflects only the LAST file's
        # errorAnnotations, not the total across files — confirm intent.
        return {
            "data": errorsToReturn,
            "totalErrorsCount": len(errorAnnotations)
        }
    else:
        return None
def add_or_update_document(self,
                           jsonDocument,
                           id: str = None,
                           docType: str = "default",
                           esProperties: dict = None,
                           update: bool = False,
                           createDataIndexIfNotExist: bool = True):
    """
    Adds a jsonDocument. If for a give type of document the esSchemaMapping
    is not compatible, it will throw ConflictingSchemaException.

    Precondition: esSchemaMapping is a valid ElasticSearch mapping,
    containing a property field. If esSchemaMapping is present, it will
    validate that it is compatible with the schema for docs of doc_type.

    :param id: id which will uniquely identify this document between all
        document of the same type.
    :param jsonDocument: jsonContaining the document to be indexed. This
        should be a python object representing the json. Without schema the
        whole document is indexed.
    :param docType: Type of the document. All document with the same type
        have the same structure.
    :param esProperties: Properties sections of elastic search mappings.
        Allows to dynamically change the schema when adding the document.
    :param update: If true, update an existing document; if false, this
        function will throw an exception if trying to add an already
        existing document.
    :param createDataIndexIfNotExist: If true will create a data index if
        it doest exist for a given type. If false it will throw an error if
        user attempts to put data into an unexisting index.
    :return: Id of the created/updated document.
    :raises DocumentDirectoryNoSchemaFoundException: no index exists for
        ``docType`` and ``createDataIndexIfNotExist`` is False.
    :raises DocumentAlreadyExistsException: add of an existing id.
    :raises DocumentDoesNotRespectSchemaException: strict mapping rejected
        a document field.
    """
    # TODO, since it will create one index per document type
    # Eventually we will need to merge the different indexes.
    logger = logging.getLogger(__name__)
    es = get_es_conn()
    # get or create index
    indexName = ""
    if createDataIndexIfNotExist:
        indexName = self._create_data_index_if_not_exist(docType, True)
    else:
        try:
            es_wait_ready()
            res = es.get(index=self.typeIndex,
                         doc_type="directory_type",
                         id=docType)
            indexName = res["_source"]["indexName"]
        except exceptions.NotFoundError:
            raise DocumentDirectoryNoSchemaFoundException(
                "No schema found for type:{0}".format(docType))
    # update schema if supplied
    if esProperties:
        esPropertiesFrom = None
        try:
            esMappingFrom = es.indices.get_mapping(index=indexName,
                                                   doc_type=docType)
            esPropertiesFrom = esMappingFrom[indexName]["mappings"][docType]
        except Exception:
            # Best-effort: no existing mapping means no delta to compute.
            pass
        if esPropertiesFrom:
            self.add_or_update_schema(
                SchemaList.calculate_properties_delta(
                    esPropertiesFrom, esProperties), docType)
        else:
            self.add_or_update_schema(esProperties, docType)
    if not id:
        id = gen_uuid()
    if update:
        es.update(index=indexName,
                  doc_type=docType,
                  body={"doc": jsonDocument},
                  id=id)
        # Honor the documented contract on the update path too.
        return id
    else:
        try:
            es.create(index=indexName,
                      doc_type=docType,
                      body=jsonDocument,
                      id=id)
            return id
        except exceptions.ConflictError:
            logger.info(
                "Adding existing document {0} for type {1} in index {2}".
                format(id, docType, indexName))
            raise DocumentAlreadyExistsException()
        except exceptions.TransportError as te:
            if ("error" in te.info) and ("type" in te.info["error"]) and (
                    te.info["error"]["type"] ==
                    "strict_dynamic_mapping_exception"):
                raise DocumentDoesNotRespectSchemaException(
                    "Document id: {0}".format(id))
            # Don't swallow unexpected transport errors — previously this
            # fell through and silently returned None.
            raise
def add_json_schema(self,
                    jsonSchema: dict,
                    name: str = None,
                    description: str = None,
                    id: str = None,
                    shouldValidate: bool = False,
                    nestedFields=None):
    """
    This function is used by the user to add his json schema, and be able
    to search it. A json schema added by the system will be automatically
    searchable by hash.

    :param jsonSchema: Json Schema
    :param name: Optional name used by the user to search for the schema
    :param description: Optional description used by the user to search for
        the schema
    :param id: Id to uniquely identify this JsonSchema
    :param shouldValidate: If true it will validate the schema before
        adding. (Not implemented yet.)
    :param nestedFields: Lists arrays which should be considered as nested
        properties in es. Defaults to an empty list.
    :return: jsonSchemaInfoId: used to get the information about json schema
    :raises JsonSchemaAlreadyExistsException: a schema with ``id`` exists.
    :raises NotImplementedError: if ``shouldValidate`` is True.
    """
    # Avoid the mutable-default-argument pitfall.
    if nestedFields is None:
        nestedFields = []
    self.authorization.can_add_json_schema()
    jsonSchemaStr = json.dumps(jsonSchema)
    jsonSchemaHash = SchemaList.hash_json_schema(jsonSchemaStr)
    esProperties = self.convert_json_schema_to_es_properties(
        jsonSchema, nestedFields)
    esHash = self.add_es_schema(esProperties)
    entry = {}
    if name:
        entry["name"] = name
    if description:
        entry["description"] = description
    es = get_es_conn()
    if id:
        try:
            # Existence probe: creating over an existing id is an error.
            es.get(index=self.masterJsonSchemaIndex,
                   doc_type="default",
                   id=id)
            raise JsonSchemaAlreadyExistsException(
                "Schema with same id already exist: {0}".format(id))
        except exceptions.NotFoundError:
            pass
    else:
        id = gen_uuid()
    if shouldValidate:
        raise NotImplementedError()
    entry["jsonSchemaHash"] = jsonSchemaHash
    entry["esHash"] = esHash
    # Reuse the serialization computed for the hash above.
    entry["jsonSchema"] = jsonSchemaStr
    es.create(index=self.masterJsonSchemaIndex,
              doc_type="default",
              id=id,
              body=entry)
    return id