def delete(self, corpusId, bucketId):
    try:
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        corpus = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId)
        corpus.delete_bucket(bucketId)
        self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
    except BucketNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Bucket does not exist. Extra info: '{0}'".format(err)},
            HTTPStatus.NOT_FOUND)
    except CorpusNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Corpus does not exist. Extra info: '{0}'".format(err)},
            HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def get(self, corpusId, documentId):
    """Get a single document from a corpus."""
    try:
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        corpus = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId)
        document = corpus.get_text_document(documentId)
        if document is None:
            raise DocumentNotFoundException(documentId)
        self.write_and_set_status(document, HTTPStatus.OK)
    except CorpusNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                  HTTPStatus.NOT_FOUND)
    except DocumentNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified document not found"},
                                  HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def delete(self, corpusId, documentId):
    """Delete a single document and optionally its annotations."""
    try:
        delete_annotations_argument = self.get_query_argument(
            "deleteAnnotations", None)
        if not delete_annotations_argument:
            self.missing_required_field("deleteAnnotations")
            return
        delete_annotations = 'true' == delete_annotations_argument
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        corpus = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId)
        document = corpus.delete_document(documentId, delete_annotations)
        self.write_and_set_status(document, HTTPStatus.OK)
    except CorpusNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                  HTTPStatus.NOT_FOUND)
    except DocumentNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified document not found"},
                                  HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def get(self, corpusId): """Get documents from corpus according to pagination""" try: fromIndexArgument = self.get_query_argument("from") fromIndex = int(fromIndexArgument) if fromIndex < 0: self.write_and_set_status( {MESSAGE: "'from' must cannot be less than zero"}, HTTPStatus.UNPROCESSABLE_ENTITY) return sizeArgument = self.get_query_argument("size") size = int(sizeArgument) if size < 1: self.write_and_set_status( {MESSAGE: "'size' cannot be less than 1"}, HTTPStatus.UNPROCESSABLE_ENTITY) return size = min(size, MAX_DOCUMENT_SIZE) envId = get_env_id() authorization = get_autorisation(envId, None, None) corpus = get_master_document_corpus_list( envId, authorization).get_corpus(corpusId) filterTitle = self.get_query_argument("filterTitle", default=None) filterSource = self.get_query_argument("filterSource", default=None) filterJoin = self.get_query_argument("filterJoin", default=None) sortBy = self.get_query_argument("sortBy", default=None) sortOrder = self.get_query_argument("sortOrder", default=None) documents = corpus.get_text_documents(fromIndex, size, sortBy, sortOrder, filterTitle, filterSource, filterJoin) self.write_and_set_status({"documents": documents}, HTTPStatus.OK) except CorpusNotFoundException: self.write_and_set_status({MESSAGE: "Specified corpus not found"}, HTTPStatus.NOT_FOUND) except ValueError as ve: self.write_and_set_status( {MESSAGE: "Invalid 'from' or 'size' parameter"}, HTTPStatus.UNPROCESSABLE_ENTITY) except TransportError as te: trace = traceback.format_exc().splitlines() self.write_and_set_status( { MESSAGE: "ES TransportError", TRACE: trace }, te.status_code) except Exception as e: trace = traceback.format_exc().splitlines() self.write_and_set_status( { MESSAGE: "Internal server error", TRACE: trace }, HTTPStatus.INTERNAL_SERVER_ERROR)
def test_remove_all_remove_annotations(self):
    self.recreate_read_write_env()
    jsonSchema1 = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document_surface1d",
        "schemaType": "schema1",
        "type": "object",
        "required": ["_schemaType", "_corpusID", "_documentID", "offsets"],
        "properties": {
            "_schemaType": {"type": "string", "description": "Schema type",
                            "searchable": True, "searchModes": ["noop"],
                            "locked": True},
            "_documentID": {"type": "string",
                            "description": "Internal document GUID",
                            "searchable": True, "searchModes": ["basic"],
                            "locked": True},
            "_corpusID": {"type": "string",
                          "description": "Internal Corpus GUID",
                          "searchable": True, "searchModes": ["basic"],
                          "locked": True}
        }
    }
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).create_corpus("corpusx")
    corpus.add_text_document("Another doc with auto id", "do1", "english")
    bucket1 = corpus.create_bucket("bucket1", "bucket1")
    schemaId1 = get_schema_list(
        self.envId, self.authorization).add_json_schema_as_hash(
            jsonSchema1, False, {})
    time.sleep(1)
    bucket1.add_or_update_schema_to_bucket(
        schemaId1, "schema1", TargetType("document_surface1d"), {})
    time.sleep(1)
    anno1 = {
        "_schemaType": "schema1",
        "_documentID": "document1",
        "_corpusID": "corpusx"
    }
    bucket1.add_annotation(anno1, "schema1", "1")
    time.sleep(1)
    documentSearch = DocumentSearch(self.envId, self.authorization, None,
                                    "corpusx")
    documentSearch.delete_annotations_for_types("bucket1", ["schema1"])
    time.sleep(1)
    bucket1.add_annotation(anno1, "schema1", "1")
def get(self):
    try:
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        corpora = get_master_document_corpus_list(envId, authorization)
        corporaInfos = corpora.get_corpuses_list()
        self.write_and_set_status({"data": corporaInfos}, HTTPStatus.OK)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def set_up_corpus(self):
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).create_corpus(
            CORPUS_ID, languages=["en-US"])
    files = glob.iglob(os.path.join(JASS_TEST_DATA_PATH, "*.txt"))
    self.contentById = dict()
    for filePath in files:
        with open(filePath, 'r', encoding="utf8") as f:
            id = os.path.basename(filePath)
            contents = f.read()
            self.contentById[str(id) + ".txt"] = contents
            corpus.add_text_document(contents, filePath, "en-US", id)
    time.sleep(1)
def post(self):
    body = self.request.body.decode("utf-8")
    try:
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        json_args = json.loads(body)
        for requiredField in [CORPUS_LANGUAGES]:
            if requiredField not in json_args:
                self.write_and_set_status(
                    {MESSAGE: "Missing required parameters. {0}".format(
                        requiredField)},
                    HTTPStatus.UNPROCESSABLE_ENTITY)
                return
        languages = json_args.get(CORPUS_LANGUAGES, None)
        try:
            languageManager = get_language_manager()
            for language in languages:
                if not languageManager.has_es_analyser(language):
                    self.write_and_set_status(
                        {MESSAGE: "Invalid language: " + language},
                        HTTPStatus.UNPROCESSABLE_ENTITY)
                    return
        except Exception:
            self.write_and_set_status(
                {MESSAGE: "Invalid languages field: " + str(languages)},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        corpusId = json_args.get(CORPUS_ID, None)
        if corpusId and not valid_es_id(corpusId):
            self.write_and_set_status(
                {MESSAGE: "Invalid corpus id '{0}'. Corpus ids may only "
                          "contain lowercase alphanumeric characters, "
                          "'-' and '_'".format(corpusId)},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        corpora = get_master_document_corpus_list(envId, authorization)
        corpus = corpora.create_corpus(corpusId, languages)
        self.write_and_set_status({"id": corpus.id}, HTTPStatus.OK)
    except CorpusNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                  HTTPStatus.NOT_FOUND)
    except CorpusInvalidFieldException as ci:
        self.write_and_set_status(
            {MESSAGE: "Invalid field: {0}".format(ci)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except CorpusAlreadyExistsException:
        self.write_and_set_status(
            {MESSAGE: "Corpus with the same id already exists"},
            HTTPStatus.CONFLICT)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
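# Example (sketch): creating a corpus through the POST handler above with
# `requests`. The "/corpora" route is an assumption, as are the literal JSON
# field names "id" and "languages" (the values behind the CORPUS_ID and
# CORPUS_LANGUAGES constants are not shown in this file).
import requests

resp = requests.post("http://localhost:8888/corpora",
                     json={"id": "corpus1", "languages": ["en-US", "fr-xx"]})
if resp.status_code == 200:
    print("created corpus", resp.json()["id"])
elif resp.status_code == 409:
    print("a corpus with that id already exists")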
def delete(self, corpusId):
    try:
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        corpora = get_master_document_corpus_list(envId, authorization)
        corpora.delete_corpus(corpusId)
        self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
    except CorpusNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                  HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def test_create_bucket_latencies2(self):
    """
    In reference to https://www.crim.ca/jira/browse/PSC-558
    Testing a 10 second timeout to Elasticsearch.

    REQUIRES dev/compose_with_toxiproxy.yml to be running in order to be
    executed.
    :return:
    """
    from elasticsearch import ConnectionTimeout
    self.setup_unittest_environment()
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).create_corpus("corpus1")

    # ----- Latency less than timeout -----
    try:
        self.create_es_toxiproxy()
        self.toxiproxy_add_timeout("proxy_es", timeout=1)
        # setting the ES timeout to be greater than the latency delay
        self.set_es_to_use_toxic(9201, timeout=2)
        # leave some time to add toxics
        time.sleep(1)
        bucket1 = corpus.create_bucket("bucket1")
    except Exception as e:
        self.reset_es_settings()
        self.destroy_ex_toxiproxy()
        self.assertTrue(False, "Exception has occurred: " + str(e))
    self.reset_es_settings()
    self.destroy_ex_toxiproxy()

    # ----- Latency greater than timeout -----
    try:
        self.create_es_toxiproxy()
        self.toxiproxy_add_timeout("proxy_es", timeout=2)
        # setting the ES timeout to be smaller than the latency delay
        self.set_es_to_use_toxic(9201, timeout=1)
        # leave some time to add toxics
        time.sleep(1)
        self.assertRaises(ConnectionTimeout, corpus.create_bucket, "bucket2")
    except Exception as e:
        self.reset_es_settings()
        self.destroy_ex_toxiproxy()
        self.assertTrue(False, "Exception has occurred: " + str(e))
    self.reset_es_settings()
    self.destroy_ex_toxiproxy()
def test_bind_schema_with_string_array(self):
    schema = json.loads(JSON_SCHEMA_WITH_STRING_ARRAY)
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).create_corpus("corpus1")
    bucket1 = corpus.create_bucket("bucket1")
    schema_id = get_schema_list(
        self.envId, self.authorization).add_json_schema_as_hash(
            schema, False, nestedFields=["offsets"])
    time.sleep(1)
    bucket1.add_or_update_schema_to_bucket(schema_id, "schema1",
                                           TargetType("document"), {})
    time.sleep(1)
    res = bucket1.get_schemas_info(True)
    self.assertEqual(len(res["data"]), 1)
def upload_documents(self,
                     url: str = None,
                     zipFileName: str = None,
                     isSendPut=False,
                     isMultipart: bool = True,
                     multipartFieldName: str = "file"):
    """
    Uploads all documents of the current corpus.

    :param url: Url to which to upload the files
    :param zipFileName: Name of the zip file to create
    :return:
    """
    # creates a zip file
    logger = logging.getLogger(__name__)
    fileStorage = HttpPostFileStorage(url, zipFileName)
    fileStorage.create_zip_file()
    es = get_es_conn()
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).get_corpus(self.corpusId)
    search = Search(using=es, index=corpus.languages_indices())
    search = search.source(["text"])
    search = search.params(scroll=get_scan_scroll_duration(),
                           size=get_nb_documents_per_scan_scroll())
    start = time.time()
    count = 0
    logger.info("Adding documents to zip: {0}".format(self.corpusId))
    for result in search.scan():
        fileStorage.add_utf8_file(result.text[0],
                                  str(result.meta.id) + ".txt")
        count += 1
        if count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING == 0:
            end = time.time()
            logger.info(
                "Time to add documents {0} to {1} : {2} seconds".format(
                    count - NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
                    end - start))
            start = end
    end = time.time()
    logger.info("Time to add documents {0} to {1} : {2} seconds".format(
        count - count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
        end - start))
    fileStorage.flush(True, isSendPut, isMultipart, multipartFieldName)
def post(self, corpusId):
    try:
        body = json.loads(self.request.body.decode("utf-8"))
        language = body.get("language")
        if not language:
            self.write_and_set_status(
                {MESSAGE: "Missing required parameters"})
            self.set_status(HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        # Note: 'get' defaults to None when the key does not exist
        docId = body.get("id")
        text = body.get("text", "")
        title = body.get("title", "")
        source = body.get("source", "")
        corpus = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId)
        if language not in corpus.languages:
            self.write_and_set_status(
                {MESSAGE: "Document language does not correspond to the "
                          "corpus languages"},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        docId = corpus.add_text_document(text, title, language, docId,
                                         source)
        self.write_and_set_status({"id": docId}, HTTPStatus.OK)
    except DocumentAlreadyExistsException:
        self.write_and_set_status(
            {MESSAGE: "Document with the same id already exists"},
            HTTPStatus.CONFLICT)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
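# Example (sketch): adding a document through the handler above. The
# "/corpora/{corpusId}/documents" route is an assumption; the body fields
# ("language", "id", "text", "title", "source") come from the handler code.
import requests

resp = requests.post(
    "http://localhost:8888/corpora/corpus1/documents",
    json={"language": "en-US",
          "title": "Down the Rabbit-Hole",
          "source": "http://www.gutenberg.org/files/11/11-0.txt",
          "text": "Alice was beginning to get very tired..."})
print(resp.status_code, resp.json())  # 200 and {"id": "..."} on success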
def get(self, corpusId):
    try:
        includeSchemaJson = 'true' == self.get_query_argument(
            INCLUDE_SCHEMA_JSON, default=False)
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        buckets = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId).get_buckets()
        augmentedBuckets = [
            getBucketWithSchema(bucket, includeSchemaJson)
            for bucket in buckets
        ]
        self.write_and_set_status({"buckets": augmentedBuckets},
                                  HTTPStatus.OK)
    except CorpusNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                  HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def set_up_corpus(self):
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).create_corpus("corpus1")
    time.sleep(1)
    bucket1 = corpus.create_bucket("bucket1", "bucket1")
    setting = get_settings()
    self.schemaList = get_schema_list(self.envId, self.authorization)
    schemaNormalId = self.schemaList.add_json_schema_as_hash(SCHEMA_NORMAL)
    schemaOffsetsId = self.schemaList.add_json_schema_as_hash(
        SCHEMA_OFFSETS, False, nestedFields=["offsets"])
    time.sleep(1)
    bucket1.add_or_update_schema_to_bucket(
        schemaNormalId, "sentence", TargetType.document_surface1d, {})
    bucket1.add_or_update_schema_to_bucket(
        schemaOffsetsId, "token", TargetType.document_surface1d, {})
    time.sleep(1)
def post(self, corpusId):
    try:
        body = json.loads(self.request.body.decode("utf-8"))
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        bucketId = None
        bucketName = None
        if "id" in body:
            bucketId = body["id"]
        if "name" in body:
            bucketName = body["name"]
        if bucketId and not valid_es_id(bucketId):
            self.write_and_set_status(
                {MESSAGE: "Invalid bucket id '{0}'. Bucket ids may only "
                          "contain lowercase alphanumeric characters, "
                          "'-' and '_'".format(bucketId)},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        bucket = get_master_document_corpus_list(envId, authorization). \
            get_corpus(corpusId).create_bucket(bucketName, bucketId)
        self.write_and_set_status({"id": bucket.id}, HTTPStatus.OK)
    except BucketAlreadyExistsException:
        self.write_and_set_status(
            {MESSAGE: "Bucket with the same id already exists"},
            HTTPStatus.CONFLICT)
    except CorpusNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Corpus does not exist. Extra info: '{0}'".format(
                err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def get_documents_zip(self, zipFileName: str = None):
    """
    Creates a zip of all documents of the corpus and returns the path to it.

    :param zipFileName: Name of the created zip file. If not supplied it
        will be automatically generated. If it exists, the existing file
        will be replaced.
    :return: path to the created zip file
    """
    logger = logging.getLogger(__name__)
    self.tmpFileStorage = TmpFileStorage(zipFileName)
    self.tmpFileStorage.create_zip_file()
    es = get_es_conn()
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).get_corpus(self.corpusId)
    search = Search(using=es, index=corpus.languages_indices())
    search = search.source(["text"])
    search = search.params(scroll=get_scan_scroll_duration(),
                           size=get_nb_documents_per_scan_scroll())
    start = time.time()
    count = 0
    logger.info("Adding documents to zip: {0}".format(self.corpusId))
    for result in search.scan():
        self.tmpFileStorage.add_utf8_file(result.text,
                                          str(result.meta.id) + ".txt")
        count += 1
        if count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING == 0:
            end = time.time()
            logger.info(
                "Time to add documents {0} to {1} : {2} seconds".format(
                    count - NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
                    end - start))
            start = end
    end = time.time()
    logger.info("Time to add documents {0} to {1} : {2} seconds".format(
        count - count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
        end - start))
    self.tmpFileStorage.close()
    return self.tmpFileStorage.zipPath
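# Example (sketch): exporting a corpus with the method above.
# `CorpusDocumentsExport` is a hypothetical name for the class that defines
# get_documents_zip; envId, authorization and corpusId follow the same
# conventions as elsewhere in this file.
exporter = CorpusDocumentsExport(envId, authorization, corpusId="corpus1")
zip_path = exporter.get_documents_zip("corpus1_docs.zip")
print("documents archived at", zip_path)  # one <documentId>.txt per document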
def get(self, corpusId):
    try:
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        corpora = get_master_document_corpus_list(envId, authorization)
        corpus = corpora.get_corpus(corpusId)
        info = {
            CORPUS_ID: corpus.id,
            CORPUS_LANGUAGES: corpus.languages,
            CORPUS_MODIFICATION_DATE:
                datetime_to_json_str(corpus.modificationDate),
            CORPUS_DOCUMENT_COUNT: corpus.get_documents_count()
        }
        self.write_and_set_status(info, HTTPStatus.OK)
    except CorpusNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                  HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def delete(self, corpusId, bucketId, schemaType):
    try:
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        bucket = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId).get_bucket(bucketId)
        schemas = bucket.get_schemas_info(False)
        schemaTypes = [schema['schemaType'] for schema in schemas['data']]
        if schemaType not in schemaTypes:
            self.write_and_set_status(
                {MESSAGE: "Schema Type: {0} does not exist".format(
                    schemaType)},
                HTTPStatus.NOT_FOUND)
            return
        bucket.delete_schema_type(schemaType)
        self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
    except CorpusNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Corpus does not exist. Extra info: '{0}'".format(
                err)},
            HTTPStatus.NOT_FOUND)
    except BucketNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Bucket does not exist. Extra info: '{0}'".format(
                err)},
            HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def put(self, corpusId):
    try:
        body = self.request.body.decode("utf-8")
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        json_args = json.loads(body)
        # initialized up-front so the except clause below cannot hit an
        # unbound name
        languages = None
        try:
            languages = json_args.get(CORPUS_LANGUAGES, None)
            if languages:
                languageManager = get_language_manager()
                for language in languages:
                    if not languageManager.has_es_analyser(language):
                        self.write_and_set_status(
                            {MESSAGE: "Invalid language: " + language},
                            HTTPStatus.UNPROCESSABLE_ENTITY)
                        return
        except Exception:
            self.write_and_set_status(
                {MESSAGE: "Invalid languages field: " + str(languages)},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        corpora = get_master_document_corpus_list(envId, authorization)
        corpus = corpora.update_corpus(corpusId, languages)
        self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
    except CorpusInvalidFieldException as ci:
        self.write_and_set_status(
            {MESSAGE: "Invalid field: {0}".format(ci)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except CorpusNotFoundException:
        self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                  HTTPStatus.NOT_FOUND)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def test_count_annotations_for_type_basic(self):
    """
    Test the annotation count for a schemaType indexed as basic instead of
    noop.

    Note: it is not clear that schemaTypes whose main index mode differs
    from noop should be permitted at all.
    """
    global envIdReadOnly
    global authorizationReadOnly
    schema = json.loads(JSON_SCHEMA_WITH_SCHEMA_TYPE_BASIC)
    corpus = get_master_document_corpus_list(
        envIdReadOnly, authorizationReadOnly).create_corpus()
    bucket = corpus.create_bucket("bucket")
    schema_id = get_schema_list(
        envIdReadOnly, authorizationReadOnly).add_json_schema_as_hash(
            schema, False, nestedFields=["offsets"])
    time.sleep(1)
    schema_type = "CHUNK_ap"
    bucket.add_or_update_schema_to_bucket(
        schema_id, schema_type, TargetType("document_surface1d"), {})
    time.sleep(1)
    annotations = [{
        "_documentID": "98ff06a6-02dd-11e8-b82a-0242ac12001f",
        "_corpusID": "rqgbf20180126",
        "length": 14,
        "string": "contemporaines",
        "schemaType": "CHUNK_ap",
        "offsets": [{"end": 449, "begin": 435}],
    }, {
        "_documentID": "98ff06a6-02dd-11e8-b82a-0242ac12001f",
        "_corpusID": "rqgbf20180126",
        "length": 13,
        "string": "plus anciens,",
        "schemaType": "CHUNK_ap",
        "offsets": [{"end": 593, "begin": 580}],
    }, {
        "_documentID": "98ff06a6-02dd-11e8-b82a-0242ac12001f",
        "_corpusID": "rqgbf20180126",
        "length": 9,
        "string": "coloniale",
        "schemaType": "CHUNK_ap",
        "offsets": [{"end": 693, "begin": 684}],
    }]
    for annotation in annotations:
        bucket.add_annotation(annotation, schema_type)
    time.sleep(1)
    ds = DocumentSearch(envIdReadOnly, authorizationReadOnly, "doc1",
                        corpus.id)
    counts = ds.count_annotations_for_types(bucket.id, [schema_type])
    self.assertEqual(counts[schema_type], len(annotations))
def populateData(cls):
    global envIdReadOnly
    global authorizationReadOnly
    # Copied from the corpus tests
    corpus = get_master_document_corpus_list(
        envIdReadOnly, authorizationReadOnly).create_corpus(
            CORPUS_ID, languages=["fr-xx", "en-xx"])
    bucket1 = corpus.create_bucket("bucket1", "bucket1")
    bucket2 = corpus.create_bucket("bucket2", "bucket2")

    sentencesS = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document",
        "schemaType": "sentence",
        "type": "object",
        "required": ["schemaType", "_corpusID", "_documentID", "sentence"],
        "properties": {
            "schemaType": {"type": "string", "description": "Schema type",
                           "searchable": True, "searchModes": ["noop"],
                           "locked": True},
            "_documentID": {"type": "string",
                            "description": "Internal document GUID",
                            "searchable": True, "searchModes": ["noop"],
                            "locked": True},
            "_corpusID": {"type": "string",
                          "description": "Internal Corpus GUID",
                          "searchable": True, "searchModes": ["noop"],
                          "locked": True},
            "sentence": {"type": "string",
                         "description": "Sentence in a document",
                         "searchable": True, "searchModes": ["basic"],
                         "locked": True}
        }
    }
    tokenS = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document_surface1d",
        "schemaType": "token",
        "type": "object",
        "required": ["schemaType", "_corpusID", "_documentID", "sentence"],
        "properties": {
            "schemaType": {"type": "string", "description": "Schema type",
                           "searchable": True, "searchModes": ["noop"],
                           "locked": True},
            "_documentID": {"type": "string",
                            "description": "Internal document GUID",
                            "searchable": True, "searchModes": ["noop"],
                            "locked": True},
            "_corpusID": {"type": "string",
                          "description": "Internal Corpus GUID",
                          "searchable": True, "searchModes": ["noop"],
                          "locked": True},
            "word": {"type": "string", "description": "Word in a document",
                     "searchable": True, "searchModes": ["basic"],
                     "locked": True},
            "length": {"type": "integer",
                       "description": "Length of a word",
                       "searchable": True, "searchModes": ["noop"],
                       "locked": True},
            "category": {"type": "string",
                         "description": "category of the word",
                         "searchable": True, "searchModes": ["basic"],
                         "locked": True},
            "offsets": {
                "searchable": True, "locked": True, "type": "array",
                "minItems": 1,
                "items": {
                    "type": "object",
                    "properties": {
                        "begin": {"type": "integer", "minimum": 0},
                        "end": {"type": "integer", "minimum": 0}
                    }
                }
            }
        }
    }
    tokenwithlemmaS = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document_surface1d",
        "schemaType": "tokenwithlemma",
        "type": "object",
        "required": ["schemaType", "_corpusID", "_documentID", "sentence"],
        "properties": {
            "schemaType": {"type": "string", "description": "Schema type",
                           "searchable": True, "searchModes": ["basic"],
                           "locked": True},
            "_documentID": {"type": "string",
                            "description": "Internal document GUID",
                            "searchable": True, "searchModes": ["noop"],
                            "locked": True},
            "_corpusID": {"type": "string",
                          "description": "Internal Corpus GUID",
                          "searchable": True, "searchModes": ["noop"],
                          "locked": True},
            "word": {"type": "string", "description": "Word in a document",
                     "searchable": True, "searchModes": ["basic"],
                     "locked": True},
            "length": {"type": "integer",
                       "description": "Length of a word",
                       "searchable": True, "searchModes": ["noop"],
                       "locked": True},
            "lemma": {"type": "string", "description": "Lemma of a word",
                      "searchable": True, "searchModes": ["basic"],
                      "locked": True},
            "category": {"type": "string",
                         "description": "category of the word",
                         "searchable": True, "searchModes": ["basic"],
                         "locked": True},
            "offsets": {
                "searchable": True, "locked": True, "type": "array",
                "minItems": 1,
                "items": {
                    "type": "object",
                    "properties": {
                        "begin": {"type": "integer", "minimum": 0},
                        "end": {"type": "integer", "minimum": 0}
                    }
                }
            }
        }
    }

    schemaList = get_schema_list(envIdReadOnly, authorizationReadOnly)
    sentenceSID = schemaList.add_json_schema(jsonSchema=sentencesS)
    tokenSID = schemaList.add_json_schema(jsonSchema=tokenS,
                                          nestedFields="offsets")
    tokenwithlemmaSID = schemaList.add_json_schema(
        jsonSchema=tokenwithlemmaS, nestedFields="offsets")
    bucket1.add_or_update_schema_to_bucket(
        sentenceSID, sentencesS["schemaType"],
        TargetType(sentencesS["targetType"]), {})
    bucket1.add_or_update_schema_to_bucket(
        tokenSID, tokenS["schemaType"], TargetType(tokenS["targetType"]),
        {})
    bucket1.add_or_update_schema_to_bucket(
        tokenwithlemmaSID, tokenwithlemmaS["schemaType"],
        TargetType(tokenwithlemmaS["targetType"]), {})
    bucket2.add_or_update_schema_to_bucket(
        tokenSID, tokenS["schemaType"], TargetType(tokenS["targetType"]),
        {})
    bucket2.add_or_update_schema_to_bucket(
        tokenwithlemmaSID, tokenwithlemmaS["schemaType"],
        TargetType(tokenwithlemmaS["targetType"]), {})
    time.sleep(1)

    # sentences -- bucket1
    bucket1.add_annotation(
        {"_documentID": "doc1", "_corpusID": CORPUS_ID,
         "schemaType": "sentence",
         "sentence": "Les algorithmes de colonies de fourmis sont des "
                     "algorithmes inspirés du comportement des fourmis."},
        "sentence")
    bucket1.add_annotation(
        {"_documentID": ALICE_FR_DOC_ID, "_corpusID": CORPUS_ID,
         "schemaType": "sentence",
         "sentence": "Le café liégeois doit son appellation à la résistance "
                     "de l’armée belge lors de la bataille des forts de "
                     "Liège d’août 1914."},
        "sentence")

    # token annotations, written data-driven to keep the fixture readable:
    # (bucket, documentID, word, offsets, length, category)
    tokenData = [
        (bucket1, "doc1", "Les", [(0, 3)], 3, "DET:ART"),
        (bucket1, "doc1", "algorithmes", [(4, 15)], 11, "NOM"),
        (bucket1, "doc1", "de", [(28, 30), (16, 18)], 2, "PRP"),
        (bucket1, "doc1", "colonies", [(19, 27)], 8, "NOM"),
        (bucket1, "doc1", "fourmis", [(31, 38)], 7, "NOM"),
        (bucket1, "doc1", "sont", [(39, 43)], 4, "VER:pres"),
        # some doc2 annotations
        (bucket1, "doc2", "des", [(44, 47)], 3, "PRP:det"),
        (bucket1, "doc2", "algorithmes", [(48, 59)], 11, "NOM"),
        (bucket2, "doc1", "algorithmes", [(48, 59)], 11, "NOM"),
        (bucket2, "doc1", "inspirés", [(60, 68)], 8, "VER:pper"),
        (bucket2, "doc1", "du", [(69, 71)], 2, "PRP:det"),
        (bucket2, "doc1", "comportement", [(72, 84)], 12, "NOM"),
        (bucket2, "doc1", "des", [(85, 88)], 3, "PRP:det"),
        (bucket2, "doc1", "fourmis", [(89, 96)], 7, "NOM"),
        (bucket2, "doc1", ".", [(96, 97)], 1, "SENT"),
    ]
    for bucket, docId, word, offsets, length, category in tokenData:
        bucket.add_annotation(
            {"_documentID": docId, "_corpusID": CORPUS_ID,
             "schemaType": "token", "word": word,
             "offsets": [{"begin": b, "end": e} for b, e in offsets],
             "length": length, "category": category},
            "token")

    # tokenwithlemma annotations:
    # (bucket, documentID, word, offsets, length, lemma, category)
    tokenWithLemmaData = [
        (bucket1, "doc1", "Le", [(98, 100)], 2, "le", "DET:ART"),
        (bucket1, "doc1", "café", [(101, 105)], 4, "café", "NOM"),
        (bucket1, "doc1", "liégeois", [(106, 114)], 8, "liégeois", "ADJ"),
        (bucket1, "doc1", "doit", [(115, 119)], 4, "devoir", "VER:pres"),
        (bucket1, "doc1", "son", [(120, 123)], 3, "son", "DET:POS"),
        (bucket1, "doc1", "appellation", [(124, 135)], 11, "appellation",
         "NOM"),
        (bucket1, "doc1", "à", [(136, 137)], 1, "à", "PRP"),
        (bucket1, "doc1", "la", [(138, 140)], 2, "le", "DET:ART"),
        (bucket1, "doc1", "résistance", [(141, 151)], 10, "résistance",
         "NOM"),
        (bucket1, "doc1", "de", [(152, 154)], 2, "de", "PRP"),
        (bucket1, "doc1", "l", [(155, 156)], 1, None, "NOM"),
        (bucket1, "doc1", "armée", [(157, 162)], 5, "armer", "VER:pper"),
        (bucket1, "doc1", "belge", [(163, 168)], 5, "belge", "ADJ"),
        (bucket1, "doc1", "lors", [(169, 173)], 4, "lors", "ADV"),
        (bucket1, "doc1", "de", [(174, 176)], 2, "de", "PRP"),
        (bucket1, "doc1", "la", [(177, 179)], 2, "le", "DET:ART"),
        (bucket1, "doc1", "bataille", [(180, 188)], 8, "bataille", "NOM"),
        (bucket2, "doc1", "armée", [(157, 162)], 5, "armer", "VER:pper"),
        (bucket2, "doc1", "belge", [(163, 168)], 5, "belge", "ADJ"),
        (bucket2, "doc1", "lors", [(169, 173)], 4, "lors", "ADV"),
        (bucket2, "doc1", "de", [(174, 176)], 2, "de", "PRP"),
        (bucket2, "doc1", "la", [(177, 179)], 2, "le", "DET:ART"),
        (bucket2, "doc1", "bataille", [(180, 188)], 8, "bataille", "NOM"),
        (bucket2, "doc1", "des", [(189, 192)], 3, "du", "PRP:det"),
        (bucket2, "doc1", "forts", [(193, 198)], 5, "fort", "NOM"),
        (bucket2, "doc1", "de", [(199, 201)], 2, "de", "PRP"),
        (bucket2, "doc1", "Liège", [(202, 207)], 5, "Liège", "NAM"),
        (bucket2, "doc1", "d", [(208, 209)], 1, None, "VER:futu"),
        (bucket2, "doc1", "août", [(210, 214)], 4, "août", "NOM"),
        (bucket2, "doc1", "1914", [(215, 219)], 4, "@card@", "NUM"),
        (bucket2, "doc1", ".", [(219, 220)], 1, ".", "SENT"),
    ]
    for bucket, docId, word, offsets, length, lemma, category in \
            tokenWithLemmaData:
        bucket.add_annotation(
            {"_documentID": docId, "_corpusID": CORPUS_ID,
             "schemaType": "tokenwithlemma", "word": word,
             "offsets": [{"begin": b, "end": e} for b, e in offsets],
             "length": length, "lemma": lemma, "category": category},
            "tokenwithlemma")
    time.sleep(1)

    corpus.add_text_document(
        id=ALICE_FR_DOC_ID,
        language="fr-xx",
        title="AU FOND DU TERRIER",
        source="https://www.gutenberg.org/files/55456/55456-0.txt",
        text="ALICE, assise auprès de sa sœur sur le gazon, commençait à "
             "s'ennuyer de rester là à ne rien faire; une ou deux fois elle "
             "avait jeté les yeux sur le livre que lisait sa sœur; mais "
             "quoi! pas d'images, pas de dialogues! \"La belle avance,\" "
             "pensait Alice, \"qu'un livre sans images, sans causeries!\".")
    corpus.add_text_document(
        id=ALICE_EN_DOC_ID,
        language="en-xx",
        title="Down the Rabbit-Hole",
        source="http://www.gutenberg.org/files/11/11-0.txt",
        text="Alice was beginning to get very tired of sitting by her "
             "sister on the bank, and of having nothing to do: once or "
             "twice she had peeped into the book her sister was reading, "
             "but it had no pictures or conversations in it, ‘and what is "
             "the use of a book,’ thought Alice ‘without pictures or "
             "conversations?’")
    time.sleep(1)
def test_get_schemas_info(self):
    jsonSchema1 = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document_surface1d",
        "schemaType": "schema1",
        "type": "object",
        "required": ["_schemaType", "_corpusID", "_documentID", "offsets"],
        "properties": {
            "_schemaType": {"type": "string", "description": "Schema type",
                            "searchable": True, "searchModes": ["noop"],
                            "locked": True},
            "_documentID": {"type": "string",
                            "description": "Internal document GUID",
                            "searchable": True, "searchModes": ["basic"],
                            "locked": True},
            "_corpusID": {"type": "string",
                          "description": "Internal Corpus GUID",
                          "searchable": True, "searchModes": ["basic"],
                          "locked": True}
        }
    }
    jsonSchema2 = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document_surface1d",
        "schemaType": "schema2",
        "type": "object",
        "required": ["_schemaType"],
        "properties": {
            "_schemaType": {"type": "string", "description": "Schema type",
                            "searchable": True, "searchModes": ["noop"],
                            "locked": True}
        }
    }
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).create_corpus("corpus1")
    bucket1 = corpus.create_bucket("bucket1")
    schemaId1 = get_schema_list(
        self.envId, self.authorization).add_json_schema_as_hash(
            jsonSchema1, False, {})
    schemaId2 = get_schema_list(
        self.envId, self.authorization).add_json_schema_as_hash(
            jsonSchema2, False, {})
    time.sleep(1)
    bucket1.add_or_update_schema_to_bucket(schemaId1, "schema1",
                                           TargetType("document"), {})
    bucket1.add_or_update_schema_to_bucket(schemaId2, "schema2",
                                           TargetType("document"), {})
    time.sleep(1)
    res = bucket1.get_schemas_info(True)
    self.assertEqual(len(res["data"]), 2)
def test_delete_schema_type(self):
    jsonSchema1 = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document_surface1d",
        "schemaType": "schema1",
        "type": "object",
        "properties": {
            "name": {"type": "string",
                     "description": "Internal document GUID",
                     "searchable": True, "searchModes": ["basic"],
                     "locked": True}
        }
    }
    jsonSchema2 = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "targetType": "document_surface1d",
        "schemaType": "schema2",
        "type": "object",
        "properties": {
            "city": {"type": "string",
                     "description": "Internal document GUID",
                     "searchable": True, "searchModes": ["basic"],
                     "locked": True}
        }
    }
    corpus = get_master_document_corpus_list(
        self.envId, self.authorization).create_corpus("corpus1")
    bucket1 = corpus.create_bucket("bucket1")
    schemaId1 = get_schema_list(
        self.envId, self.authorization).add_json_schema_as_hash(
            jsonSchema1, False, {})
    schemaId2 = get_schema_list(
        self.envId, self.authorization).add_json_schema_as_hash(
            jsonSchema2, False, {})
    time.sleep(1)
    bucket1.add_or_update_schema_to_bucket(schemaId1, "schema1",
                                           TargetType("document"), {})
    bucket1.add_or_update_schema_to_bucket(schemaId2, "schema2",
                                           TargetType("document"), {})
    bucket1.add_annotation({"name": "Anton"}, "schema1", "1")
    bucket1.add_annotation({"name": "JF"}, "schema1", "2")
    bucket1.add_annotation({"city": "Montreal"}, "schema2", "1")
    bucket1.add_annotation({"city": "Quebec"}, "schema2", "2")
    time.sleep(1)
    anno1 = bucket1.get_annotation("1", "schema1")
    bucket1.delete_schema_type("schema1")
    time.sleep(1)
    info = bucket1.get_schemas_info()
    self.assertEqual(len(info["data"]), 1)
    # making sure the annotations remain in schema2
    anno1 = bucket1.get_annotation("1", "schema2")
    self.assertEqual(anno1["city"], "Montreal")
    anno2 = bucket1.get_annotation("2", "schema2")
    self.assertEqual(anno2["city"], "Quebec")
    # create the same schema type but with different data
    bucket1.add_or_update_schema_to_bucket(schemaId1, "schema1",
                                           TargetType("document"), {})
    bucket1.add_annotation({"name": "Yolo"}, "schema1", "3")
    bucket1.add_annotation({"name": "Rage"}, "schema1", "4")
    time.sleep(1)
    # make sure the old annotations do not exist
    anno1 = bucket1.get_annotation("3", "schema1")
    self.assertEqual(anno1["name"], "Yolo")
    anno2 = bucket1.get_annotation("4", "schema1")
    self.assertEqual(anno2["name"], "Rage")
    self.assertRaises(DocumentNotFoundException, bucket1.get_annotation,
                      "1", "schema1")
def put(self, corpusId, bucketId):
    try:
        body = self.strip_body_bom()
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        if is_missing_required_fields(
                body, ["targetType", "schemaType", "properties"]):
            self.write_and_set_status(
                {MESSAGE: missing_fields_message(
                    body, ["targetType", "schemaType", "properties"])},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        schemaType = body["schemaType"]
        targetTypeName = body["targetType"]
        if not TargetType.has(targetTypeName):
            self.write_and_set_status(
                {MESSAGE: "Target type {0} not supported".format(
                    targetTypeName)},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        # Is there currently a schema of schemaType associated with the
        # bucket?
        bucket = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId).get_bucket(bucketId)
        schemas = bucket.get_schemas_info(False)
        schemaTypes = [schema['schemaType'] for schema in schemas['data']]
        if schemaType not in schemaTypes:
            self.write_and_set_status(
                {MESSAGE: "There is no schema with the schemaType '{0}' "
                          "currently bound to the bucket.".format(
                              schemaType)},
                HTTPStatus.NOT_FOUND)
            return
        # check whether a schema with the same hash as the current one
        # already exists
        targetType = TargetType(targetTypeName)
        nestedFields = []
        if targetType == TargetType.document_surface1d:
            nestedFields.append("offsets")
        schemaId = get_schema_list(
            envId, authorization).add_json_schema_as_hash(
                body, False, nestedFields)
        bucket.add_or_update_schema_to_bucket(schemaId, schemaType,
                                              targetType, {})
        self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
    except EsSchemaMigrationInvalidException as err:
        self.write_and_set_status(
            {MESSAGE: "Cannot update schema because the changes are not "
                      "compatible with documents in the old schema. "
                      "Extra info: '{0}'".format(err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except EsSchemaMigrationDeleteFieldsNotSupportedException as err:
        self.write_and_set_status(
            {MESSAGE: "Cannot delete fields from an existing schema. "
                      "Missing fields: '{0}'".format(err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except CorpusNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Corpus does not exist. Extra info: '{0}'".format(
                err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except BucketNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Bucket does not exist. Extra info: '{0}'".format(
                err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
def post(self, corpusId, bucketId):
    try:
        body = self.strip_body_bom()
        envId = get_env_id()
        authorization = get_autorisation(envId, None, None)
        if is_missing_required_fields(
                body, ["targetType", "schemaType", "properties"]):
            self.write_and_set_status(
                {MESSAGE: missing_fields_message(
                    body, ["targetType", "schemaType", "properties"])},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        schemaType = body["schemaType"]
        targetTypeName = body["targetType"]
        if not TargetType.has(targetTypeName):
            self.write_and_set_status(
                {MESSAGE: "Target type {0} not supported".format(
                    targetTypeName)},
                HTTPStatus.UNPROCESSABLE_ENTITY)
            return
        bucket = get_master_document_corpus_list(
            envId, authorization).get_corpus(corpusId).get_bucket(bucketId)
        schemas = bucket.get_schemas_info(False)
        schemaTypes = [schema['schemaType'] for schema in schemas['data']]
        if schemaType in schemaTypes:
            self.write_and_set_status(
                {MESSAGE: "A schema with the schemaType '{0}' is already "
                          "bound to the bucket.".format(schemaType)},
                HTTPStatus.FORBIDDEN)
            return
        # check whether a schema with the same hash as the current one
        # already exists
        targetType = TargetType(targetTypeName)
        nestedFields = []
        if targetType == TargetType.document_surface1d:
            nestedFields.append("offsets")
        schemaId = get_schema_list(
            envId, authorization).add_json_schema_as_hash(
                body, False, nestedFields)
        bucket.add_or_update_schema_to_bucket(schemaId, schemaType,
                                              targetType, {})
        self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
    except CorpusNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Corpus does not exist. Extra info: '{0}'".format(
                err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except BucketNotFoundException as err:
        self.write_and_set_status(
            {MESSAGE: "Bucket does not exist. Extra info: '{0}'".format(
                err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except SchemaBindingInvalid as err:
        self.write_and_set_status(
            {MESSAGE: "Schema Binding Invalid: '{0}'".format(err)},
            HTTPStatus.UNPROCESSABLE_ENTITY)
    except Exception:
        trace = traceback.format_exc().splitlines()
        self.write_and_set_status(
            {MESSAGE: "Internal server error", TRACE: trace},
            HTTPStatus.INTERNAL_SERVER_ERROR)
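# Example (sketch): binding a new schema to a bucket through the POST handler
# above. The "/corpora/{corpusId}/buckets/{bucketId}/schema" route is an
# assumption; the body must carry "targetType", "schemaType" and
# "properties", as checked by the handler.
import requests

schema_body = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "targetType": "document_surface1d",
    "schemaType": "token",
    "type": "object",
    "properties": {
        "word": {"type": "string", "searchable": True,
                 "searchModes": ["basic"], "locked": True}
    }
}
resp = requests.post(
    "http://localhost:8888/corpora/corpus1/buckets/bucket1/schema",
    json=schema_body)
print(resp.status_code)  # 204 on success, 403 if schemaType already bound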