def on_post(self, req, resp, dataset_id, dataset_dto, triples_list):
    """Receives an HTTP request to add triples to the dataset

    This will expect an input on the body similar to this

    .. sourcecode:: json

        [
            {"subject": "Q1492", "predicate": "P17", "object": "Q29"},
            {"subject": "Q90", "predicate": "P17", "object": "Q142"},
            {"subject": "Q2807", "predicate": "P17", "object": "Q29"}
        ]

    :param integer dataset_id: Unique ID of dataset
    :param DatasetDTO dataset_dto: Dataset DTO (from hook)
    :param list triples_list: List of triples to insert (from hook)
    :returns: If operation was successful.
    :rtype: 202 ACCEPTED
    """
    dataset_dao = data_access.DatasetDAO()

    res, err = dataset_dao.insert_triples(dataset_dto, triples_list)
    if res is None:
        raise falcon.HTTPBadRequest(description=str(err))

    textbody = {"status": 202, "message": "Resources created successfully"}
    resp.body = json.dumps(textbody)
    resp.content_type = 'application/json'
    resp.status = falcon.HTTP_202
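# Illustrative client call for the handler above (a sketch, not part of the
# service). The body shape comes from the docstring; the route
# /datasets/{dataset_id}/triples and the localhost:8000 host are assumptions.
#
#   import json
#   import requests
#
#   triples = [{"subject": "Q1492", "predicate": "P17", "object": "Q29"}]
#   r = requests.post("http://localhost:8000/datasets/1/triples",
#                     data=json.dumps(triples),
#                     headers={"Content-Type": "application/json"})
#   print(r.status_code)  # expected 202 on success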
def on_post(self, req, resp, dataset_info, **kwargs):
    """Create a new dataset on the service

    This method will create a new empty dataset and return a 201 CREATED
    response with the Location header filled with the URI of the new
    dataset.

    :param HTTPUserDatasetDTO dataset_info: HTTP Client dataset information
    :query int dataset_type: The dataset type (optional)
    :returns: Location header with new path to dataset object
    """
    dao = data_access.DatasetDAO()

    # Get dataset type
    dts_type = req.get_param_as_int("dataset_type")
    dataset_type = dao.get_dataset_types()[dts_type]["class"]

    id_dts, err = dao.insert_empty_dataset(
        dataset_type, name=dataset_info.name,
        description=dataset_info.description)

    if id_dts is None and err[0] == 409:
        raise falcon.HTTPConflict(
            title="The dataset name is already used", description=err[1])
    elif id_dts is None and err[0] == 500:
        raise falcon.HTTPInternalServerError(description=err[1])
    else:
        # Dataset created, everything is done
        resp.status = falcon.HTTP_201
        resp.body = json.dumps({"dataset": {"id": id_dts}})
        resp.location = "/datasets/" + str(id_dts)
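# Illustrative client call for the dataset-creation handler above (a sketch).
# The /datasets route matches the Location header built by the handler, but
# the host and the exact JSON body accepted by the dataset_info hook
# (name/description fields) are assumptions.
#
#   import requests
#
#   r = requests.post("http://localhost:8000/datasets?dataset_type=0",
#                     json={"name": "my_dataset", "description": "demo"})
#   print(r.status_code, r.headers.get("Location"))  # 201 and /datasets/<id>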
def on_put(self, req, resp, dataset_id, **kwargs):
    """Change the description of the chosen dataset

    :param HTTPUserDatasetDTO dataset_info: Object with description param
    :param integer dataset_id: Unique ID of dataset
    :returns: The dataset with changed values
    :rtype: DatasetDTO
    """
    dataset_info = HTTPUserDatasetDTO()
    try:
        dataset_info.load(kwargs["dataset_info"])
    except KeyError:
        pass

    dataset_dao = data_access.DatasetDAO()
    if dataset_info.description is not None:
        res, err = dataset_dao.set_description(
            dataset_id, dataset_info.description)
        if res is None:
            raise falcon.HTTPInternalServerError(
                title="Server Error",
                description="Unable to process description param")

    resource, err = dataset_dao.get_dataset_by_id(dataset_id)
    response = {
        "dataset": resource.to_dict(),
    }
    resp.body = json.dumps(response)
    resp.content_type = 'application/json'
    resp.status = falcon.HTTP_200
def _get_dataset_status(dataset_id):
    """Returns the dataset status and its DTO

    :rtype: tuple (int, DatasetDTO)
    """
    d_dao = data_access.DatasetDAO()
    d_dto, err = d_dao.get_dataset_by_id(dataset_id, use_cache=True)
    return int(d_dto.status), d_dto
def delete_dataset_by_id(dataset_id):
    """Deletes a dataset from the database and removes its binary files"""
    dataset_dao = data_access.DatasetDAO()
    list_bin_files, err = dataset_dao.delete_dataset(dataset_id)
    for bin_file in list_bin_files:
        print(bin_file)
        # Remove the binary file; fall back to rmdir if it is a directory
        try:
            os.remove(bin_file)
        except IsADirectoryError:
            os.rmdir(bin_file)
def check_dataset_exsistence(req, resp, resource, params):
    """Falcon hook that checks whether the requested dataset exists.

    Stores the dataset DTO in params["dataset_dto"], or raises a 404 if
    the dataset cannot be found.

    :returns: A dataset DTO
    """
    dataset_dao = data_access.DatasetDAO()
    cache = req.get_param_as_bool("use_cache", blank_as_true=True)
    if cache is None:
        cache = True
    params["dataset_dto"], err = dataset_dao.get_dataset_by_id(
        params['dataset_id'], use_cache=cache)
    if params["dataset_dto"] is None:
        raise falcon.HTTPNotFound(
            title="Dataset {} not found".format(params['dataset_id']),
            description="The dataset does not exist. " + str(err))
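# A minimal sketch of how this hook is typically attached with Falcon's
# before-hook decorator. The resource method below is hypothetical; only the
# hook itself comes from this module.
#
#   @falcon.before(check_dataset_exsistence)
#   def on_get(self, req, resp, dataset_id, dataset_dto):
#       # dataset_dto was injected into the responder params by the hook
#       resp.body = json.dumps({"dataset": dataset_dto.to_dict()})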
def on_post(self, req, resp, dataset_id, dataset_dto, gen_triples_param):
    """Generates a task to insert triples on dataset. Asynchronous request.

    Reads the parameters from the body, such as the SPARQL query:

    {"generate_triples": {
        "graph_pattern": "<SPARQL Query (WHERE part)>",
        "levels": 2,
        "batch_size": 30000  # Optional
        }
    }

    :param id dataset_id: The dataset to insert triples into
    :param DTO dataset_dto: The Dataset DTO from dataset_id (from hook)
    :param dict gen_triples_param: Params to call generate_triples function
                                   (from hook)
    """
    try:
        batch_size = gen_triples_param.pop("batch_size")
    except KeyError:
        batch_size = None

    # Launch async task
    task = async_tasks.generate_dataset_from_sparql.delay(
        dataset_id, gen_triples_param.pop("graph_pattern"),
        int(gen_triples_param.pop("levels")), batch_size=batch_size)

    # Create a new task
    task_dao = data_access.TaskDAO()
    task_obj, err = task_dao.add_task_by_uuid(task.id)
    if task_obj is None:
        raise falcon.HTTPNotFound(description=str(err))
    task_obj["next"] = "/datasets/" + dataset_id
    task_dao.update_task(task_obj)

    # Store the task into DatasetDTO
    dataset_dao = data_access.DatasetDAO()
    dataset_dao.set_task(dataset_id, task_obj['id'])

    msg = "Task {} created successfully".format(task_obj['id'])
    textbody = {"status": 202, "message": msg, "task": task_dao.task}
    resp.location = "/tasks/" + str(task_obj['id'])
    resp.body = json.dumps(textbody)
    resp.content_type = 'application/json'
    resp.status = falcon.HTTP_202
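# Illustrative client call for the handler above (a sketch). The body
# structure comes from the docstring; the /datasets/{dataset_id}/generate_triples
# route, the host, and the example graph pattern are assumptions.
#
#   import requests
#
#   body = {"generate_triples": {"graph_pattern": "?subject ?predicate ?object",
#                                "levels": 2}}
#   r = requests.post("http://localhost:8000/datasets/1/generate_triples",
#                     json=body)
#   print(r.status_code, r.headers.get("Location"))  # 202 and /tasks/<id>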
def on_post(self, req, resp, dataset_id, dataset_dto):
    """Generates a model that fits the dataset and trains it

    It usually takes several hours for big datasets.

    This changes the dataset status from 0 to 1 once finished. While
    training takes place, the status will be set to a negative value.

    :param id dataset_id: The dataset to insert triples into
    :param DTO dataset_dto: The Dataset DTO from dataset_id (from hook)
    :query int algorithm_id: The algorithm used to train the dataset
    """
    # Dig for the algorithm_id param on Query Params
    algorithm_id = req.get_param('algorithm_id', required=True)

    # Obtain the algorithm
    algorithm_dao = data_access.AlgorithmDAO()
    algorithm, err = algorithm_dao.get_algorithm_by_id(algorithm_id)
    if algorithm is None:
        raise falcon.HTTPNotFound(description=str(err))

    # Launch async task
    task = async_tasks.train_dataset_from_algorithm.delay(
        dataset_id, algorithm)

    # Create the new task
    task_dao = data_access.TaskDAO()
    task_obj, err = task_dao.add_task_by_uuid(task.id)
    if task_obj is None:
        raise falcon.HTTPNotFound(description=str(err))
    task_obj["next"] = "/datasets/" + dataset_id
    task_dao.update_task(task_obj)

    # Store the task into DatasetDTO
    dataset_dao = data_access.DatasetDAO()
    dataset_dao.set_task(dataset_id, task_obj['id'])

    msg = "Task {} created successfully".format(task_obj['id'])
    textbody = {"status": 202, "message": msg}
    resp.location = "/tasks/" + str(task_obj['id'])
    resp.body = json.dumps(textbody)
    resp.content_type = 'application/json'
    resp.status = falcon.HTTP_202
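# Illustrative client call for the training handler above (a sketch). The
# algorithm_id query param comes from the docstring; the /datasets/{dataset_id}/train
# route, the host, and the algorithm id value are assumptions.
#
#   import requests
#
#   r = requests.post("http://localhost:8000/datasets/1/train",
#                     params={"algorithm_id": 2})
#   print(r.status_code, r.headers.get("Location"))  # 202 and /tasks/<id>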
def on_get(self, req, resp):
    """Return all datasets available on the service

    :query boolean use_cache: False if cache must be reloaded, True if
                              values returned can be those cached.
    :returns: A list with all datasets
    """
    cache = req.get_param_as_bool("use_cache", blank_as_true=True)
    dao = data_access.DatasetDAO()

    listdts, err = dao.get_all_datasets(use_cache=cache)

    if listdts is None:
        raise falcon.HTTPNotFound(description=str(err))

    response = [{"dataset": dtst.to_dict()} for dtst in listdts]
    resp.body = json.dumps(response)
    resp.content_type = 'application/json'
    resp.status = falcon.HTTP_200
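# Illustrative client call for the listing handler above (a sketch). The
# /datasets route and the host are assumptions, as is the presence of an
# "id" field in each serialized dataset.
#
#   import requests
#
#   datasets = requests.get("http://localhost:8000/datasets",
#                           params={"use_cache": "true"}).json()
#   for item in datasets:
#       print(item["dataset"].get("id"))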
def find_embeddings_on_model(dataset_id, entities):
    """Returns a list with the corresponding embeddings

    This will return a list like this:
    [["Q1", [0, 1, -1, 0.4]], ["Q5", [1, -0.5, -0.1, 0]]]

    :param int dataset_id: The dataset identifier on database
    :param list entities: A list with the URI (or identifiers) of entities
    :returns: The embedding vector of each entity
    :rtype: list
    """
    # Expected to return: {entities: [], embeddings: []} IN THE SAME ORDER!!
    dataset_dao = data_access.DatasetDAO()
    # dataset, err = dataset_dao.get_dataset_by_id(dataset_id)
    # if dataset is None:
    #     raise LookupError("The dataset couldn't be located")
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("The binary dataset doesn't exist on database")

    # Load dataset from binary
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    model_path, err = dataset_dao.get_model(dataset_id)
    if model_path is None:
        raise FileNotFoundError("The model path does not exist on database")

    # Load the model and initialize the search index
    model = skge.TransE.load(model_path)

    return_list = []
    for entity in entities:
        position = dtset.get_entity_id(entity)
        if position is None or position < 0:
            continue
        else:
            embedding = model.E[position]
            return_list.append([entity, embedding.tolist()])

    return return_list
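# Minimal usage sketch for the helper above. The dataset id and entity
# identifiers are made up for illustration only.
#
#   embeddings = find_embeddings_on_model(1, ["Q1492", "Q2807"])
#   for entity, vector in embeddings:
#       print(entity, len(vector))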
def build_search_index(self, dataset_id, n_trees):
    """Builds the search index and stores it on disk

    :param int dataset_id: The dataset whose trained model will be indexed
    :param int n_trees: The number of trees to be generated. Default is 100
    """
    # Check input params
    if n_trees is None:
        n_trees = 100

    # Creates the progress object in redis
    celery_uuid = self.request.id
    progres_dao = data_access.ProgressDAO()
    progres_dao.create_progress(celery_uuid, 3)
    progres_dao.update_progress(celery_uuid, 0)

    dataset_dao = data_access.DatasetDAO()
    # Set working status
    dataset_dao.update_status(dataset_id, INDEXED_MASK | RUNNING_TASK_MASK)
    model_path, err = dataset_dao.get_model(dataset_id)

    # Load the model and initialize the search index
    model = skge.TransE.load(model_path)
    search_index = server.SearchIndex()

    # File to store the search index
    search_index_file = model_path[:-4] + "_annoy_{}.bin".format(n_trees)

    # Execute heavy task and track the progress
    progres_dao.update_progress(celery_uuid, 1)
    search_index.build_from_trained_model(model, n_trees)
    progres_dao.update_progress(celery_uuid, 2)
    search_index.save_to_binary(search_index_file)
    progres_dao.update_progress(celery_uuid, 3)

    # Update values on DB
    dataset_dao.update_status(dataset_id, INDEXED_MASK, statusAnd=0b1110)
    dataset_dao.set_search_index(dataset_id, search_index_file)
    return False
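# This is a bound celery task, so it is normally enqueued rather than called
# directly. A minimal sketch, assuming it is exposed from the same
# async_tasks module as the other tasks (the dataset id is illustrative):
#
#   task = async_tasks.build_search_index.delay(1, n_trees=100)
#   print(task.id)  # UUID used by ProgressDAO to track progress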
def on_post(self, req, resp, dataset_id, dataset_dto, entities_pair):
    """Returns the true distance between two entities

    {"distance": ["http://www.wikidata.org/entity/Q1492",
                  "http://www.wikidata.org/entity/Q2807"]
    }

    :param int dataset_id: The dataset identifier on database
    :param DTO dataset_dto: The Dataset DTO from dataset_id (from hook)
    :param tuple entities_pair: A pair of entities (from hook)
    :returns: A distance attribute, float number
    :rtype: dict
    """
    dataset_dao = data_access.DatasetDAO()
    dataset = dataset_dao.build_dataset_object(dataset_dto)  # TODO: design

    # Get server to do 'queries'
    search_index, err = dataset_dao.get_search_index(dataset_dto)
    if search_index is None:
        msg_title = "Dataset not ready to perform search operations"
        raise falcon.HTTPConflict(title=msg_title, description=str(err))
    # TODO: Maybe extract server management anywhere to simplify this
    search_server = server.Server(search_index)

    entity_x, entity_y = entities_pair
    id_x = dataset.get_entity_id(entity_x)
    id_y = dataset.get_entity_id(entity_y)
    if id_x is None or id_y is None:
        raise falcon.HTTPNotFound(
            description=("Entity {} or entity {} can't be found on the "
                         "dataset").format(entity_x, entity_y))

    dist = search_server.distance_between_entities(id_x, id_y)

    resp.body = json.dumps({"distance": dist})
    resp.content_type = 'application/json'
    resp.status = falcon.HTTP_200
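# Illustrative client call for the distance handler above (a sketch). The
# body shape comes from the docstring; the /datasets/{dataset_id}/distance
# route and the host are assumptions.
#
#   import requests
#
#   body = {"distance": ["http://www.wikidata.org/entity/Q1492",
#                        "http://www.wikidata.org/entity/Q2807"]}
#   r = requests.post("http://localhost:8000/datasets/1/distance", json=body)
#   print(r.json()["distance"])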
def generate_dataset_from_sparql(self, dataset_id, graph_pattern, levels,
                                 **keyw_args):
    """Creates a recurrent dataset from a seed vector

    This method is intended to be called only with celery *.delay()*, so
    it is executed in the background. The status of the generation can be
    queried through its celery UUID.

    :param int dataset_id: The dataset to fill with new triples
    :param string graph_pattern: The main query containing triples
    :param int levels: The number of levels to scan
    :kwparam int batch_size: The batch size used on each query (optional)
    :kwparam limit_ent: Use only for testing purposes
    """
    from celery import current_task  # in task definition
    dataset_dao = data_access.DatasetDAO()
    dataset_dao.update_status(dataset_id, RUNNING_TASK_MASK)
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("Dataset path is not on the system")

    # Load current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The id of the object
    celery_uuid = self.request.id
    # Saves the empty id to be retrieved first time without error
    # redis.set(celery_uuid, "{}".encode("utf-8"))
    progres_dao = data_access.ProgressDAO()
    progres_dao.create_progress(celery_uuid, 1)
    progress = progres_dao.get_progress(celery_uuid)
    progress.total_steps = 1
    progress.current_steps = 1
    progress.current = 0
    progress.total = 0
    progres_dao.set_progress(celery_uuid, progress)

    def init_progress_callback(max_iter):
        progress = progres_dao.get_progress(celery_uuid)
        progress.total = max_iter
        progres_dao.set_progress(celery_uuid, progress)

    sv_kwargs = {}
    sv_kwargs['where'] = graph_pattern
    sv_kwargs['callback'] = lambda: progres_dao.add_progress(celery_uuid)
    sv_kwargs['start_callback'] = init_progress_callback

    # Batch limit has to be an integer
    try:
        sv_kwargs['batch_size'] = int(keyw_args.pop('batch_size'))
    except (LookupError, ValueError, TypeError):
        pass

    # Get the seed vector and load first entities
    seed_vector = dtset.load_from_graph_pattern(**sv_kwargs)

    celery_uuid = "celery-task-progress-" + self.request.id

    def status_callback(status):
        """Saves the progress of the task on redis db"""
        # Create progress object
        progress = {
            "current": status['it_analyzed'],
            "total": status['it_total'],
            "current_steps": status['round_curr'] + 1,
            "total_steps": status['round_total']
        }
        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)
        # Add task progress
        task['progress'] = progress
        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Build the optional args dict
    keyw_args["ext_callback"] = status_callback

    # Call to the *heavy* method
    dtset.load_dataset_recurrently(levels, seed_vector, **keyw_args)

    # Save new binary
    dtset.save_to_binary(dataset_path)

    # Restore status
    dataset_dao.set_status(dataset_id, 0)
    return False
def build_autocomplete_index(self, dataset_id, langs=['en', 'es']):
    """Generates an autocomplete index from a dataset using the chosen
    languages

    This method extracts labels, descriptions and other useful information
    from the sparql endpoint (or any other source) and stores it on the
    search database (elasticsearch). As the dataset may contain a lot of
    information in many languages, this will only use the selected
    languages.

    :param int dataset_id: The dataset ID
    :param list langs: A list of languages in ISO 639-1 format
    """
    # Creates the progress object in redis
    celery_uuid = self.request.id
    progres_dao = data_access.ProgressDAO()

    # Load binary dataset
    dataset_dao = data_access.DatasetDAO()
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    # Set working status
    # TODO: update status, not overwrite it
    dataset_dao.update_status(dataset_id,
                              SEARCHINDEXED_MASK | RUNNING_TASK_MASK)

    # Update Progress
    progres_dao.create_progress(celery_uuid, len(dtset.entities))
    progres_dao.update_progress(celery_uuid, 0)

    entity_dao = data_access.EntityDAO(dataset_dto.dataset_type, dataset_id)

    def get_labels(entity):
        """Auxiliary method to wrap dtset.entity_labels.

        Receives only one entity and stores it on the search index.
        """
        # Get the labels from endpoint
        labels, descriptions, alt_labels = dtset.entity_labels(
            entity, langs=langs)
        # track progress: add one more step
        progres_dao.add_progress(celery_uuid)
        # Create the doc to be stored on elasticsearch and insert it
        entity_doc = {
            "entity_id": entity,
            "entity_uri": dtset.check_entity(entity),
            "label": labels,
            "alt_label": alt_labels,
            "description": descriptions
        }
        entity_dao.insert_entity(entity_doc)

    # Execute get_labels concurrently, using as many threads as cpu cores
    with ThreadPool(multiprocessing.cpu_count()) as p:
        all_labels = p.map(get_labels, dtset.entities)

    # Update status on DB when finished
    dataset_dao.update_status(dataset_id, SEARCHINDEXED_MASK,
                              statusAnd=0b1110)
    return False
def train_dataset_from_algorithm(self, dataset_id, algorithm_dict):
    """Trains a dataset given an algorithm

    It is able to save the progress of training.

    :param int dataset_id: The dataset to be trained
    :param dict algorithm_dict: The algorithm to be used in dataset training
    """
    dataset_dao = data_access.DatasetDAO()
    # If it all goes ok, add id of algorithm to db
    dataset_dao.set_algorithm(dataset_id, algorithm_dict["id"])
    dataset_dao.update_status(dataset_id, RUNNING_TASK_MASK | TRAINED_MASK)
    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)

    # Generate the filepath to the dataset
    dtset_path = dataset_dto.get_binary_dataset()
    # Loads the current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dtset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The id of the object
    celery_uuid = "celery-task-progress-" + self.request.id
    # Saves the empty id to be retrieved first time without error
    progress = {
        "current": -1,
        "total": algorithm_dict['max_epochs'],
        "current_steps": None,
        "total_steps": None
    }
    redis.set(celery_uuid,
              json.dumps({"progress": progress}).encode("utf-8"))

    def status_callback(trainer):
        """Saves the progress of the task on redis db"""
        print("Status Callback. Trainer {}".format(trainer.epoch))
        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)
        # Add task progress
        task['progress']['current'] = trainer.epoch
        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Creates an optional parameters dict for better readability
    kwargs = {
        'train_all': True,   # All dataset will be trained, not validated
        'test_all': -1,      # No validation is going to be performed
        'model_type': skge.TransE,   # The default model will be used
        'ncomp': algorithm_dict['embedding_size'],   # Provided by the algorithm
        'margin': algorithm_dict['margin'],          # Provided by the algorithm
        'max_epochs': algorithm_dict['max_epochs'],  # Max number of iterations
        'external_callback': status_callback,        # The status callback
    }

    # Heavy task
    model = algorithm.ModelTrainer(dtset, **kwargs)
    modeloentrenado = model.run()

    model_path = dtset_path[:-4] + "_model.bin"
    modeloentrenado.save(model_path)

    # Update values on DB when model training has finished
    dataset_dao.update_status(dataset_id, TRAINED_MASK, statusAnd=0b1110)
    dataset_dao.set_model(dataset_id, model_path)

    return False
def on_get(self, req, resp, dataset_id, entity, embedding=False):
    """Makes the HTTP response for a SimilarEntities search

    It may be used directly with GET, but it is discouraged. This method
    does not return anything; it builds the HTTP response with Falcon.

    :param int dataset_id: The dataset identifier on database
    :param string entity: Can be either identifier or embedding vector
    :param boolean embedding: True if entity param is an embedding
    :query int limit: Limit of similar entities returned.
                      By default is set to 10
    :query int search_k: Maximum number of nodes where the search is made.
                         The higher this param is, the higher the quality,
                         but the worse the performance. Defaults to -1
    :returns: None
    """
    # Get dataset
    dataset_dao = data_access.DatasetDAO()
    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
    if dataset_dto is None:
        raise falcon.HTTPNotFound(description=str(err))

    # Ignore dataset status. May produce unpredictable results
    ignore = req.get_param_as_bool("ignore_status")
    if ignore is None:
        ignore = False

    dataset = dataset_dao.build_dataset_object(dataset_dto)  # TODO: design

    # Get server to do 'queries'
    search_index, err = dataset_dao.get_search_index(
        dataset_dto, ignore_status=ignore)
    if search_index is None:
        msg_title = "Dataset not ready to perform search operations"
        raise falcon.HTTPConflict(title=msg_title, description=str(err))
    # TODO: Maybe extract server management anywhere to simplify this
    search_server = server.Server(search_index)

    # Dig for the limit param on Query Params
    limit = req.get_param_as_int('limit')
    if limit is None:
        limit = 10  # Default value
    # Needed because server returns also the identical triple
    limit = int(limit) + 1

    # Dig for the search_k param on Query Params
    search_k = req.get_param_as_int('search_k')
    if search_k is None:
        search_k = -1

    # If looking for similar_entities given an embedding vector
    if embedding:
        similar_entities = search_server.similarity_by_embedding(
            entity, limit, search_k=search_k)
        similar_entities = [{
            "entity": dataset.get_entity(e_id),
            "distance": dist
        } for e_id, dist in similar_entities]

        entity_used = {
            "value": entity,  # Will be an embedding vector
            "type": "embedding"
        }
    # If looking for similar_entities given an entity
    else:
        entity_dao = data_access.EntityDAO(dataset_dto.dataset_type,
                                           dataset_id)
        entity_id = dataset.get_entity_id(entity)
        if entity_id is None:
            raise falcon.HTTPNotFound(
                description="The {} entity can't be found inside the "
                            "dataset.".format(entity))
        sim_entities = search_server.similarity_by_id(
            entity_id, limit, search_k=search_k)

        def getEntityDTO(e_id):
            entity = dataset.check_entity(dataset.get_entity(e_id))
            entity_dto = entity_dao.get_entity_dto(entity)
            return entity_dto.to_dict()

        if req.get_param_as_bool('object'):
            similar_entities = [{
                "entity": dataset.get_entity(e_id),
                "object": getEntityDTO(e_id),
                "distance": dist
            } for e_id, dist in sim_entities]
        else:
            similar_entities = [{
                "entity": dataset.get_entity(e_id),
                "distance": dist
            } for e_id, dist in sim_entities]

        entity_used = {
            "value": dataset.get_entity(entity_id),
            "type": "uri"
        }

    response = {
        "dataset": dataset_dto.to_dict(),
        "similar_entities": {
            "entity": entity_used,
            "limit": len(similar_entities),
            "search_k": search_k,
            "response": similar_entities
        }
    }
    resp.body = json.dumps(response)
    resp.content_type = 'application/json'
    resp.status = falcon.HTTP_200
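# Illustrative client call for the similar-entities handler above (a sketch).
# The response structure ("similar_entities" -> "response" with "entity" and
# "distance" fields) comes from the handler; the route
# /datasets/{dataset_id}/similar_entities/{entity} and the host are assumptions.
#
#   import requests
#
#   r = requests.get(
#       "http://localhost:8000/datasets/1/similar_entities/Q1492",
#       params={"limit": 5, "search_k": 10000})
#   for item in r.json()["similar_entities"]["response"]:
#       print(item["entity"], item["distance"])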