Example #1
    def on_post(self, req, resp, dataset_id, dataset_dto, triples_list):
        """Receives HTTP Request to add triples into the dataset

        This will expect an input on the body similar to this

        .. sourcecode:: json

            [
                {   "subject": "Q1492",
                    "predicate": "P17",
                    "object": "Q29" },
                {   "subject": "Q90",
                    "predicate": "P17",
                    "object": "Q142"},
                {   "subject": "Q2807",
                    "predicate": "P17",
                    "object": "Q29"}
            ]

        :param integer dataset_id: Unique ID of the dataset
        :param DatasetDTO dataset_dto: Dataset DTO (from hook)
        :param list triples_list: List of triples to insert (from hook)
        :returns: Whether the operation was successful.
        :rtype: 202 ACCEPTED
        """
        dataset_dao = data_access.DatasetDAO()
        res, err = dataset_dao.insert_triples(dataset_dto, triples_list)
        if res is None:
            raise falcon.HTTPBadRequest(description=str(err))

        textbody = {"status": 202, "message": "Resources created successfuly"}
        resp.body = json.dumps(textbody)
        resp.content_type = 'application/json'
        resp.status = falcon.HTTP_202
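For illustration, a minimal client-side sketch of calling this endpoint with the requests library; the base URL and the /datasets/{id}/triples route are assumptions, not taken from the source.

import requests

# Hypothetical deployment URL and route; adjust to the real service.
BASE_URL = "http://localhost:8000"
triples = [
    {"subject": "Q1492", "predicate": "P17", "object": "Q29"},
    {"subject": "Q90", "predicate": "P17", "object": "Q142"},
]

# The handler answers 202 ACCEPTED with a small JSON status body.
resp = requests.post(BASE_URL + "/datasets/1/triples", json=triples)
print(resp.status_code, resp.json())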
Example #2
    def on_post(self, req, resp, dataset_info, **kwargs):
        """Create a new dataset on the service

        This method will create a new empty dataset and return a 201 CREATED
        response with the Location header set to the URI of the new dataset.

        :param HTTPUserDatasetDTO dataset_info: HTTP Client dataset information
        :query int dataset_type: The dataset type (optional)
        :returns: Location header with new path to dataset object
        """
        dao = data_access.DatasetDAO()
        # Get dataset type
        dts_type = req.get_param_as_int("dataset_type")

        dataset_type = dao.get_dataset_types()[dts_type]["class"]
        id_dts, err = dao.insert_empty_dataset(
            dataset_type,
            name=dataset_info.name,
            description=dataset_info.description)

        if id_dts is None and err[0] == 409:
            raise falcon.HTTPConflict(title="The dataset name is already used",
                                      description=err[1])
        elif id_dts is None and err[0] == 500:
            raise falcon.HTTPInternalServerError(description=err[1])
        else:
            # Dataset created, everything is done
            resp.status = falcon.HTTP_201
            resp.body = json.dumps({"dataset": {"id": id_dts}})
            resp.location = "/datasets/" + str(id_dts)
Example #3
    def on_put(self, req, resp, dataset_id, **kwargs):
        """Change the description of choosen dataset

        :param HTTPUserDatasetDTO dataset_info: Object with description param
        :param integer dataset_id: Unique ID of dataset
        :returns: The dataset with changed values
        :rtype: DatasetDTO
        """
        dataset_info = HTTPUserDatasetDTO()
        try:
            dataset_info.load(kwargs["dataset_info"])
        except KeyError:
            pass

        dataset_dao = data_access.DatasetDAO()
        if dataset_info.description is not None:
            res, err = dataset_dao.set_description(dataset_id,
                                                   dataset_info.description)
            if res is None:
                raise falcon.HTTPInternalServerError(
                    title="Server Error",
                    description="Unable to process description param")

        resource, err = dataset_dao.get_dataset_by_id(dataset_id)
        response = {
            "dataset": resource.to_dict(),
        }
        resp.body = json.dumps(response)
        resp.content_type = 'application/json'
        resp.status = falcon.HTTP_200
Example #4
def _get_dataset_status(dataset_id):
    """Returns the dataset status

    :rtype: integer
    """
    d_dao = data_access.DatasetDAO()
    d_dto, err = d_dao.get_dataset_by_id(dataset_id, use_cache=True)
    return int(d_dto.status), d_dto
Example #5
def delete_dataset_by_id(dataset_id):
    """Deletes a dataset and removes its binary files from disk"""
    dataset_dao = data_access.DatasetDAO()
    list_bin_files, err = dataset_dao.delete_dataset(dataset_id)
    for bin_file in list_bin_files:
        print(bin_file)
        try:
            os.remove(bin_file)
        except IsADirectoryError:
            os.rmdir(bin_file)
Example #6
def check_dataset_exsistence(req, resp, resource, params):
    """Will check if input dataset exists.

    :returns: A dataset DTO
    """
    dataset_dao = data_access.DatasetDAO()
    cache = req.get_param_as_bool("use_cache", blank_as_true=True)
    if cache is None:
        cache = True
    params["dataset_dto"], err = dataset_dao.get_dataset_by_id(
        params['dataset_id'], use_cache=cache)
    if params["dataset_dto"] is None:
        raise falcon.HTTPNotFound(
            title="Dataset {} not found".format(params['dataset_id']),
            description="The dataset does not exists. " + str(err))
Example #7
    def on_post(self, req, resp, dataset_id, dataset_dto, gen_triples_param):
        """Generates a task to insert triples on dataset. Async petition.

        Reads from body the parameters such as SPARQL queries

        {"generate_triples":
            {
                "graph_pattern": "<SPARQL Query (Where part)>",
                "levels": 2,
                "batch_size": 30000   # Optional
            }
        }

        :param int dataset_id: The dataset to insert triples into
        :param DTO dataset_dto: The Dataset DTO from dataset_id (from hook)
        :param dict gen_triples_param: Params to call generate_triples function
                                       (from hook)
        """
        try:
            batch_size = gen_triples_param.pop("batch_size")
        except KeyError:
            batch_size = None

        # Launch async task
        task = async_tasks.generate_dataset_from_sparql.delay(
            dataset_id,
            gen_triples_param.pop("graph_pattern"),
            int(gen_triples_param.pop("levels")),
            batch_size=batch_size)

        # Create a new task
        task_dao = data_access.TaskDAO()
        task_obj, err = task_dao.add_task_by_uuid(task.id)
        if task_obj is None:
            raise falcon.HTTPNotFound(description=str(err))
        task_obj["next"] = "/datasets/" + dataset_id
        task_dao.update_task(task_obj)

        # Store the task into DatasetDTO
        dataset_dao = data_access.DatasetDAO()
        dataset_dao.set_task(dataset_id, task_obj['id'])

        msg = "Task {} created successfuly".format(task_obj['id'])
        textbody = {"status": 202, "message": msg, "task": task_dao.task}
        resp.location = "/tasks/" + str(task_obj['id'])
        resp.body = json.dumps(textbody)
        resp.content_type = 'application/json'
        resp.status = falcon.HTTP_202
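The documented request body can be exercised with a sketch like the one below; the /datasets/{id}/generate_triples route, the base URL and the SPARQL pattern are placeholder assumptions.

import requests

BASE_URL = "http://localhost:8000"  # hypothetical deployment
body = {
    "generate_triples": {
        "graph_pattern": "?subject wdt:P31 wd:Q515 . ",
        "levels": 2,
        "batch_size": 30000
    }
}

resp = requests.post(BASE_URL + "/datasets/1/generate_triples", json=body)
# 202 ACCEPTED; the Location header points to the tracking task.
print(resp.status_code, resp.headers.get("Location"))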
Example #8
    def on_post(self, req, resp, dataset_id, dataset_dto):
        """Generates a model that fits the dataset and trains it

        It usually takes several hours for big datasets

        This changes the dataset status from 0 to 1 once finished. While
        training takes place, the status will be set to a negative value.

        :param int dataset_id: The dataset to be trained
        :param DTO dataset_dto: The Dataset DTO from dataset_id (from hook)
        :query int algorithm_id: The algorithm used to train the dataset
        """
        # Dig for the algorithm_id param on Query Params
        algorithm_id = req.get_param('algorithm_id', required=True)

        # Obtain the algorithm
        algorithm_dao = data_access.AlgorithmDAO()
        algorithm, err = algorithm_dao.get_algorithm_by_id(algorithm_id)
        if algorithm is None:
            raise falcon.HTTPNotFound(description=str(err))

        # Launch async task
        task = async_tasks.train_dataset_from_algorithm.delay(
            dataset_id, algorithm)

        # Create the new task
        task_dao = data_access.TaskDAO()
        task_obj, err = task_dao.add_task_by_uuid(task.id)
        if task_obj is None:
            raise falcon.HTTPNotFound(description=str(err))
        task_obj["next"] = "/datasets/" + dataset_id
        task_dao.update_task(task_obj)

        # Store the task into DatasetDTO
        dataset_dao = data_access.DatasetDAO()
        dataset_dao.set_task(dataset_id, task_obj['id'])

        msg = "Task {} created successfuly".format(task_obj['id'])
        textbody = {"status": 202, "message": msg}
        resp.location = "/tasks/" + str(task_obj['id'])
        resp.body = json.dumps(textbody)
        resp.content_type = 'application/json'
        resp.status = falcon.HTTP_202
Example #9
    def on_get(self, req, resp):
        """Return all datasets available on the service

        :query boolean use_cache: False if cache must be reloaded, True if
                                  values returned can be those cached.
        :returns: A list with all datasets
        """
        cache = req.get_param_as_bool("use_cache", blank_as_true=True)
        if cache is None:
            cache = True

        dao = data_access.DatasetDAO()

        listdts, err = dao.get_all_datasets(use_cache=cache)

        if listdts is None:
            raise falcon.HTTPNotFound(description=str(err))

        response = [{"dataset": dtst.to_dict()} for dtst in listdts]
        resp.body = json.dumps(response)
        resp.content_type = 'application/json'
        resp.status = falcon.HTTP_200
Example #10
def find_embeddings_on_model(dataset_id, entities):
    """Returns a list with the corresponding embeddings

    This will return a list like this:

    [["Q1", [0, 1, -1, 0.4]], ["Q5", [1, -0.5, -0.1, 0]]]

    :param int dataset_id: The dataset identifier on the database
    :param list entities: A list with the URIs (or identifiers) of entities
    :returns: The embedding vector of each entity found on the dataset
    :rtype: list
    """
    # Returns [entity, embedding] pairs, keeping the order of the input entities
    dataset_dao = data_access.DatasetDAO()
    # dataset, err = dataset_dao.get_dataset_by_id(dataset_id)
    # if dataset is None:
    #     raise LookupError("The dataset couldn't be located")

    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("The binary dataset doesn't exist on database")

    # Load dataset from binary
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    model_path, err = dataset_dao.get_model(dataset_id)
    if model_path is None:
        raise FileNotFoundError("The model path does not exist on database")
    # Load the model and initialize the search index
    model = skge.TransE.load(model_path)

    return_list = []
    for entity in entities:
        position = dtset.get_entity_id(entity)
        if position is None or position < 0:
            continue
        else:
            embedding = model.E[position]
        return_list.append([entity, embedding.tolist()])
    return return_list
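A short usage sketch of the helper above; the dataset id and the entity identifiers are placeholders.

# Hypothetical call: look up the embeddings of two entities in dataset 1.
# Entities missing from the dataset are silently skipped.
embeddings = find_embeddings_on_model(1, ["Q1492", "Q2807"])
for entity, vector in embeddings:
    print(entity, "->", len(vector), "dimensions")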
Example #11
def build_search_index(self, dataset_id, n_trees):
    """Builds the search index and stores in disk

    :param str model_path: The path to the binary file which stores the model
    :param int n_trees: The number of trees to be generated. Default is 100
    """
    # Check input Params
    if n_trees is None:
        n_trees = 100

    # Creates the progress object in redis
    celery_uuid = self.request.id
    progres_dao = data_access.ProgressDAO()
    progres_dao.create_progress(celery_uuid, 3)
    progres_dao.update_progress(celery_uuid, 0)

    dataset_dao = data_access.DatasetDAO()
    # Set working status
    dataset_dao.update_status(dataset_id, INDEXED_MASK | RUNNING_TASK_MASK)
    model_path, err = dataset_dao.get_model(dataset_id)
    # Load the model and initialize the search index
    model = skge.TransE.load(model_path)
    search_index = server.SearchIndex()

    # File to store the search index
    search_index_file = model_path[:-4] + "_annoy_{}.bin".format(n_trees)

    # Execute heavy task and track the progress
    progres_dao.update_progress(celery_uuid, 1)
    search_index.build_from_trained_model(model, n_trees)
    progres_dao.update_progress(celery_uuid, 2)
    search_index.save_to_binary(search_index_file)
    progres_dao.update_progress(celery_uuid, 3)

    # Update values on DB
    dataset_dao.update_status(dataset_id, INDEXED_MASK, statusAnd=0b1110)
    dataset_dao.set_search_index(dataset_id, search_index_file)

    return False
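The status handling treats the dataset status as a bit field: masks such as INDEXED_MASK and RUNNING_TASK_MASK flag what has been computed and whether a task is running, and statusAnd=0b1110 clears the low "running" bit when the job finishes. A minimal sketch of that bit arithmetic with made-up mask values (the real constants and the exact update_status semantics are defined elsewhere in the project):

# Hypothetical mask values, used only to illustrate the bit arithmetic.
RUNNING_TASK_MASK = 0b0001
INDEXED_MASK = 0b0100

status = 0b0000                             # idle dataset
status |= INDEXED_MASK | RUNNING_TASK_MASK  # task started -> 0b0101
status = (status | INDEXED_MASK) & 0b1110   # task finished: keep INDEXED,
                                            # clear the running bit -> 0b0100
assert status == INDEXED_MASK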
Example #12
    def on_post(self, req, resp, dataset_id, dataset_dto, entities_pair):
        """This method return the true distance between two entities

        {"distance":
            ["http://www.wikidata.org/entity/Q1492",
             "http://www.wikidata.org/entity/Q2807"]
        }

        :param int dataset_id: The dataset identifier on database
        :param DTO dataset_dto: The Dataset DTO from dataset_id (from hook)
        :param tuple entities_pair: A pair of entities (from hook)
        :returns: A distance attribute, float number
        :rtype: dict
        """
        dataset_dao = data_access.DatasetDAO()
        dataset = dataset_dao.build_dataset_object(dataset_dto)  # TODO: design

        # Get server to do 'queries'
        search_index, err = dataset_dao.get_search_index(dataset_dto)
        if search_index is None:
            msg_title = "Dataset not ready perform search operation"
            raise falcon.HTTPConflict(title=msg_title, description=str(err))
        # TODO: Maybe extract server management anywhere to simplify this
        search_server = server.Server(search_index)
        entity_x, entity_y = entities_pair
        id_x = dataset.get_entity_id(entity_x)
        id_y = dataset.get_entity_id(entity_y)
        if id_x is None or id_y is None:
            raise falcon.HTTPNotFound(
                description=("The {} id from entity {} or the {} id from {} "
                             "entity can't be found on the dataset"
                             ).format(id_x, entity_x, id_y, entity_y))

        dist = search_server.distance_between_entities(id_x, id_y)

        resp.body = json.dumps({"distance": dist})
        resp.content_type = 'application/json'
        resp.status = falcon.HTTP_200
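A client-side sketch for the distance endpoint using the body format shown in the docstring; the route and base URL are assumptions.

import requests

BASE_URL = "http://localhost:8000"  # hypothetical deployment
body = {
    "distance": [
        "http://www.wikidata.org/entity/Q1492",
        "http://www.wikidata.org/entity/Q2807"
    ]
}

resp = requests.post(BASE_URL + "/datasets/1/distance", json=body)
print(resp.json())  # {"distance": <float>}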
Example #13
def generate_dataset_from_sparql(self, dataset_id, graph_pattern, levels,
                                 **keyw_args):
    """Creates a recurrent dataset from a seed vector

    This method is intended to be called only with celery *.delay()*, to
    be executed in foreground. The status of the generation can be queried
    through it's celery UUID.

    :param levels: The number of levels to scan
    :param dataset_path: The path to dataset file
    :param graph_pattern: The main query containing triples
    :kwparam limit_ent: Use only for testing purposes
    """
    from celery import current_task  # in task definition
    dataset_dao = data_access.DatasetDAO()
    dataset_dao.update_status(dataset_id, RUNNING_TASK_MASK)

    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("Dataset path is not on the system")

    # Load current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The id of the object
    celery_uuid = self.request.id
    # Saves the empty id to be retrieved first time without error
    # redis.set(celery_uuid, "{}".encode("utf-8"))
    progres_dao = data_access.ProgressDAO()
    progres_dao.create_progress(celery_uuid, 1)
    progress = progres_dao.get_progress(celery_uuid)
    progress.total_steps = 1
    progress.current_steps = 1
    progress.current = 0
    progress.total = 0
    progres_dao.set_progress(celery_uuid, progress)

    def init_progress_callback(max_iter):
        progress = progres_dao.get_progress(celery_uuid)
        progress.total = max_iter
        progres_dao.set_progress(celery_uuid, progress)

    sv_kwargs = {}
    sv_kwargs['where'] = graph_pattern
    sv_kwargs['callback'] = lambda: progres_dao.add_progress(celery_uuid)
    sv_kwargs['start_callback'] = init_progress_callback

    # Batch limit has to be an integer
    try:
        sv_kwargs['batch_size'] = int(keyw_args.pop('batch_size'))
    except (LookupError, ValueError, TypeError):
        pass

    # Get the seed vector and load first entities
    seed_vector = dtset.load_from_graph_pattern(**sv_kwargs)

    celery_uuid = "celery-task-progress-" + self.request.id

    def status_callback(status):
        """Saves the progress of the task on redis db"""
        # Create progress object
        progress = {
            "current": status['it_analyzed'],
            "total": status['it_total'],
            "current_steps": status['round_curr'] + 1,
            "total_steps": status['round_total']
        }

        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)

        # Add task progress
        task['progress'] = progress

        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Build the optional args dict
    keyw_args["ext_callback"] = status_callback

    # Call to the *heavy* method
    dtset.load_dataset_recurrently(levels, seed_vector, **keyw_args)

    # Save new binary
    dtset.save_to_binary(dataset_path)

    # Restore status
    dataset_dao.set_status(dataset_id, 0)

    return False
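A sketch of how this task might be launched and monitored, assuming the function is registered as a celery task (as the docstring indicates) and that ProgressDAO.get_progress returns the progress object manipulated above; the dataset id and SPARQL pattern are placeholders.

import time

# Hypothetical launch of the background generation.
task = generate_dataset_from_sparql.delay(1, "?subject wdt:P31 wd:Q515 .", 2,
                                          batch_size=30000)

progres_dao = data_access.ProgressDAO()
while not task.ready():
    progress = progres_dao.get_progress(task.id)
    print("step {}/{}: {}/{}".format(progress.current_steps,
                                     progress.total_steps,
                                     progress.current, progress.total))
    time.sleep(5)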
Example #14
def build_autocomplete_index(self, dataset_id, langs=['en', 'es']):
    """Generates an autocomplete index from a dataset using choosen languages

    This method extracts labels, descriptions and other useful information
    from sparql endpoint (or any other source) and stores it on the search
    database (elasticsearch). As the dataset may contain too much information
    in many languages, this will only use the selected languages.

    :param int dataset_id: The dataset ID
    :param list langs: A list of languages in ISO 639-1 format
    """
    # Creates the progress object in redis
    celery_uuid = self.request.id
    progres_dao = data_access.ProgressDAO()

    # Load binary dataset
    dataset_dao = data_access.DatasetDAO()
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)
    # Set working status
    # TODO: update status, not overwrite it
    dataset_dao.update_status(dataset_id,
                              SEARCHINDEXED_MASK | RUNNING_TASK_MASK)
    # Update Progress
    progres_dao.create_progress(celery_uuid, len(dtset.entities))
    progres_dao.update_progress(celery_uuid, 0)

    entity_dao = data_access.EntityDAO(dataset_dto.dataset_type, dataset_id)

    def get_labels(entity):
        """Auxiliar method to wrap dtset.entity_labels.

        Receives only one entity and stores on search Index
        """
        # Get the labels from endpoint
        labels, descriptions, alt_labels = dtset.entity_labels(entity,
                                                               langs=langs)

        # track progress: add one more step
        progres_dao.add_progress(celery_uuid)

        # Create the doc to be stored on elasticsearch and insert it
        entity_doc = {
            "entity_id": entity,
            "entity_uri": dtset.check_entity(entity),
            "label": labels,
            "alt_label": alt_labels,
            "description": descriptions
        }
        entity_dao.insert_entity(entity_doc)

    # Execute get_labels concurrently, using as many threads as cpu cores
    with ThreadPool(multiprocessing.cpu_count()) as p:
        all_labels = p.map(get_labels, dtset.entities)

    # Update status on DB when finished
    dataset_dao.update_status(dataset_id, SEARCHINDEXED_MASK, statusAnd=0b1110)

    return False
Example #15
def train_dataset_from_algorithm(self, dataset_id, algorithm_dict):
    """Trains a dataset given an algorithm

    It is able to save the progress of training.
    :param str dataset_path: The path where binary dataset is located
    :param dict algorithm: An algorithm to be used in dataset training
    """

    dataset_dao = data_access.DatasetDAO()

    # If it all goes ok, add id of algorithm to db
    dataset_dao.set_algorithm(dataset_id, algorithm_dict["id"])
    dataset_dao.update_status(dataset_id, RUNNING_TASK_MASK | TRAINED_MASK)

    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
    # Generate the filepath to the dataset
    dtset_path = dataset_dto.get_binary_dataset()
    # Loads the current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dtset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The id of the object
    celery_uuid = "celery-task-progress-" + self.request.id
    # Saves the empty id to be retrieved first time without error
    progress = {
        "current": -1,
        "total": algorithm_dict['max_epochs'],
        "current_steps": None,
        "total_steps": None
    }
    redis.set(celery_uuid, json.dumps({"progress": progress}).encode("utf-8"))

    def status_callback(trainer):
        """Saves the progress of the task on redis db"""
        print("Status Callback. Trainer {}".format(trainer.epoch))
        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)

        # Add task progress
        task['progress']['current'] = trainer.epoch

        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Creates an optional parameters dict for better readability
    kwargs = {
        'train_all': True,  # All dataset will be trained, not validated
        'test_all': -1,  # No validation is going to be performed
        'model_type': skge.TransE,  # The default model will be used
        'ncomp': algorithm_dict['embedding_size'],  # Provided by the algorithm
        'margin': algorithm_dict['margin'],  # Provided by the algorithm
        'max_epochs': algorithm_dict['max_epochs'],  # Max number of iterations
        'external_callback': status_callback,  # The status callback
    }

    # Heavy task
    model = algorithm.ModelTrainer(dtset, **kwargs)
    modeloentrenado = model.run()
    model_path = dtset_path[:-4] + "_model.bin"
    modeloentrenado.save(model_path)

    # Update values on DB when model training has finished
    dataset_dao.update_status(dataset_id, TRAINED_MASK, statusAnd=0b1110)
    dataset_dao.set_model(dataset_id, model_path)

    return False
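The task only reads a handful of keys from algorithm_dict (id, embedding_size, margin, max_epochs). A sketch of launching it with such a dict, using placeholder values and assuming the function is registered as a celery task:

# Placeholder algorithm description with the keys the task actually reads.
algorithm_dict = {
    "id": 2,
    "embedding_size": 100,
    "margin": 2.0,
    "max_epochs": 500,
}

# Launched through celery, as the training endpoint above does.
task = train_dataset_from_algorithm.delay(1, algorithm_dict)
print("Training task:", task.id)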
Example #16
    def on_get(self, req, resp, dataset_id, entity, embedding=False):
        """Makes HTTP response for a SimilarEntities search

        It may be used directly with GET, but that is discouraged. This method
        does not return anything; instead it builds the HTTP response with Falcon.

        :param int dataset_id: The dataset identifier on database
        :param string entity: Can be either identifier or embedding vector
        :param boolean embedding: True if entity param is an embedding
        :query int limit: Limit of similar entities returned.
                          By default is set to 10
        :query int search_k: Maximum number of nodes where the search is made.
                             The higher this param is, the better the quality,
                             but the worse the performance. Defaults to -1
        :returns: None
        """
        # Get dataset
        dataset_dao = data_access.DatasetDAO()
        dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
        if dataset_dto is None:
            raise falcon.HTTPNotFound(description=str(err))

        # Ignore dataset status. May produce unpredictable results
        ignore = req.get_param_as_bool("ignore_status")
        if ignore is None:
            ignore = False

        dataset = dataset_dao.build_dataset_object(dataset_dto)  # TODO: design

        # Get server to do 'queries'
        search_index, err = dataset_dao.get_search_index(dataset_dto,
                                                         ignore_status=ignore)
        if search_index is None:
            msg_title = "Dataset not ready perform search operation"
            raise falcon.HTTPConflict(title=msg_title, description=str(err))
        # TODO: Maybe extract server management anywhere to simplify this
        search_server = server.Server(search_index)

        # Dig for the limit param on Query Params
        limit = req.get_param_as_int('limit')
        if limit is None:
            limit = 10  # Default value
        # Needed because the server also returns the queried entity itself
        limit = int(limit) + 1

        # Dig for the search_k param on Query Params
        search_k = req.get_param_as_int('search_k')
        if search_k is None:
            search_k = -1

        # If looking for similar_entities given an embedding vector
        if embedding:
            similar_entities = search_server.similarity_by_embedding(
                entity, limit, search_k=search_k)
            similar_entities = [{
                "entity": dataset.get_entity(e_id),
                "distance": dist
            } for e_id, dist in similar_entities]

            entity_used = {
                "value": entity,  # Will be an embedding vector
                "type": "embedding"
            }
        # If looking for similar_entities given an entity
        else:
            entity_dao = data_access.EntityDAO(dataset_dto.dataset_type,
                                               dataset_id)
            entity_id = dataset.get_entity_id(entity)
            if entity_id is None:
                raise falcon.HTTPNotFound(
                    description="The {} entity can't be found inside dataset.".
                    format(entity))
            sim_entities = search_server.similarity_by_id(entity_id,
                                                          limit,
                                                          search_k=search_k)

            def getEntityDTO(e_id):
                entity = dataset.check_entity(dataset.get_entity(e_id))
                entity_dto = entity_dao.get_entity_dto(entity)
                return entity_dto.to_dict()

            if req.get_param_as_bool('object'):
                similar_entities = [{
                    "entity": dataset.get_entity(e_id),
                    "object": getEntityDTO(e_id),
                    "distance": dist
                } for e_id, dist in sim_entities]
            else:
                similar_entities = [{
                    "entity": dataset.get_entity(e_id),
                    "distance": dist
                } for e_id, dist in sim_entities]
            entity_used = {
                "value": dataset.get_entity(entity_id),
                "type": "uri"
            }

        response = {
            "dataset": dataset_dto.to_dict(),
            "similar_entities": {
                "entity": entity_used,
                "limit": len(similar_entities),
                "search_k": search_k,
                "response": similar_entities
            }
        }
        resp.body = json.dumps(response)
        resp.content_type = 'application/json'
        resp.status = falcon.HTTP_200
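Finally, a client-side sketch of the similar-entities lookup with its documented query parameters; the /datasets/{id}/similar_entities/{entity} route and the base URL are assumptions.

import requests

BASE_URL = "http://localhost:8000"  # hypothetical deployment
resp = requests.get(BASE_URL + "/datasets/1/similar_entities/Q1492",
                    params={"limit": 5, "search_k": 10000, "object": "true"})
result = resp.json()["similar_entities"]
for item in result["response"]:
    print(item["entity"], item["distance"])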