Example 1
def listplugins(args):
    if args:
        from deepnlpf.core.plugin_manager import PluginManager
        PluginManager().listplugins()
    else:
        print("❗️Wrong command!")
        print("⌨️ Try the command: deepnlpf --listplugins all")
Example 2
    def get_all_documents_database(self):
        # fetch every document belonging to the current dataset from the
        # configured database plugin.
        self.DOCUMENTS = PluginManager().call_plugin_db(
            plugin_name=self._use_db,
            operation="select_all_key",
            collection="document",
            key={"_id_dataset": ObjectId(self.ID_DATASET)},
        )
        return self.DOCUMENTS
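The method assumes `ObjectId` from the `bson` package and instance fields (`_use_db`, `ID_DATASET`) set elsewhere. For illustration, one returned document might look like the sketch below; the field names are inferred from the insert calls in `prepare_dataset` further down, and the values are placeholders.

# Hypothetical shape of a single returned document; field names inferred
# from the insert calls in prepare_dataset, values are placeholders.
example_document = {
    "_id": "5f0b1c9e8f1d2a3b4c5d6e70",          # assigned by the database plugin
    "_id_dataset": "5f0b1c9e8f1d2a3b4c5d6e71",  # id of the parent dataset
    "name": "doc_001.txt",
    "sentences": "raw text of the document",
}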
Example 3
def uninstall(args):
    if args:
        from deepnlpf.core.plugin_manager import PluginManager

        PluginManager().uninstall(args)
    else:
        print("❗️Wrong command!")
        print("⌨️ Try the command: deepnlpf --uninstall <name_plugin>")
Example 4
    def run(self, tool):

        for document in tqdm(self.DOCUMENTS, desc="Document(s)"):

            annotated_document = PluginManager().call_plugin_nlp(
                plugin_name=tool,
                document=document["sentences"],
                pipeline=self._custom_pipeline,
            )

            # validate the document annotated by the tool.
            validate_annotation(annotated_document)

            if self._use_db is not None:
                PluginManager().call_plugin_db(
                    plugin_name=self._use_db,
                    operation="insert",
                    collection="analysi",
                    document={
                        "_id_dataset": self.ID_DATASET,
                        "_id_document": document["_id"],
                        "_id_pool": self._id_pool,
                        "tool": tool,
                        "sentences": annotated_document,
                    },
                )

            data_formatting = {"sentences": annotated_document}

            # add the annotated document to the list of documents.
            self.DOCUMENTS_ANNOTATED.append(
                self.type_output_results(
                    data_formatting,
                    self.ID_DATASET,
                    document["name"],
                    tool,
                ))

        return self.DOCUMENTS_ANNOTATED
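A usage sketch for the loop above; the instance name, the "stanza" plugin, and the pre-populated fields are all assumptions here.

# Hypothetical usage; "pipeline" stands for an instance whose DOCUMENTS,
# ID_DATASET, _custom_pipeline, _id_pool and _use_db were set beforehand,
# and "stanza" for an installed NLP plugin.
annotated_documents = pipeline.run("stanza")
print(len(annotated_documents), "annotated document(s)")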
Example 5
    def load_database(self, database_name, id_dataset):
        """Load documents from a database.

        Arguments:
            database_name {str} -- Name of the database being used.
            id_dataset {str} -- Dataset id saved in the database.

        Returns:
            {list} -- A list of documents to be processed.
        """
        results = PluginManager().call_plugin_db(
            plugin_name=database_name,
            operation="select_all_key",
            collection="document",
            key={"_id_dataset": ObjectId(id_dataset)},
        )

        return results
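A usage sketch; "loader", the plugin name "mongodb", and the dataset id are illustrative and must match an installed database plugin and a previously saved dataset.

# Hypothetical call; "loader" is an instance of the surrounding class,
# "mongodb" an installed database plugin, and the id a saved dataset.
documents = loader.load_database("mongodb", "5f0b1c9e8f1d2a3b4c5d6e7f")
for doc in documents:
    print(doc["name"])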
Example 6
    def prepare_dataset(self):
        """Prepare the supplied dataset to be processed.

        Returns:
            {list} -- Returns a list of documents to be processed.
        """

        path_dataset = self._input
        dataset_name = ""
        list_documents = list()
        log_document = ""

        # check that the path is a valid directory.
        if os.path.isdir(path_dataset):
            # check whether the directory is empty.
            if not os.listdir(path_dataset):
                print("Directory empty!")
                sys.exit(0)
            # if the directory is not empty.
            else:
                # use the directory name as the dataset name.
                dataset_name = os.path.basename(os.path.normpath(path_dataset))

                # if using a database, save the dataset.
                if self._use_db is not None:
                    self.ID_DATASET = PluginManager().call_plugin_db(
                        plugin_name=self._use_db,
                        operation="insert",
                        collection="dataset",
                        document={
                            "name": dataset_name,
                            "data_time": datetime.datetime.now(),
                        },
                    )
                # if you are not using a database, generate an id for the dataset.
                else:
                    self.ID_DATASET = RandomObjectId().gen_random_object_id()

                # get the names of all files and directories in the current directory.
                directory_contents = os.listdir(path_dataset)

                list_files = []
                list_sub_directory = []  # train or test.

                # check all directory contents.
                for item in directory_contents:
                    # if the current item is a subdirectory, add it to the subdirectory list.
                    if os.path.isdir(
                            os.path.join(os.path.abspath(path_dataset), item)):
                        list_sub_directory.append(item)
                    # if the current item is a file, add it to the file list.
                    elif os.path.isfile(
                            os.path.join(os.path.abspath(path_dataset), item)):
                        list_files.append(item)

                # check whether any subdirectories exist.
                if list_sub_directory:
                    data = []

                    for directory_type in list_sub_directory:
                        print("├── {}:".format(directory_type))

                        folders_labels = os.listdir(path_dataset + "/" +
                                                    directory_type)

                        for _label in folders_labels:
                            cont_doc = 0

                            if os.path.isdir(
                                    os.path.join(
                                        os.path.abspath(path_dataset + "/" +
                                                        directory_type),
                                        _label,
                                    )):

                                for file_name in tqdm(
                                        os.listdir(path_dataset + "/" +
                                                   directory_type + "/" +
                                                   _label + "/"),
                                        desc="│   └── documents [{}]".format(
                                            _label),
                                ):
                                    cont_doc += 1

                                    text_raw = Util().open_txt(path_dataset +
                                                               "/" +
                                                               directory_type +
                                                               "/" + _label +
                                                               "/" + file_name)

                                    document = {
                                        "_id_dataset": self.ID_DATASET,
                                        "name": file_name,
                                        "type": directory_type,
                                        "label": _label,
                                        "sentences": text_raw,
                                    }

                                    if self._use_db is not None:
                                        PluginManager().call_plugin_db(
                                            plugin_name=self._use_db,
                                            operation="insert",
                                            collection="document",
                                            document=document,
                                        )

                                f = {
                                    "type": directory_type,
                                    "label": _label,
                                    "doc": cont_doc,
                                }

                                data.append(f)

                    log_document = {
                        "_id_dataset": self.ID_DATASET,
                        "info": "Save Dataset.",
                        "data": data,
                        "data_time": datetime.datetime.now(),
                    }

                elif list_files:
                    data = []
                    cont_doc = 0

                    for file_name in tqdm(os.listdir(path_dataset),
                                          desc="Document(s) save"):
                        cont_doc += 1

                        # open the file.
                        text_raw = Util().open_txt(path_dataset + "/" + file_name)

                        # if the input is raw text.
                        if self._input_type == INPUT_FORMAT_DATASET[2]:
                            text_raw = self.prepare_raw_text(text_raw)

                            item = {
                                "_id_dataset": self.ID_DATASET,
                                "name": file_name,
                                "sentences": [sentence for sentence in text_raw],
                            }
                            # add the prepared document to the list.
                            list_documents.append(item)
                        # if the input is a plain file.
                        else:
                            if self._use_db is not None:
                                document_document = {
                                    "_id_dataset": self.ID_DATASET,
                                    "name": file_name,
                                    "sentences": text_raw,
                                }

                                PluginManager().call_plugin_db(
                                    plugin_name=self._use_db,
                                    operation="insert",
                                    collection="document",
                                    document=document_document,
                                )
                            else:
                                list_documents.append({
                                    "_id": RandomObjectId().gen_random_object_id_string(),
                                    "_id_dataset": self.ID_DATASET,
                                    "name": file_name,
                                    "sentences": text_raw,
                                })

                    data.append({"doc": cont_doc})

                    if self._use_db is not None:
                        log_document = {
                            "_id_dataset": self.ID_DATASET,
                            "info": "Save Dataset.",
                            "data": data,
                            "data_time": datetime.datetime.now(),
                        }

                if self._use_db is not None:
                    PluginManager().call_plugin_db(
                        plugin_name=self._use_db,
                        operation="insert",
                        collection="log",
                        document=log_document,
                    )

        else:
            print("This path does not contain a valid directory!")
            sys.exit(0)

        return list_documents
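The directory walk above expects a layout like the following sketch: an optional first level with the split (train or test), a second level with one folder per label, and plain-text documents inside. All names here are illustrative.

my_dataset/
├── train/
│   ├── pos/
│   │   └── doc_001.txt
│   └── neg/
│       └── doc_002.txt
└── test/
    └── pos/
        └── doc_003.txt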
Example 7
    def prepare_raw_text(self, raw_text):
        log.logger.info(
            "Pre-processing - executing sentence split on raw text.")

        list_sentences = list()

        # pre-processing: tokenization and sentence split using the selected base plugin.
        if self._tool_base == "stanza":
            doc_annotation = PluginManager().call_plugin_nlp(
                plugin_name="preprocessing",
                document=raw_text,
                pipeline={
                    "lang": self._custom_pipeline["lang"],
                    "tools": {
                        "stanza": {
                            "processors": ["tokenize"]
                        }
                    },
                },
            )

            # join tokens back into sentence strings.
            for sentence in doc_annotation.sentences:
                list_tokens = list()
                for token in sentence.tokens:
                    list_tokens.append(token.text)
                list_sentences.append(" ".join(list_tokens))

        elif self._tool_base == "stanfordcorenlp":
            doc_annotation = PluginManager().call_plugin_nlp(
                plugin_name="preprocessing",
                document=raw_text,
                pipeline={
                    "lang": self._custom_pipeline["lang"],
                    "tools": {
                        "stanfordcorenlp": {
                            "processors": ["ssplit"]
                        }
                    },
                },
            )

            for item in doc_annotation[0]["sentences"]:
                sentence = list()
                for token in item["tokens"]:
                    sentence.append(token["word"])
                list_sentences.append(" ".join(sentence))

        if self._use_db is not None:
            # insert the dataset into the database.
            self.ID_DATASET = PluginManager().call_plugin_db(
                plugin_name=self._use_db,
                operation="insert",
                collection="dataset",
                document={
                    "name": "dataset_" + RandomObjectId().gen_random_object_id_string(),
                    "data_time": OutputFormat().data_time(),
                },
            )

            # insert the document(s) into the database.
            PluginManager().call_plugin_db(
                plugin_name=self._use_db,
                operation="insert",
                collection="document",
                document={
                    "_id_dataset": self.ID_DATASET,
                    "name": "document_" + RandomObjectId().gen_random_object_id_string(),
                    "sentences": [sentence for sentence in list_sentences],
                },
            )

            return self.get_all_documents_database()

        # Not using database.
        else:
            # generates a document id.
            _id = RandomObjectId().gen_random_object_id()

            # generate an id for the dataset.
            self.ID_DATASET = RandomObjectId().gen_random_object_id()

            # generates a name for the document.
            name = "document_" + RandomObjectId().gen_random_object_id_string()

            document = {
                "_id": _id,
                "_id_dataset": self.ID_DATASET,
                "name": name,
                "sentences": list_sentences,
            }

            return [document]
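In the no-database branch the return value mimics a single database record; a sketch of its shape for a two-sentence input. The ids and the name are generated randomly on each call, so the values below are placeholders.

# Hypothetical shape of the returned list; the ids and the name are
# generated randomly on each call, the hex values below are placeholders.
[{
    "_id": ObjectId("5f0b1c9e8f1d2a3b4c5d6e70"),
    "_id_dataset": ObjectId("5f0b1c9e8f1d2a3b4c5d6e71"),
    "name": "document_5f0b1c9e8f1d2a3b4c5d6e72",
    "sentences": ["First sentence .", "Second sentence ."],
}]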
Example 8
async def plugins():
    return {"plugins": PluginManager().load_manifest()}
Example 9
def annotation():
    response = {
        'corpus': db.select_all(),
        'plugins': PluginManager().loadManifest()
    }
    return jsonify(response)
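`jsonify` points to Flask here; a sketch of the surrounding route registration, where the app wiring and the route path are assumptions and `db` is stubbed in place of whatever database helper the original module imports, so the sketch stays self-contained.

# A minimal sketch assuming Flask; the route path is illustrative and the
# db helper is stubbed in place of whatever the original module imports.
from flask import Flask, jsonify
from deepnlpf.core.plugin_manager import PluginManager

app = Flask(__name__)

class _StubDb:
    def select_all(self):
        return []

db = _StubDb()

@app.route("/annotation")
def annotation():
    return jsonify({
        "corpus": db.select_all(),
        "plugins": PluginManager().loadManifest(),
    })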
Example 10
def install_user_plugin(args):
    if args:
        from deepnlpf.core.plugin_manager import PluginManager
        PluginManager().install_user_plugin(args)
    else:
        print("Command format: deepnlpf --install_user_plugin <plugin_url>")