def listplugins(args):
    if args:
        from deepnlpf.core.plugin_manager import PluginManager

        PluginManager().listplugins()
    else:
        print("❗️Wrong command!")
        print("⌨️ Try the command: deepnlpf --listplugins all")
def get_all_documents_database(self):
    """Fetch all documents linked to the current dataset from the configured database plugin."""
    self.DOCUMENTS = PluginManager().call_plugin_db(
        plugin_name=self._use_db,
        operation="select_all_key",
        collection="document",
        key={"_id_dataset": ObjectId(self.ID_DATASET)},
    )
    return self.DOCUMENTS
def uninstall(args):
    if args:
        from deepnlpf.core.plugin_manager import PluginManager

        PluginManager().uninstall(args)
    else:
        print("❗️Wrong command!")
        print("⌨️ Try the command: deepnlpf --uninstall <name_plugin>")
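# Usage sketch (not part of the project source): one way these CLI handlers
# could be wired to the flags mentioned in their help messages, using argparse.
# The parser layout below is an assumption for illustration, not the project's
# actual entry point.
import argparse

parser = argparse.ArgumentParser(prog="deepnlpf")
parser.add_argument("--listplugins", nargs="?", const="", default=None)
parser.add_argument("--uninstall", metavar="<name_plugin>", default=None)
cli_args = parser.parse_args()

if cli_args.listplugins is not None:
    listplugins(cli_args.listplugins)  # e.g. `deepnlpf --listplugins all`
if cli_args.uninstall is not None:
    uninstall(cli_args.uninstall)  # e.g. `deepnlpf --uninstall <name_plugin>`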
def run(self, tool):
    for document in tqdm(self.DOCUMENTS, desc="Document(s)"):
        annotated_document = PluginManager().call_plugin_nlp(
            plugin_name=tool,
            document=document["sentences"],
            pipeline=self._custom_pipeline,
        )

        # Validate the document annotated by the tool.
        validate_annotation(annotated_document)

        if self._use_db is not None:
            PluginManager().call_plugin_db(
                plugin_name=self._use_db,
                operation="insert",
                collection="analysi",
                document={
                    "_id_dataset": self.ID_DATASET,
                    "_id_document": document["_id"],
                    "_id_pool": self._id_pool,
                    "tool": tool,
                    "sentences": annotated_document,
                },
            )

        data_formatting = {"sentences": annotated_document}

        # Add the annotated document to the list of documents.
        self.DOCUMENTS_ANNOTATED.append(
            self.type_output_results(
                data_formatting,
                self.ID_DATASET,
                document["name"],
                tool,
            )
        )

    return self.DOCUMENTS_ANNOTATED
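# Usage sketch (assumptions: `pipeline` is an instance of the enclosing class
# with DOCUMENTS already loaded, and a "stanza" NLP plugin is installed; both
# names are illustrative, not confirmed by the source).
annotated_documents = pipeline.run("stanza")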
def load_database(self, database_name, id_dataset):
    """Load documents from a database.

    Arguments:
        database_name {str} -- Name of the database being used.
        id_dataset {str} -- Dataset id saved in the database.

    Returns:
        {list} -- A list of documents to be processed.
    """
    results = PluginManager().call_plugin_db(
        plugin_name=database_name,
        operation="select_all_key",
        collection="document",
        key={"_id_dataset": ObjectId(id_dataset)},
    )
    return results
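# For reference, the documents returned here are assumed to carry the same
# fields that prepare_dataset() writes to the "document" collection; the
# concrete values below are purely illustrative.
example_document = {
    "_id": ObjectId("5f43a1b2c3d4e5f6a7b8c9d0"),  # assigned by the database (random id when no DB is used)
    "_id_dataset": ObjectId("5f43a1b2c3d4e5f6a7b8c9d1"),
    "name": "document_01.txt",  # original file name
    "sentences": ["First sentence.", "Second sentence."],
}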
def prepare_dataset(self):
    """Prepare the supplied dataset to be processed.

    Returns:
        {list} -- Returns a list of documents to be processed.
    """
    path_dataset = self._input
    dataset_name = ""
    list_documents = list()
    log_document = ""

    # Check that the path is a valid directory.
    if os.path.isdir(path_dataset):
        # Check whether the directory is empty.
        if os.listdir(path_dataset) == []:
            print("Directory empty!")
            sys.exit(0)
        # The directory is not empty.
        else:
            # Take the directory name and use it to name the dataset.
            dataset_name = os.path.basename(os.path.normpath(path_dataset))

            # If a database is being used, save the dataset.
            if self._use_db is not None:
                self.ID_DATASET = PluginManager().call_plugin_db(
                    plugin_name=self._use_db,
                    operation="insert",
                    collection="dataset",
                    document={
                        "name": dataset_name,
                        "data_time": datetime.datetime.now(),
                    },
                )
            # If no database is being used, generate an id for the dataset.
            else:
                self.ID_DATASET = RandomObjectId().gen_random_object_id()

            # Get the names of all files and directories in the current directory.
            directory_contents = os.listdir(path_dataset)

            list_files = []
            list_sub_directory = []  # e.g. train or test.

            # Check all directory contents.
            for item in directory_contents:
                # If the current item is a sub-directory, add it to the sub-directory list.
                if os.path.isdir(os.path.join(os.path.abspath(path_dataset), item)):
                    list_sub_directory.append(item)
                # If the current item is a file, add it to the file list.
                elif os.path.isfile(os.path.join(os.path.abspath(path_dataset), item)):
                    list_files.append(item)

            # Check whether sub-directories exist.
            if list_sub_directory != []:
                data = []

                for directory_type in list_sub_directory:
                    print("├── {}:".format(directory_type))

                    folders_labels = os.listdir(path_dataset + "/" + directory_type)

                    for _label in folders_labels:
                        cont_doc = 0

                        if os.path.isdir(
                            os.path.join(
                                os.path.abspath(path_dataset + "/" + directory_type),
                                _label,
                            )
                        ):
                            for file_name in tqdm(
                                os.listdir(
                                    path_dataset + "/" + directory_type + "/" + _label + "/"
                                ),
                                desc="│ └── documents [{}]".format(_label),
                            ):
                                cont_doc += 1

                                text_raw = Util().open_txt(
                                    path_dataset
                                    + "/"
                                    + directory_type
                                    + "/"
                                    + _label
                                    + "/"
                                    + file_name
                                )

                                document = {
                                    "_id_dataset": self.ID_DATASET,
                                    "name": file_name,
                                    "type": directory_type,
                                    "label": _label,
                                    "sentences": text_raw,
                                }

                                if self._use_db is not None:
                                    PluginManager().call_plugin_db(
                                        plugin_name=self._use_db,
                                        operation="insert",
                                        collection="document",
                                        document=document,
                                    )

                            f = {
                                "type": directory_type,
                                "label": _label,
                                "doc": cont_doc,
                            }
                            data.append(f)

                log_document = {
                    "_id_dataset": self.ID_DATASET,
                    "info": "Save Dataset.",
                    "data": data,
                    "data_time": datetime.datetime.now(),
                }

            elif list_files != []:
                data = []
                cont_doc = 0

                for file_name in tqdm(os.listdir(path_dataset), desc="Document(s) saved"):
                    cont_doc += 1

                    # Open the file.
                    text_raw = Util().open_txt(path_dataset + "/" + file_name)

                    # Input is raw text.
                    if self._input_type == INPUT_FORMAT_DATASET[2]:
                        text_raw = self.prepare_raw_text(text_raw)

                        item = {
                            "_id_dataset": self.ID_DATASET,
                            "name": file_name,
                            "sentences": [sentence for sentence in text_raw],
                        }
                    # Input is a file.
                    else:
                        if self._use_db is not None:
                            document_document = {
                                "_id_dataset": self.ID_DATASET,
                                "name": file_name,
                                "sentences": text_raw,
                            }

                            PluginManager().call_plugin_db(
                                plugin_name=self._use_db,
                                operation="insert",
                                collection="document",
                                document=document_document,
                            )
                        else:
                            list_documents.append(
                                {
                                    "_id": RandomObjectId().gen_random_object_id_string(),
                                    "_id_dataset": self.ID_DATASET,
                                    "name": file_name,
                                    "sentences": text_raw,
                                }
                            )

                data.append({"doc": cont_doc})

                if self._use_db is not None:
                    log_document = {
                        "_id_dataset": self.ID_DATASET,
                        "info": "Save Dataset.",
                        "data": data,
                        "data_time": datetime.datetime.now(),
                    }

            if self._use_db is not None:
                PluginManager().call_plugin_db(
                    plugin_name=self._use_db,
                    operation="insert",
                    collection="log",
                    document=log_document,
                )
    else:
        print("This path does not contain a valid directory!")
        sys.exit(0)

    return list_documents
def prepare_raw_text(self, raw_text):
    log.logger.info("Pre-processing - Execute sentence split on the raw text.")

    list_sentences = list()

    # Pre-processing: tokenization and sentence split using the selected base plugin.
    if self._tool_base == "stanza":
        doc_annotation = PluginManager().call_plugin_nlp(
            plugin_name="preprocessing",
            document=raw_text,
            pipeline={
                "lang": self._custom_pipeline["lang"],
                "tools": {"stanza": {"processors": ["tokenize"]}},
            },
        )

        # Join tokens.
        for sentence in doc_annotation.sentences:
            list_tokens = list()
            for token in sentence.tokens:
                list_tokens.append(token.text)
            list_sentences.append(" ".join(list_tokens))

    if self._tool_base == "stanfordcorenlp":
        doc_annotation = PluginManager().call_plugin_nlp(
            plugin_name="preprocessing",
            document=raw_text,
            pipeline={
                "lang": self._custom_pipeline["lang"],
                "tools": {"stanfordcorenlp": {"processors": ["ssplit"]}},
            },
        )

        for item in doc_annotation[0]["sentences"]:
            sentence = list()
            for token in item["tokens"]:
                sentence.append(token["word"])
            list_sentences.append(" ".join(sentence))

    if self._use_db is not None:
        # Insert the dataset into the database.
        self.ID_DATASET = PluginManager().call_plugin_db(
            plugin_name=self._use_db,
            operation="insert",
            collection="dataset",
            document={
                "name": "dataset_" + RandomObjectId().gen_random_object_id_string(),
                "data_time": OutputFormat().data_time(),
            },
        )

        # Insert the document(s) into the database.
        PluginManager().call_plugin_db(
            plugin_name=self._use_db,
            operation="insert",
            collection="document",
            document={
                "_id_dataset": self.ID_DATASET,
                "name": "document_" + RandomObjectId().gen_random_object_id_string(),
                "sentences": [sentence for sentence in list_sentences],
            },
        )

        return self.get_all_documents_database()
    # Not using a database.
    else:
        # Generate a document id.
        _id = RandomObjectId().gen_random_object_id()

        # Generate an id for the dataset.
        self.ID_DATASET = RandomObjectId().gen_random_object_id()

        # Generate a name for the document.
        name = "document_" + RandomObjectId().gen_random_object_id_string()

        document = {
            "_id": _id,
            "_id_dataset": self.ID_DATASET,
            "name": name,
            "sentences": list_sentences,
        }

        return [document]
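# For reference, a minimal sketch of the custom pipeline dict that
# prepare_raw_text() reads "lang" from; the shape mirrors the pipeline this
# method builds for the preprocessing plugin, and the processor names are
# assumptions for illustration.
custom_pipeline = {
    "lang": "en",
    "tools": {
        "stanza": {"processors": ["tokenize", "pos", "lemma"]},
    },
}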
async def plugins():
    return {"plugins": PluginManager().load_manifest()}
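# A minimal sketch of how this coroutine might be exposed over HTTP, assuming a
# FastAPI application object named `app` and the route path "/plugins" (both
# are assumptions, not confirmed by the source).
from fastapi import FastAPI

from deepnlpf.core.plugin_manager import PluginManager

app = FastAPI()


@app.get("/plugins")
async def plugins():
    return {"plugins": PluginManager().load_manifest()}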
def annotation():
    response = {
        'corpus': db.select_all(),
        'plugins': PluginManager().loadManifest(),
    }
    return jsonify(response)
def install_user_plugin(args):
    if args:
        from deepnlpf.core.plugin_manager import PluginManager

        PluginManager().install_user_plugin(args)
    else:
        print("Command format: deepnlpf --install_user_plugin <plugin_url>")