def copy_from_resource_to_training_resource(minio_client, resource_count=6):
    """Copy prepared corpus files from the resource bucket to the
    training-resource bucket.

    For each resource id in ``1..resource_count`` the file
    ``<id>/corpus.txt`` is downloaded from the RESOURCE_BUCKET to a local
    scratch path and re-uploaded under the same object name into the
    TRAINING_RESOURCE_BUCKET.

    :param minio_client: connected MinIO client used for both transfers
    :param resource_count: number of sequentially numbered resources to
        copy (default 6, matching the previous hard-coded behaviour)
    """
    for i in range(1, resource_count + 1):
        # Stage the object locally ...
        download_from_bucket(
            minio_client=minio_client,
            bucket=minio_buckets["RESOURCE_BUCKET"],
            filename="{}/corpus.txt".format(i),
            target_path="/{}.txt".format(i))
        # ... then push it into the training-resource bucket.
        upload_to_bucket(
            minio_client=minio_client,
            bucket=minio_buckets["TRAINING_RESOURCE_BUCKET"],
            filename="{}/corpus.txt".format(i),
            file_path="/{}.txt".format(i))
def get_vocabulary_of_training(project_uuid, training_version):  # noqa: E501
    """Get the entire vocabulary of the specified training

    Returns the entire vocabulary of the specified training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid:
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: str
    """
    current_user = connexion.context['token_info']['user']
    # Only the owner of the project may read its vocabulary.
    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()
    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()
    if db_training is None:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_BUCKET"],
        "{}/unique_word_list.txt".format(db_training.id))
    if not status:  # means no success
        # Report a proper 404 instead of silently returning an empty body,
        # consistent with get_corpus_of_training.
        return ("File not found", 404)
    return stream.read().decode('utf-8')
def get_corpus_of_training(project_uuid, training_version):  # noqa: E501
    """Get the entire corpus of the specified training

    Returns the entire corpus of the specified training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid:
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: str
    """
    current_user = connexion.context['token_info']['user']
    # Restrict the lookup to projects owned by the requesting user,
    # consistent with the other training endpoints (previously any
    # user's project could be read).
    db_project = DB_Project.query.filter_by(
        uuid=project_uuid, owner_id=current_user.id).first()
    if not db_project:
        return ("Project not found", 404)
    db_training = DB_Training.query.filter_by(
        version=training_version, project_id=db_project.id).first()
    if not db_training:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["TRAINING_BUCKET"],
        filename="{}/corpus.txt".format(db_training.id))
    if not status:  # means no success
        return ("File not found", 404)
    return stream.read().decode('utf-8')
def get_resource_data(resource_uuid):  # noqa: E501
    """Returns the resource content

    Returns the resource content # noqa: E501

    :param resource_uuid: UUID of resource to return
    :type resource_uuid: str

    :rtype: file
    """
    current_user = connexion.context['token_info']['user']
    # A resource is only visible to its owner.
    db_resource = DB_Resource.query.filter_by(
        uuid=resource_uuid, owner_id=current_user.id).first()
    if not db_resource:
        print('Resource {} in DB not found'.format(resource_uuid))
        return ("File not found", 404)

    success, data_stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["RESOURCE_BUCKET"],
        filename='{}/source'.format(db_resource.uuid))
    if success:
        # Stream the object straight through to the client under its
        # original name and MIME type.
        resp = Response(response=data_stream,
                        content_type=db_resource.mimetype(),
                        direct_passthrough=True)
        resp.headers['Content-Disposition'] = 'attachment; filename={}'.format(
            db_resource.name)
        return resp

    print('Resource {} in MinIO not found'.format(resource_uuid))
    return ("File not found", 404)
def get_audio_data(audio_uuid):  # noqa: E501
    """Returns the audio content

    Returns the audio content # noqa: E501

    :param audio_uuid: UUID of resource to return
    :type audio_uuid: str

    :rtype: file
    """
    current_user = connexion.context['token_info']['user']
    db_audio = DB_AudioResource.query.filter_by(uuid=audio_uuid).first()
    if not db_audio:
        return ("Audio not found", 404)

    success, audio_stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["DECODING_BUCKET"],
        filename=db_audio.uuid)
    if success:
        # Serve the WAV data as a download under the stored display name.
        resp = Response(response=audio_stream,
                        content_type="audio/wav",
                        direct_passthrough=True)
        resp.headers['Content-Disposition'] = 'attachment; filename={}'.format(
            db_audio.name)
        return resp

    print('audio {} in MinIO not found'.format(db_audio.uuid))
    return ("File not found", 404)
def download_acoustic_model(acoustic_model_uuid):  # noqa: E501
    """Returns the acoustic model

    Returns the model of the specified acoustic model # noqa: E501

    :param acoustic_model_uuid: UUID of the acoustic model
    :type acoustic_model_uuid: str

    :rtype: file
    """
    db_acoustic_model = DB_AcousticModel.query.filter_by(
        uuid=acoustic_model_uuid).first()
    if not db_acoustic_model:
        return ("Acousticmodel not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["ACOUSTIC_MODELS_BUCKET"],
        filename='{}/model.zip'.format(db_acoustic_model.id))
    if not status:  # means no success
        return ("File not found", 404)

    response = Response(response=stream,
                        content_type="application/zip",
                        direct_passthrough=True)
    # The payload is model.zip, so advertise it under that name.  The
    # previous filename=graph.zip was a copy-paste leftover from
    # download_model_for_training.
    response.headers['Content-Disposition'] = 'attachment; filename=model.zip'
    return response
def get_lexicon_of_training_resource(project_uuid, training_version, resource_uuid):  # noqa: E501
    """Get the lexicon of the resource

    Returns the lexicon of the specified resource for this training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid:
    :param training_version: Training version of the project
    :type training_version: int
    :param resource_uuid: UUID of the resource
    :type resource_uuid:

    :rtype: List[List[str]]
    """
    current_user = connexion.context['token_info']['user']
    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()
    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()
    if db_training is None:
        return ("Training not found", 404)

    db_resource = DB_Resource.query.filter(
        DB_Resource.uuid == resource_uuid).first()
    if db_resource is None:
        return ("Resource not found", 404)

    db_training_resource = DB_TrainingResource.query.filter_by(origin_id=db_resource.id) \
        .filter_by(training_id=db_training.id).first()
    if db_training_resource is None:
        return ("Resource not assigned to this Training", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_RESOURCE_BUCKET"],
        "{}/lexicon.txt".format(db_training_resource.id))
    # Check the download status *before* touching the stream: the previous
    # code called stream.read() unconditionally and only consulted `status`
    # afterwards, which crashed whenever the download had failed.
    if not status:
        return ("File not found", 404)

    # Each lexicon line is "<word><whitespace><pronunciation>"; split into
    # at most two fields so the pronunciation keeps its internal spaces.
    return [entry.split(maxsplit=1)
            for entry in stream.read().decode('utf-8').splitlines()]
def get_corpus_of_training_resource(project_uuid, training_version, resource_uuid):  # noqa: E501
    """Get the corpus of the resource

    Returns the corpus of the specified resource for this training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid:
    :param training_version: Training version of the project
    :type training_version: int
    :param resource_uuid: UUID of the resource
    :type resource_uuid:

    :rtype: str
    """
    current_user = connexion.context['token_info']['user']
    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()
    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()
    if db_training is None:
        return ("Training not found", 404)

    db_resource = DB_Resource.query.filter(
        DB_Resource.uuid == resource_uuid).first()
    if db_resource is None:
        return ("Resource not found", 404)

    db_training_resource = DB_TrainingResource.query.filter_by(origin_id=db_resource.id) \
        .filter_by(training_id=db_training.id).first()
    if db_training_resource is None:
        return ("Resource not assigned to this Training", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_RESOURCE_BUCKET"],
        "{}/corpus.txt".format(db_training_resource.id))
    if not status:  # means no success
        # Report 404 instead of silently returning an empty body,
        # consistent with get_corpus_of_training.
        return ("File not found", 404)
    return stream.read().decode('utf-8')
def get_lexicon_of_training(project_uuid, training_version):  # noqa: E501
    """Get the entire lexicon of the specified training

    Returns the entire lexicon of the specified training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid:
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: List[List[str]]
    """
    current_user = connexion.context['token_info']['user']
    # Restrict the lookup to projects owned by the requesting user,
    # consistent with the other training endpoints (previously any
    # user's project could be read).
    db_project = DB_Project.query.filter_by(
        uuid=project_uuid, owner_id=current_user.id).first()
    if not db_project:
        return ("Project not found", 404)
    db_training = DB_Training.query.filter_by(
        version=training_version, project_id=db_project.id).first()
    if not db_training:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["TRAINING_BUCKET"],
        filename="{}/lexicon.txt".format(db_training.id))
    if not status:  # means no success
        return ("File not found", 404)

    # Each lexicon line is "<word><whitespace><pronunciation>"; split into
    # at most two fields so the pronunciation keeps its internal spaces.
    return [entry.split(maxsplit=1)
            for entry in stream.read().decode('utf-8').splitlines()]
def download_model_for_training(project_uuid, training_version):  # noqa: E501
    """Returns the model

    Returns the model of the specified training # noqa: E501

    :param project_uuid: UUID of project
    :type project_uuid: str
    :param training_version: Version of training
    :type training_version: int

    :rtype: file
    """
    current_user = connexion.context['token_info']['user']
    # Restrict the lookup to projects owned by the requesting user,
    # consistent with the other training endpoints (previously any
    # user's trained model could be downloaded).
    db_project = DB_Project.query.filter_by(
        uuid=project_uuid, owner_id=current_user.id).first()
    if not db_project:
        return ("Project not found", 404)
    db_training = DB_Training.query.filter_by(
        version=training_version, project_id=db_project.id).first()
    if not db_training:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["TRAINING_BUCKET"],
        filename='{}/graph.zip'.format(db_training.id))
    if not status:  # means no success
        return ("File not found", 404)

    response = Response(response=stream,
                        content_type="application/zip",
                        direct_passthrough=True)
    response.headers['Content-Disposition'] = 'attachment; filename=graph.zip'
    return response
def get_training_stats(project_uuid, training_version):  # noqa: E501
    """Get Training Stats

    Returns the stats to be reviewed before training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid:
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: DataPrepStats
    """
    current_user = connexion.context['token_info']['user']
    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()
    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()
    if db_training is None:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_BUCKET"],
        "{}/stats.json".format(db_training.id))
    # Guard against a missing stats file: previously json.loads() was
    # called on the stream without checking the download status, which
    # crashed when data preparation had not produced stats.json yet.
    if not status:
        return ("File not found", 404)

    json_object = json.loads(stream.read().decode('utf-8'))
    return (DataPrepStats(
        unique_words_count=json_object["unique_words"],
        total_words_count=json_object["total_words_count"],
        lines_count=json_object["lines_count"],
        files_count=json_object["files_count"],
    ), 200)
def infinite_loop():
    """Data-preparation worker loop.

    Consumes tasks from the Data-Prep-Queue forever.  For each task it
    downloads the G2P model, the acoustic model's lexicon and all resource
    corpora from MinIO, merges the corpora, builds the word lists, runs the
    Phonetisaurus and uploads the resulting lexicon/corpus/stats artifacts.
    Progress and failures are reported on the status queue and mirrored
    into a per-task log file.

    BUG FIX: every failure path now `continue`s to the next task.  The
    previous code submitted a FAILURE status and then *fell through* into
    the next processing step, operating on incomplete data.
    """
    _, task_queue, status_queue, minio_client = parse_args(
        'Data-Preparation-Worker Connector', task_queue='Data-Prep-Queue')
    for data in task_queue.listen():
        print("Received the following task from Data-Prep-Queue: ")
        print(data)
        task = None
        try:
            print("Starting to process received data")
            task = DataPrepTask(**data)

            # NOTE(review): assumed to be closed by finish_logging — confirm.
            log_file_handler = open("/log.txt", "w")
            log_file_handler.write("Starting to process the received task \n")
            log_file_handler.write("{}\n".format(task))

            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.IN_PROGRESS,
                               training_id=task.training_id,
                               message="Task in progress"))
            print("All needed parameters are available. Processing continues.")
            print(task.resources)

            download_results = []
            log_file_handler.write("Starting to download all needed files. \n")

            # Step 1: Download all files which were created by the
            # Text-Preparation-Worker for this task.  In addition, download
            # the G2P-graph and lexicon.txt files from the acoustic-bucket.
            # Download of the graph
            download_results.append(
                download_from_bucket(
                    minio_client, minio_buckets["ACOUSTIC_MODELS_BUCKET"],
                    "{}/g2p_model.fst".format(task.acoustic_model_id),
                    "/data_prep_worker/in/g2p_model.fst"))
            # Download of the lexicon.txt file
            download_results.append(
                download_from_bucket(
                    minio_client, minio_buckets["ACOUSTIC_MODELS_BUCKET"],
                    "{}/lexicon.txt".format(task.acoustic_model_id),
                    "/data_prep_worker/in/lexicon.txt"))

            corpus_list = list()
            for resource in task.resources:
                # Download of all corpus files which were created within the TPW
                loc_corp_path = "/data_prep_worker/in/{}_corpus.txt".format(
                    resource)
                download_results.append(
                    download_from_bucket(
                        minio_client,
                        minio_buckets["TRAINING_RESOURCE_BUCKET"],
                        "{}/corpus.txt".format(resource), loc_corp_path))
                corpus_list.append(loc_corp_path)

            # If any download did not finish --> Set task status to: Failure.
            # Report once instead of once per failed download, then skip
            # to the next task.
            if not all(result[0] for result in download_results):
                log_file_handler.write(
                    "While the task was processed, the following error has occurred: \n"
                )
                log_file_handler.write(
                    "############################################################### \n"
                )
                log_file_handler.write(
                    "At least one download failed. Task failed!\n")
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                   training_id=task.training_id,
                                   message="Download failed"))
                continue

            log_file_handler.write(
                "All needed files were successfully downloaded. Processing continues \n"
            )

            # Step 2.1: Merge all corpus-files into one final corpus
            try:
                log_file_handler.write(
                    "Starting to merge all downloaded corpus-files. \n")
                corpus = merge_corpus_list(corpus_list, log_file_handler)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While all corpus files were merged into one, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "Either the given list is empty, or it was not possible to open a given corpus file. \n"
                )
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                   training_id=task.training_id,
                                   message="Corpus merge failed"))
                continue

            # Step 2.2: Save merged corpus file locally
            try:
                log_file_handler.write(
                    "Successfully merged all corpus-files. Continuing by saving the merged corpus locally \n"
                )
                save_txt_file("/data_prep_worker/out/corpus.txt", corpus)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While the merged corpus list was saved locally, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to save the file. \n")
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message="Saving merged corpus file locally failed"))
                continue

            log_file_handler.write(
                "Successfully saved the merged corpus file \n")

            # Step 3.1: Create the final_word_list, using the combined corpus
            try:
                log_file_handler.write(
                    "Processing continues. Next step is to create the final_word_list \n"
                )
                lexicon = create_unique_word_list(
                    "/data_prep_worker/out/corpus.txt")
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to create the final_word_list, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to open the corpus-file correctly. Therefore, it was not possible to create the final_word_list \n"
                )
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                   training_id=task.training_id,
                                   message="Final word list creation failed"))
                continue

            log_file_handler.write(
                "Successfully created the final_word_list. \n")

            # Step 3.2: Save the final_word_list locally
            try:
                log_file_handler.write(
                    "Saving the word list locally, before the processing continues. \n"
                )
                save_txt_file("/data_prep_worker/out/final_word_list", lexicon)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to save the final_word_list locally, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to save the file. \n")
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message="Saving unique word list locally failed"))
                continue

            log_file_handler.write(
                "Successfully saved the final_word_list. \n")

            # Step 3.3: Gather the stats needed by the frontend into a JSON file
            try:
                log_file_handler.write(
                    "Processing continues by collecting all needed stats for the Frontend! \n"
                )
                number_of_words, number_of_lines = gather_corpus_information(
                    "/data_prep_worker/out/corpus.txt")
                number_of_unique_words = len(lexicon)
                number_of_processed_corpus_files = len(corpus_list)
                save_json_file(number_of_words, number_of_lines,
                               number_of_unique_words,
                               number_of_processed_corpus_files)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to create the final_word_list, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to retrieve all needed information!")
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message=
                        "Failed to retrieve needed stats for the frontend"))
                continue

            log_file_handler.write(
                "Successfully retrieved all needed stats for the frontend! \n")

            # Step 4.1: Compare the final_word_list with the existing lexicon
            unique_word_list = compare_lexicon_with_word_list(
                final_word_list="/data_prep_worker/out/final_word_list",
                lexicon="/data_prep_worker/in/lexicon.txt")

            # Step 4.2: Save the unique_word_list locally
            save_txt_file(file_path="/data_prep_worker/out/unique_word_list",
                          content_list=unique_word_list)

            # Step 4.3: Run Phonetisaurus to create phones for the unique words
            try:
                log_file_handler.write(
                    "Processing continues by executing the Phonetisaurus which will create the lexicon-file for the Kaldi-Framework. \n"
                )
                execute_phonetisaurus()
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to execute the Phonetisaurus, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was either not possible to read the unique word list properly, or an error occured while executing the Phonetisaurus! \n"
                )
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message="Failing while executing the Phonetisaurus"))
                continue

            log_file_handler.write("Successfully created the lexicon-file. \n")
            print("Successfully created the lexicon-file")

            # Step 4.4: Append new word and phone list to the lexicon-file
            lexicon = combine_old_and_new_lexicon_files(
                old_lexicon="/data_prep_worker/in/lexicon.txt",
                new_lexicon="/data_prep_worker/out/lexicon.txt")

            # Step 4.5: Save final lexicon.txt file locally
            save_txt_file(file_path="/data_prep_worker/out/lexicon.txt",
                          content_list=lexicon)

            # Step 5: Upload lexicon.txt, corpus.txt, unique_word_list and stats
            log_file_handler.write(
                "Processing continues by uploading the created lexicon-file and merged corpus to their corresponding MinIO-bucket \n"
            )
            lexicon_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/lexicon.txt".format(task.training_id),
                "/data_prep_worker/out/lexicon.txt")
            corpus_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/corpus.txt".format(task.training_id),
                "/data_prep_worker/out/corpus.txt")
            unique_word_list_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/unique_word_list.txt".format(task.training_id),
                "/data_prep_worker/out/final_word_list")
            json_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/stats.json".format(task.training_id),
                "/data_prep_worker/out/stats.json")

            if not lexicon_result[0] or not corpus_result[
                    0] or not unique_word_list_result[0] or not json_result[0]:
                print(
                    "At least one upload failed. It is not possible to finish this task successfully."
                )
                log_file_handler.write(
                    "While trying to upload the lexicon.txt, corpus.txt, unique_word_list.txt and stats.json files, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to upload at least one file. Please check your internet connection. \n"
                )
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)
                status_queue.submit(
                    DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                   training_id=task.training_id,
                                   message="At least one upload failed"))
                continue

            log_file_handler.write(
                "Successfully uploaded lexicon.txt, corpus.txt, unique_word_list.txt and stats.json \n"
            )

            # Step 6: Delete all files which were downloaded or created
            remove_local_files("/data_prep_worker/in/")
            remove_local_files("/data_prep_worker/out/")
            finish_logging(log_file_handler=log_file_handler,
                           minio_client=minio_client,
                           task=task)

            # Step 7: Report success if this point is reached
            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.SUCCESS,
                               training_id=task.training_id,
                               message="Task finished successfully"))
        except Exception as e:
            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                               training_id=task.training_id if task else None,
                               message=str(e)))
            raise e
log_file_handler.write( "Workspace directory already exists. Processing continues. \n") # cache models when used once and reduce download try: log_file_handler.write( "Starting to download all needed files from the corresponding MinIO-bucket! \n" ) cur_acoustic_model_path = os.path.join( acoustic_model_folder, str(task.acoustic_model_id)) phone_symbol_table = os.path.join(cur_acoustic_model_path, "phones.txt") if task.acoustic_model_id not in downloaded_acoustic_models: os.makedirs(cur_acoustic_model_path) download_from_bucket( minio_client, acoustic_model_bucket, "{}/final.mdl".format(task.acoustic_model_id), os.path.join(cur_acoustic_model_path, "final.mdl")) download_from_bucket( minio_client, acoustic_model_bucket, "{}/tree".format(task.acoustic_model_id), os.path.join(cur_acoustic_model_path, "tree")) download_from_bucket( minio_client, acoustic_model_bucket, "{}/phones.txt".format(task.acoustic_model_id), phone_symbol_table) downloaded_acoustic_models.add(task.acoustic_model_id) # load resources download_from_bucket(minio_client, training_bucket, "{}/lexicon.txt".format(task.training_id),
log_file_handler.write( "Needed acoustic model directory exists. Processing continues! \n" ) # cache acoustic models when used once and reduce download try: log_file_handler.write( "Trying to download all needed acoustic model files! \n") cur_acoustic_model_path = os.path.join(acoustic_model_folder, str(acoustic_model_id)) ivector_extractor_path = os.path.join(cur_acoustic_model_path, "extractor") if (acoustic_model_id not in downloaded_acoustic_models): os.makedirs(cur_acoustic_model_path) download_from_bucket( minio_client, acoustic_model_bucket, acoustic_model_id + "/final.mdl", os.path.join(cur_acoustic_model_path, "final.mdl")) download_from_bucket( minio_client, acoustic_model_bucket, acoustic_model_id + "/tree", os.path.join(cur_acoustic_model_path, "tree")) download_from_bucket( minio_client, acoustic_model_bucket, acoustic_model_id + "/cmvn_opts", os.path.join(cur_acoustic_model_path, "cmvn_opts")) os.makedirs(ivector_extractor_path) download_from_bucket( minio_client, acoustic_model_bucket, acoustic_model_id + "/extractor/final.dubm", os.path.join(cur_acoustic_model_path,
def process_file(file_type, resource_uuid, minio_client, log_file_handler):
    '''
    This function is called, in order to open the received filename of the API.
    All files which need to be processed are saved within:
    /text-preparation/in/<resource_uuid>/source

    The following file types are supported:
        - PDF
        - Docx
        - HTML
        - txt
        - PNG or JPG

    :param file_type: lower-case file extension selecting the parser
    :param resource_uuid: UUID of the resource; names the bucket objects
    :param minio_client: connected MinIO client
    :param log_file_handler: open file handle receiving progress/error logs
    :returns: (success: bool, message: str) tuple, or the failing helper's
        result tuple when a bucket/transfer operation fails
    '''
    text_prep_input = "/text_prep_worker/in/"
    text_prep_output = "/text_prep_worker/out/"
    file_path = text_prep_input + resource_uuid

    # Step 1: Checks whether the requested bucket exist
    existance_result_in = does_bucket_exist(minio_client,
                                            minio_buckets["RESOURCE_BUCKET"])
    if not existance_result_in[0]:
        log_file_handler.write(
            "While processing the received task, the following error has occurred: \n"
        )
        log_file_handler.write(
            "###################################################################### \n"
        )
        log_file_handler.write(existance_result_in[1] + "\n")
        return existance_result_in
    log_file_handler.write(
        "A request was send towards the MinIO-server to check whether the {} bucket exist. Response was positive. Processing continues. \n"
        .format(minio_buckets["RESOURCE_BUCKET"]))

    # Step 2: Downloads the needed file which is located within the texts-in bucket
    download_result = download_from_bucket(minio_client,
                                           minio_buckets["RESOURCE_BUCKET"],
                                           resource_uuid + "/source",
                                           file_path)
    if not download_result[0]:
        log_file_handler.write(
            "While processing the received task, the following error has occurred: \n"
        )
        log_file_handler.write(
            "###################################################################### \n"
        )
        log_file_handler.write(download_result[1] + "\n")
        return download_result
    log_file_handler.write("Download of {}/{} was successfull \n".format(
        minio_buckets["RESOURCE_BUCKET"], resource_uuid + "/source"))

    # Step 3: Process the downloaded file.
    # After processing the corpus is ready.
    full_text = ""
    try:
        if file_type == "pdf":
            full_text = pdf_parser(file_path, log_file_handler)
        elif file_type == "docx":
            full_text = word_parser(file_path, log_file_handler)
        elif file_type == "html":
            full_text = html_parser(file_path, log_file_handler)
        elif file_type == "txt":
            full_text = text_parser(file_path, log_file_handler)
        elif file_type == "png" or file_type == "jpg":
            full_text = ocr_parser(file_path, log_file_handler)
        else:
            log_file_handler.write(
                "While trying to parse the downloaded file, the following error has occurred: \n"
            )
            log_file_handler.write(
                "###################################################################### \n"
            )
            log_file_handler.write(
                "The given file type is not supported. Task failed. \n")
            return (False, "Given file type is not supported. Task failed")
        log_file_handler.write("Parsing of the file finished successfully. \n")
        log_file_handler.write("Starting to create the corpus file. \n")
        # Generates the corpus
        corpus = generate_corpus(full_text)
        log_file_handler.write(
            "Creation of the corpus file finished successfully \n")
    except Exception as e:
        log_file_handler.write(
            "While processing the received task, the following error has occurred: \n"
        )
        log_file_handler.write(
            "###################################################################### \n"
        )
        log_file_handler.write(
            "Failed to parse the received file. Task failed. \n")
        print(e)
        return (False, "Failed to parse the received file")

    # Step 4: Save corpus locally
    #TODO: Save lexicon locally
    try:
        log_file_handler.write("Trying to save the corpus file locally \n")
        corpus_name = resource_uuid + "_corpus"
        save_textfile(corpus, corpus_name)
        log_file_handler.write("Successfully saved the corpus file \n")
    except Exception as e:
        # Catch a named Exception (not a bare `except:`) and surface it,
        # consistent with the Step 3 handler above.  A bare except would
        # also swallow SystemExit/KeyboardInterrupt.
        print(e)
        log_file_handler.write(
            "While processing the received task, the following error has occurred: \n"
        )
        log_file_handler.write(
            "###################################################################### \n"
        )
        log_file_handler.write("Failed to save the corpus file \n")
        return (False, "Failed to save the corpus file")

    # Step 5: Upload corpus in bucket
    #TODO: Upload lexicon in bucket
    log_file_handler.write(
        "Trying to upload the corpus file into its corresponding MinIO-bucket \n"
    )
    corpus_path = text_prep_output + corpus_name
    corpus_upload_result = upload_to_bucket(minio_client,
                                            minio_buckets["RESOURCE_BUCKET"],
                                            resource_uuid + "/corpus.txt",
                                            corpus_path)
    if not corpus_upload_result[0]:
        log_file_handler.write(
            "While processing the received task, the following error has occurred: \n"
        )
        log_file_handler.write(
            "###################################################################### \n"
        )
        log_file_handler.write(corpus_upload_result[1] + "\n")
        return corpus_upload_result
    log_file_handler.write(
        "Successfully uploaded the corpus file. Corpus is located within {} in the {} MinIO-bucket.\n"
        .format(resource_uuid + "/corpus.txt",
                minio_buckets["RESOURCE_BUCKET"]))

    # Step 6: Remove local files
    remove_local_files(text_prep_input)
    remove_local_files(text_prep_output)

    return (True, "The task was successfully processed.")