# Example 1
def copy_from_resource_to_training_resource(minio_client):
    """Copy corpus files 1..6 from the resource bucket to the training-resource bucket.

    Each remote object ``<i>/corpus.txt`` is first staged locally as
    ``/<i>.txt`` and then re-uploaded under the same remote name.

    :param minio_client: connected MinIO client used for both transfers
    """
    for index in range(1, 7):
        remote_name = "{}/corpus.txt".format(index)
        local_path = "/{}.txt".format(index)
        download_from_bucket(
            minio_client=minio_client,
            bucket=minio_buckets["RESOURCE_BUCKET"],
            filename=remote_name,
            target_path=local_path)
        upload_to_bucket(
            minio_client=minio_client,
            bucket=minio_buckets["TRAINING_RESOURCE_BUCKET"],
            filename=remote_name,
            file_path=local_path)
def get_vocabulary_of_training(project_uuid, training_version):  # noqa: E501
    """Get the entire vocabulary of the specified training

    Returns the entire vocabulary of the specified training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid: str
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: str
    """
    current_user = connexion.context['token_info']['user']

    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()

    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()

    if db_training is None:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_BUCKET"],
        "{}/unique_word_list.txt".format(db_training.id))

    # Consistent with the sibling endpoints in this module: a missing object
    # in the bucket is a 404, not an empty string the client cannot
    # distinguish from an empty vocabulary.
    if not status:
        return ("File not found", 404)

    return stream.read().decode('utf-8')
def get_corpus_of_training(project_uuid, training_version):  # noqa: E501
    """Get the entire corpus of the specified training

    Returns the entire corpus of the specified training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid: str
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: str
    """
    current_user = connexion.context['token_info']['user']

    # Restrict the lookup to projects owned by the requesting user — the
    # original query omitted this filter, unlike every sibling endpoint
    # (e.g. get_vocabulary_of_training), letting any authenticated user
    # read any project's corpus.
    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()

    if not db_project:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(
        version=training_version, project_id=db_project.id).first()

    if not db_training:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["TRAINING_BUCKET"],
        filename="{}/corpus.txt".format(db_training.id))

    if not status:  # means no success
        return ("File not found", 404)

    return stream.read().decode('utf-8')
# Example 4
def get_resource_data(resource_uuid):  # noqa: E501
    """Returns the resource content

    Returns the resource content # noqa: E501

    :param resource_uuid: UUID of resource to return
    :type resource_uuid: str

    :rtype: file
    """
    current_user = connexion.context['token_info']['user']

    # Only resources owned by the requesting user are visible.
    db_resource = DB_Resource.query.filter_by(
        uuid=resource_uuid, owner_id=current_user.id).first()

    if db_resource is None:
        print('Resource {} in DB not found'.format(resource_uuid))
        return ("File not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["RESOURCE_BUCKET"],
        filename='{}/source'.format(db_resource.uuid))

    if not status:
        print('Resource {} in MinIO not found'.format(resource_uuid))
        return ("File not found", 404)

    # Stream the object straight through to the client as an attachment.
    response = Response(response=stream,
                        content_type=db_resource.mimetype(),
                        direct_passthrough=True)
    disposition = 'attachment; filename={}'.format(db_resource.name)
    response.headers['Content-Disposition'] = disposition
    return response
# Example 5
def get_audio_data(audio_uuid):  # noqa: E501
    """Returns the audio content

    Returns the audio content # noqa: E501

    :param audio_uuid: UUID of resource to return
    :type audio_uuid: str

    :rtype: file
    """
    current_user = connexion.context['token_info']['user']

    # NOTE(review): current_user is fetched but the query below does not
    # filter by owner — confirm whether audio resources are meant to be
    # accessible to every authenticated user.
    db_audio = DB_AudioResource.query.filter_by(uuid=audio_uuid).first()

    if db_audio is None:
        return ("Audio not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["DECODING_BUCKET"],
        filename=db_audio.uuid)

    if not status:
        print('audio {} in MinIO not found'.format(db_audio.uuid))
        return ("File not found", 404)

    # Serve the object as a WAV attachment named after the DB record.
    response = Response(response=stream,
                        content_type="audio/wav",
                        direct_passthrough=True)
    disposition = 'attachment; filename={}'.format(db_audio.name)
    response.headers['Content-Disposition'] = disposition
    return response
# Example 6
def download_acoustic_model(acoustic_model_uuid):  # noqa: E501
    """Returns the acoustic model

    Returns the model of the specified acoustic model # noqa: E501

    :param acoustic_model_uuid: UUID of the acoustic model
    :type acoustic_model_uuid: str

    :rtype: file
    """
    db_acoustic_model = DB_AcousticModel.query.filter_by(
        uuid=acoustic_model_uuid).first()

    if not db_acoustic_model:
        return ("Acousticmodel not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["ACOUSTIC_MODELS_BUCKET"],
        filename='{}/model.zip'.format(db_acoustic_model.id))

    if not status:  # means no success
        return ("File not found", 404)

    response = Response(response=stream,
                        content_type="application/zip",
                        direct_passthrough=True)
    # Fix: the object served is model.zip, but the attachment was named
    # graph.zip (copy-paste from download_model_for_training).
    response.headers['Content-Disposition'] = 'attachment; filename=model.zip'
    return response
def get_lexicon_of_training_resource(project_uuid, training_version,
                                     resource_uuid):  # noqa: E501
    """Get the lexicon of the resource

    Returns the lexicon of the specified resource for this training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid: str
    :param training_version: Training version of the project
    :type training_version: int
    :param resource_uuid: UUID of the resource
    :type resource_uuid: str

    :rtype: List[List[str]]
    """
    current_user = connexion.context['token_info']['user']

    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()

    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()

    if db_training is None:
        return ("Training not found", 404)

    db_resource = DB_Resource.query.filter(
        DB_Resource.uuid == resource_uuid).first()

    if db_resource is None:
        return ("Resource not found", 404)

    db_training_resource = DB_TrainingResource.query.filter_by(origin_id=db_resource.id) \
        .filter_by(training_id=db_training.id).first()

    if db_training_resource is None:
        return ("Resource not assigned to this Training", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_RESOURCE_BUCKET"],
        "{}/lexicon.txt".format(db_training_resource.id))

    # Fix: check the download result BEFORE touching the stream — the
    # original read the stream unconditionally and only consulted `status`
    # afterwards, crashing whenever the object was missing.
    if not status:
        return ("File not found", 404)

    # Each lexicon line is "<word>\t<phones...>"; split at most once so the
    # phone sequence survives intact as the second element.
    return [entry.split(maxsplit=1)
            for entry in stream.read().decode('utf-8').splitlines()]
def get_corpus_of_training_resource(project_uuid, training_version,
                                    resource_uuid):  # noqa: E501
    """Get the corpus of the resource

    Returns the corpus of the specified resource for this training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid: str
    :param training_version: Training version of the project
    :type training_version: int
    :param resource_uuid: UUID of the resource
    :type resource_uuid: str

    :rtype: str
    """
    current_user = connexion.context['token_info']['user']

    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()

    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()

    if db_training is None:
        return ("Training not found", 404)

    db_resource = DB_Resource.query.filter(
        DB_Resource.uuid == resource_uuid).first()

    if db_resource is None:
        return ("Resource not found", 404)

    db_training_resource = DB_TrainingResource.query.filter_by(origin_id=db_resource.id) \
        .filter_by(training_id=db_training.id).first()

    if db_training_resource is None:
        return ("Resource not assigned to this Training", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_RESOURCE_BUCKET"],
        "{}/corpus.txt".format(db_training_resource.id))

    # Fix: removed leftover debug print; report a missing bucket object as
    # 404 like get_corpus_of_training instead of an indistinguishable "".
    if not status:
        return ("File not found", 404)

    return stream.read().decode('utf-8')
def get_lexicon_of_training(project_uuid, training_version):  # noqa: E501
    """Get the entire lexicon of the specified training

    Returns the entire lexicon of the specified training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid:
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: List[List[str]]
    """

    current_user = connexion.context['token_info']['user']

    db_project = DB_Project.query.filter_by(uuid=project_uuid).first()

    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(
        version=training_version, project_id=db_project.id).first()

    if db_training is None:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["TRAINING_BUCKET"],
        filename="{}/lexicon.txt".format(db_training.id))

    if not status:
        return ("File not found", 404)

    # One lexicon entry per line: split at most once so "<word>" and its
    # whole phone sequence form a two-element list.
    lines = stream.read().decode('utf-8').splitlines()
    return [line.split(maxsplit=1) for line in lines]
def download_model_for_training(project_uuid, training_version):  # noqa: E501
    """Returns the model

    Returns the model of the specified training # noqa: E501

    :param project_uuid: UUID of project
    :type project_uuid: str
    :param training_version: Version of training
    :type training_version: int

    :rtype: file
    """
    current_user = connexion.context['token_info']['user']

    db_project = DB_Project.query.filter_by(uuid=project_uuid).first()

    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(
        version=training_version, project_id=db_project.id).first()

    if db_training is None:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client,
        bucket=minio_buckets["TRAINING_BUCKET"],
        filename='{}/graph.zip'.format(db_training.id))

    if not status:
        return ("File not found", 404)

    # Serve the trained graph archive as a zip attachment.
    response = Response(response=stream,
                        content_type="application/zip",
                        direct_passthrough=True)
    response.headers['Content-Disposition'] = 'attachment; filename=graph.zip'
    return response
# Example 11
def get_training_stats(project_uuid, training_version):  # noqa: E501
    """Get Training Stats

    Returns the stats to be reviewed before training # noqa: E501

    :param project_uuid: UUID of the project
    :type project_uuid: str
    :param training_version: Training version of the project
    :type training_version: int

    :rtype: DataPrepStats
    """
    current_user = connexion.context['token_info']['user']

    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()

    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()

    if db_training is None:
        return ("Training not found", 404)

    status, stream = download_from_bucket(
        minio_client, minio_buckets["TRAINING_BUCKET"],
        "{}/stats.json".format(db_training.id))

    # Fix: the original parsed the stream without checking `status`, crashing
    # whenever stats.json had not been produced yet.
    if not status:
        return ("File not found", 404)

    json_object = json.loads(stream.read().decode('utf-8'))
    # NOTE(review): "unique_words" deviates from the *_count naming of the
    # other keys — confirm against the worker that writes stats.json.
    return (DataPrepStats(
        unique_words_count=json_object["unique_words"],
        total_words_count=json_object["total_words_count"],
        lines_count=json_object["lines_count"],
        files_count=json_object["files_count"],
    ), 200)
# Example 12
def infinite_loop():
    """Blocking worker loop: consume data-prep tasks from the queue forever.

    For each task: download the G2P model, lexicon and per-resource corpora,
    merge the corpora, build word lists, run the Phonetisaurus, and upload
    the resulting lexicon/corpus/word-list/stats files to the training
    bucket, reporting progress on the status queue throughout.
    """
    _, task_queue, status_queue, minio_client = parse_args(
        'Data-Preparation-Worker Connector', task_queue='Data-Prep-Queue')

    # listen() blocks and yields one task payload at a time.
    for data in task_queue.listen():
        print("Received the following task from Data-Prep-Queue: ")
        print(data)

        task = None

        try:
            print("Starting to process received data")
            task = DataPrepTask(**data)

            # NOTE(review): opened without `with`; presumably finish_logging
            # closes/uploads this handle — confirm, otherwise it leaks on
            # the success path exceptions.
            log_file_handler = open("/log.txt", "w")
            log_file_handler.write("Starting to process the received task \n")
            log_file_handler.write("{}\n".format(task))

            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.IN_PROGRESS,
                               training_id=task.training_id,
                               message="Task in progress"))

            print("All needed parameters are available. Processing continues.")
            print(task.resources)

            download_results = []

            log_file_handler.write("Starting to download all needed files. \n")
            # Step 1: Download all files which were created by the Text-Preparation-Worker for this task.
            #         In addition to that, download the G2P-graph and lexicon.txt files from the acoustic-bucket:

            # Download of the graph
            download_results.append(
                download_from_bucket(
                    minio_client, minio_buckets["ACOUSTIC_MODELS_BUCKET"],
                    "{}/g2p_model.fst".format(task.acoustic_model_id),
                    "/data_prep_worker/in/g2p_model.fst"))
            # Download of the lexicon.txt file
            download_results.append(
                download_from_bucket(
                    minio_client, minio_buckets["ACOUSTIC_MODELS_BUCKET"],
                    "{}/lexicon.txt".format(task.acoustic_model_id),
                    "/data_prep_worker/in/lexicon.txt"))
            corpus_list = list()
            for resource in task.resources:
                # Download of all corpus files which were created within the TPW
                loc_corp_path = "/data_prep_worker/in/{}_corpus.txt".format(
                    resource)
                download_results.append(
                    download_from_bucket(
                        minio_client,
                        minio_buckets["TRAINING_RESOURCE_BUCKET"],
                        "{}/corpus.txt".format(resource), loc_corp_path))
                corpus_list.append(loc_corp_path)

            # If any download did not finish --> Set task status to: Failure
            for download in download_results:
                if not download[0]:
                    log_file_handler.write(
                        "While the task was processed, the following error has occurred: \n"
                    )
                    log_file_handler.write(
                        "############################################################### \n"
                    )
                    log_file_handler.write(
                        "At least one download failed. Task failed!\n")

                    finish_logging(log_file_handler=log_file_handler,
                                   minio_client=minio_client,
                                   task=task)

                    # NOTE(review): no `continue` here — after reporting
                    # FAILURE (possibly once per failed download) the loop
                    # body keeps processing the task anyway. Compare the
                    # upload-failure branch below, which does `continue`.
                    status_queue.submit(
                        DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                       training_id=task.training_id,
                                       message="Download failed"))
            log_file_handler.write(
                "All needed files were successfully downloaded. Processing continues \n"
            )

            # Step 2.1: Merge all corpus-files into one final corpus and save the file locally
            try:
                log_file_handler.write(
                    "Starting to merge all downloaded corpus-files. \n")
                corpus = merge_corpus_list(corpus_list, log_file_handler)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While all corpus files were merged into one, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "Either the given list is empty, or it was not possible to open a given corpus file. \n"
                )

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                # NOTE(review): falls through after FAILURE — Step 2.2 then
                # references `corpus`, which is undefined when the merge
                # raised; a `continue` is probably intended here.
                status_queue.submit(
                    DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                   training_id=task.training_id,
                                   message="Corpus merge failed"))

            # Step 2.2: Save merged corpus file locally
            try:
                log_file_handler.write(
                    "Successfully merged all corpus-files. Continuing by saving the merged corpus locally \n"
                )
                save_txt_file("/data_prep_worker/out/corpus.txt", corpus)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While the merged corpus list was saved locally, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to save the file. \n")

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                # NOTE(review): same fall-through pattern as above.
                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message="Saving merged corpus file locally failed"))
            log_file_handler.write(
                "Successfully saved the merged corpus file \n")

            # Step 3.1: Create the final_word_list, using the combined corpus
            try:
                log_file_handler.write(
                    "Processing continues. Next step is to create the final_word_list \n"
                )
                lexicon = create_unique_word_list(
                    "/data_prep_worker/out/corpus.txt")
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to create the final_word_list, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to open the corpus-file correctly. Therefore, it was not possible to create the final_word_list \n"
                )

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                status_queue.submit(
                    DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                   training_id=task.training_id,
                                   message="Final word list creation failed"))
            log_file_handler.write(
                "Successfully created the final_word_list. \n")

            # Step 3.2: Save the final_word_list locally
            try:
                log_file_handler.write(
                    "Saving the word list locally, before the processing continues. \n"
                )
                save_txt_file("/data_prep_worker/out/final_word_list", lexicon)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to save the final_word_list locally, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to save the file. \n")

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message="Saving unique word list locally failed"))
            log_file_handler.write(
                "Successfully saved the final_word_list. \n")

            # Step 3.3: Gather all needed stats which are needed for the frontend and create a JSON-file
            try:
                log_file_handler.write(
                    "Processing continues by collecting all needed stats for the Frontend! \n"
                )
                number_of_words, number_of_lines = gather_corpus_information(
                    "/data_prep_worker/out/corpus.txt")
                number_of_unique_words = len(lexicon)
                number_of_processed_corpus_files = len(corpus_list)
                save_json_file(number_of_words, number_of_lines,
                               number_of_unique_words,
                               number_of_processed_corpus_files)
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to create the final_word_list, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to retrieve all needed information!")

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message=
                        "Failed to retrieve needed stats for the frontend"))
            log_file_handler.write(
                "Successfully retrieved all needed stats for the frontend! \n")

            # Step 4.1: Compare the final_word_list with the existing lexicon.txt file
            unique_word_list = compare_lexicon_with_word_list(
                final_word_list="/data_prep_worker/out/final_word_list",
                lexicon="/data_prep_worker/in/lexicon.txt")

            # Step 4.2: Save the unique_word_list locally
            save_txt_file(file_path="/data_prep_worker/out/unique_word_list",
                          content_list=unique_word_list)

            # Step 4.3: Execute Phonetisaurus and create phones for the unique_word_list
            try:
                log_file_handler.write(
                    "Processing continues by executing the Phonetisaurus which will create the lexicon-file for the Kaldi-Framework. \n"
                )
                execute_phonetisaurus()
            except Exception as e:
                print(e)
                log_file_handler.write(
                    "While trying to execute the Phonetisaurus, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was either not possible to read the unique word list properly, or an error occured while executing the Phonetisaurus! \n"
                )

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                status_queue.submit(
                    DataPrepStatus(
                        id=DataPrepStatusCode.FAILURE,
                        training_id=task.training_id,
                        message="Failing while executing the Phonetisaurus"))
            log_file_handler.write("Successfully created the lexicon-file. \n")
            print("Successfully created the lexicon-file")

            # Step 4.4: Append new word and phone list to the lexicon-file
            lexicon = combine_old_and_new_lexicon_files(
                old_lexicon="/data_prep_worker/in/lexicon.txt",
                new_lexicon="/data_prep_worker/out/lexicon.txt")

            # Step 4.5: Save final lexicon.txt file locally
            save_txt_file(file_path="/data_prep_worker/out/lexicon.txt",
                          content_list=lexicon)

            # Step 5: Upload of lexicon.txt, corpus.txt and unique_word_list files
            log_file_handler.write(
                "Processing continues by uploading the created lexicon-file and merged corpus to their corresponding MinIO-bucket \n"
            )
            lexicon_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/lexicon.txt".format(task.training_id),
                "/data_prep_worker/out/lexicon.txt")

            corpus_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/corpus.txt".format(task.training_id),
                "/data_prep_worker/out/corpus.txt")

            unique_word_list_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/unique_word_list.txt".format(task.training_id),
                "/data_prep_worker/out/final_word_list")

            json_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/stats.json".format(task.training_id),
                "/data_prep_worker/out/stats.json")

            # Any failed upload aborts the task (this branch, unlike the
            # earlier failure branches, correctly skips to the next task).
            if not lexicon_result[0] or not corpus_result[
                    0] or not unique_word_list_result[0] or not json_result[0]:
                print(
                    "At least one upload failed. It is not possible to finish this task successfully."
                )
                log_file_handler.write(
                    "While trying to upload the lexicon.txt, corpus.txt, unique_word_list.txt and stats.json files, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "It was not possible to upload at least one file. Please check your internet connection. \n"
                )

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                status_queue.submit(
                    DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                                   training_id=task.training_id,
                                   message="At least one upload failed"))
                continue

            log_file_handler.write(
                "Successfully uploaded lexicon.txt, corpus.txt, unique_word_list.txt and stats.json \n"
            )

            # Step 6: Delete all files which were downloaded or created for this task
            remove_local_files("/data_prep_worker/in/")
            remove_local_files("/data_prep_worker/out/")

            finish_logging(log_file_handler=log_file_handler,
                           minio_client=minio_client,
                           task=task)

            # Step 7: Update status queue to: Successfull if this point is reached
            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.SUCCESS,
                               training_id=task.training_id,
                               message="Task finished successfully"))
        except Exception as e:
            # Last-resort guard: report the failure (training_id may be
            # unknown if DataPrepTask construction itself failed), then
            # re-raise — which terminates the worker loop.
            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                               training_id=task.training_id if task else None,
                               message=str(e)))
            raise e
# Example 13
            log_file_handler.write(
                "Workspace directory already exists. Processing continues. \n")

            # cache models when used once and reduce download
            try:
                log_file_handler.write(
                    "Starting to download all needed files from the corresponding MinIO-bucket! \n"
                )
                cur_acoustic_model_path = os.path.join(
                    acoustic_model_folder, str(task.acoustic_model_id))
                phone_symbol_table = os.path.join(cur_acoustic_model_path,
                                                  "phones.txt")
                if task.acoustic_model_id not in downloaded_acoustic_models:
                    os.makedirs(cur_acoustic_model_path)
                    download_from_bucket(
                        minio_client, acoustic_model_bucket,
                        "{}/final.mdl".format(task.acoustic_model_id),
                        os.path.join(cur_acoustic_model_path, "final.mdl"))
                    download_from_bucket(
                        minio_client, acoustic_model_bucket,
                        "{}/tree".format(task.acoustic_model_id),
                        os.path.join(cur_acoustic_model_path, "tree"))
                    download_from_bucket(
                        minio_client, acoustic_model_bucket,
                        "{}/phones.txt".format(task.acoustic_model_id),
                        phone_symbol_table)

                    downloaded_acoustic_models.add(task.acoustic_model_id)

                # load resources
                download_from_bucket(minio_client, training_bucket,
                                     "{}/lexicon.txt".format(task.training_id),
# Example 14
            log_file_handler.write(
                "Needed acoustic model directory exists. Processing continues! \n"
            )

            # cache acoustic models when used once and reduce download
            try:
                log_file_handler.write(
                    "Trying to download all needed acoustic model files! \n")
                cur_acoustic_model_path = os.path.join(acoustic_model_folder,
                                                       str(acoustic_model_id))
                ivector_extractor_path = os.path.join(cur_acoustic_model_path,
                                                      "extractor")
                if (acoustic_model_id not in downloaded_acoustic_models):
                    os.makedirs(cur_acoustic_model_path)
                    download_from_bucket(
                        minio_client, acoustic_model_bucket,
                        acoustic_model_id + "/final.mdl",
                        os.path.join(cur_acoustic_model_path, "final.mdl"))
                    download_from_bucket(
                        minio_client, acoustic_model_bucket,
                        acoustic_model_id + "/tree",
                        os.path.join(cur_acoustic_model_path, "tree"))
                    download_from_bucket(
                        minio_client, acoustic_model_bucket,
                        acoustic_model_id + "/cmvn_opts",
                        os.path.join(cur_acoustic_model_path, "cmvn_opts"))

                    os.makedirs(ivector_extractor_path)
                    download_from_bucket(
                        minio_client, acoustic_model_bucket,
                        acoustic_model_id + "/extractor/final.dubm",
                        os.path.join(cur_acoustic_model_path,
def _log_error(log_file_handler, detail,
               intro="While processing the received task, the following error has occurred: \n"):
    '''
    Write the standard three-part error banner to the task log:
    an intro line, a separator line, then the specific error detail.
    '''
    log_file_handler.write(intro)
    log_file_handler.write(
        "###################################################################### \n"
    )
    log_file_handler.write(detail)


def process_file(file_type, resource_uuid, minio_client, log_file_handler):
    '''
    Download <resource_uuid>/source from the RESOURCE_BUCKET, parse it
    according to file_type, generate a corpus from the extracted text and
    upload it back to the bucket as <resource_uuid>/corpus.txt.

    The downloaded file is stored locally under:
        /text_prep_worker/in/<resource_uuid>
    The following file types are supported:
        - PDF
        - Docx
        - HTML
        - txt
        - PNG or JPG

    :param file_type: lower-case file extension of the source file
    :param resource_uuid: UUID identifying the resource in the bucket
    :param minio_client: connected MinIO client used for all bucket I/O
    :param log_file_handler: writable file-like object receiving progress logs
    :return: (success: bool, message: str) tuple — the same shape the
             bucket helper functions return, so their results can be
             passed through unchanged on failure.
    '''
    text_prep_input = "/text_prep_worker/in/"
    text_prep_output = "/text_prep_worker/out/"
    file_path = text_prep_input + resource_uuid

    # Step 1: Checks whether the requested bucket exist
    existance_result_in = does_bucket_exist(minio_client,
                                            minio_buckets["RESOURCE_BUCKET"])
    if not existance_result_in[0]:
        _log_error(log_file_handler, existance_result_in[1] + "\n")
        return existance_result_in

    log_file_handler.write(
        "A request was send towards the MinIO-server to check whether the {} bucket exist. Response was positive. Processing continues. \n"
        .format(minio_buckets["RESOURCE_BUCKET"]))

    # Step 2: Downloads the needed file which is located within the texts-in bucket
    download_result = download_from_bucket(minio_client,
                                           minio_buckets["RESOURCE_BUCKET"],
                                           resource_uuid + "/source",
                                           file_path)
    if not download_result[0]:
        _log_error(log_file_handler, download_result[1] + "\n")
        return download_result

    log_file_handler.write("Download of {}/{} was successfull \n".format(
        minio_buckets["RESOURCE_BUCKET"], resource_uuid + "/source"))

    # Step 3: Process the downloaded file
    #         After processing the corpus is ready
    # One parser per supported file type; png and jpg share the OCR parser.
    parsers = {
        "pdf": pdf_parser,
        "docx": word_parser,
        "html": html_parser,
        "txt": text_parser,
        "png": ocr_parser,
        "jpg": ocr_parser,
    }
    try:
        parser = parsers.get(file_type)
        if parser is None:
            _log_error(
                log_file_handler,
                "The given file type is not supported. Task failed. \n",
                intro="While trying to parse the downloaded file, the following error has occurred: \n")
            return (False, "Given file type is not supported. Task failed")

        full_text = parser(file_path, log_file_handler)

        log_file_handler.write("Parsing of the file finished successfully. \n")
        log_file_handler.write("Starting to create the corpus file. \n")
        # Generates the corpus
        corpus = generate_corpus(full_text)
        log_file_handler.write(
            "Creation of the corpus file finished successfully \n")

    except Exception as e:
        _log_error(log_file_handler,
                   "Failed to parse the received file. Task failed. \n")
        print(e)
        return (False, "Failed to parse the received file")

    # Step 4: Save corpus locally
    #TODO: Save lexicon locally

    try:
        log_file_handler.write("Trying to save the corpus file locally \n")
        corpus_name = resource_uuid + "_corpus"
        save_textfile(corpus, corpus_name)
        log_file_handler.write("Successfully saved the corpus file \n")
    except Exception as e:
        # Was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt and discarded the cause; narrowed to Exception
        # and the cause is now surfaced like in the parsing branch above.
        _log_error(log_file_handler, "Failed to save the corpus file \n")
        print(e)
        return (False, "Failed to save the corpus file")

    # Step 5: Upload corpus in bucket
    #TODO: Upload lexicon in bucket

    log_file_handler.write(
        "Trying to upload the corpus file into its corresponding MinIO-bucket \n"
    )
    corpus_path = text_prep_output + corpus_name
    corpus_upload_result = upload_to_bucket(minio_client,
                                            minio_buckets["RESOURCE_BUCKET"],
                                            resource_uuid + "/corpus.txt",
                                            corpus_path)

    if not corpus_upload_result[0]:
        _log_error(log_file_handler, corpus_upload_result[1] + "\n")
        return corpus_upload_result

    log_file_handler.write(
        "Successfully uploaded the corpus file. Corpus is located within {} in the {} MinIO-bucket.\n"
        .format(resource_uuid + "/corpus.txt",
                minio_buckets["RESOURCE_BUCKET"]))

    # Step 6: Remove local files
    remove_local_files(text_prep_input)
    remove_local_files(text_prep_output)

    return (True, "The task was successfully processed.")