コード例 #1
0
def set_corpus_of_training_resource(project_uuid, training_version,
                                    resource_uuid, body):  # noqa: E501
    """Set the corpus of the resource

    Updates the corpus of the specified resource for this training # noqa: E501

    The project is looked up by UUID among the projects owned by the current
    user, then the training by version, then the resource by UUID and finally
    the training resource that links both. The corpus is written to a
    temporary file and uploaded as ``<training_resource_id>/corpus.txt`` into
    the training resource bucket.

    :param project_uuid: UUID of the project
    :param training_version: Training version of the project
    :type training_version: int
    :param resource_uuid: UUID of the resource
    :param body: New or updated corpus as plain text
    :type body: str

    :return: ``(message, http_status)``; 404 if the project, training,
        resource or assignment is missing, 409 if the training is no longer
        editable, 200 otherwise
    :rtype: tuple
    """
    current_user = connexion.context['token_info']['user']

    db_project = DB_Project.query.filter_by(uuid=project_uuid,
                                            owner_id=current_user.id).first()

    if db_project is None:
        return ("Project not found", 404)

    db_training = DB_Training.query.filter_by(version=training_version) \
        .filter_by(project_id=db_project.id).first()

    if db_training is None:
        return ("Training not found", 404)

    # The corpus may only be replaced while the training has not started yet
    if db_training.status not in (DB_TrainingStateEnum.Init,
                                  DB_TrainingStateEnum.Trainable,
                                  DB_TrainingStateEnum.TextPrep_Pending,
                                  DB_TrainingStateEnum.TextPrep_Failure):
        return ("Training already started or done", 409)

    db_resource = DB_Resource.query.filter(
        DB_Resource.uuid == resource_uuid).first()

    if db_resource is None:
        return ("Resource not found", 404)

    db_training_resource = DB_TrainingResource.query.filter_by(origin_id=db_resource.id) \
        .filter_by(training_id=db_training.id).first()

    if db_training_resource is None:
        return ("Resource not assigned to this Training", 404)

    # NamedTemporaryFile is opened in binary mode by default, so a text body
    # has to be encoded before it can be written.
    payload = body.encode("utf-8") if isinstance(body, str) else body

    # The context manager closes (and thereby deletes) the temporary file
    # even if the upload raises.
    with tempfile.NamedTemporaryFile() as corpus_file:
        corpus_file.write(payload)
        corpus_file.flush()

        # NOTE(review): the result of upload_to_bucket is not inspected here,
        # so "Success" is returned even if the upload reports a failure --
        # confirm whether that should map to an error response.
        upload_to_bucket(minio_client,
                         minio_buckets["TRAINING_RESOURCE_BUCKET"],
                         str(db_training_resource.id) + "/corpus.txt",
                         corpus_file.name)

    return ("Success", 200)
コード例 #2
0
def copy_from_resource_to_training_resource(minio_client):
    """Copy the corpus files of resources 1 to 6 between buckets.

    Each ``<n>/corpus.txt`` object is downloaded from the resource bucket to
    the local file ``/<n>.txt`` and then uploaded under the same object name
    into the training resource bucket.

    :param minio_client: client passed through to the bucket helpers
    """
    for resource_number in range(1, 7):
        object_name = "{}/corpus.txt".format(resource_number)
        local_copy = "/{}.txt".format(resource_number)

        download_from_bucket(minio_client=minio_client,
                             bucket=minio_buckets["RESOURCE_BUCKET"],
                             filename=object_name,
                             target_path=local_copy)

        upload_to_bucket(minio_client=minio_client,
                         bucket=minio_buckets["TRAINING_RESOURCE_BUCKET"],
                         filename=object_name,
                         file_path=local_copy)
コード例 #3
0
def finish_logging(log_file_handler, minio_client, task):
    """Close the task log file and upload it to the log bucket.

    The local file ``/log.txt`` is uploaded as
    ``kaldi_worker/<training_id>/log.txt``. The outcome is only reported on
    stdout; nothing is returned or raised for a failed upload.

    :param log_file_handler: open log file object; it is closed here, so it
        must not be written to afterwards
    :param minio_client: client passed through to ``upload_to_bucket``
    :param task: task whose ``training_id`` names the log object
    """
    log_file_handler.close()
    logfile_result = upload_to_bucket(
        minio_client, minio_buckets["LOG_BUCKET"],
        "kaldi_worker/{}/{}".format(task.training_id, "log.txt"), "/log.txt")

    # Report exactly one outcome; the first tuple element is the success flag
    if logfile_result[0]:
        print("Logfile was successfully uploaded")
    else:
        print("An error occurred during the upload of the logfile.")
コード例 #4
0
def upload_audio(upfile):  # noqa: E501
    """Uploads audio

     # noqa: E501

    A database entry is created for the audio file, the upload is cached in
    ``TEMP_UPLOAD_FOLDER`` under the entry's UUID, pushed to the decoding
    bucket under the same name and the local copy is removed again.

    :param upfile: File object that needs to be uploaded

    :return: ``(payload, http_status)``; ``('Invalid input', 405)`` when no
        file was sent or its type is not recognised, otherwise the mapped
        audio resource with status 201
    :rtype: tuple
    """
    # A submitted form without a selected file arrives without a file object
    if upfile is None:
        return ('Invalid input', 405)

    filename = secure_filename(upfile.filename)
    filetype = get_filetype(filename)

    if filetype is None:
        return ('Invalid input', 405)

    db_audioresource = DB_AudioResource(name=filename)
    db.session.add(db_audioresource)
    db.session.commit()

    # cache file in local file system, then upload to MinIO
    os.makedirs(TEMP_UPLOAD_FOLDER, exist_ok=True)

    local_file_path = os.path.join(TEMP_UPLOAD_FOLDER,
                                   str(db_audioresource.uuid))
    minio_file_path = str(db_audioresource.uuid)

    try:
        upfile.save(local_file_path)

        upload_result = upload_to_bucket(
            minio_client=minio_client,
            bucket=minio_buckets["DECODING_BUCKET"],
            filename=minio_file_path,
            file_path=local_file_path)
    finally:
        # The local copy is only a cache; drop it whether or not the upload
        # worked so the temp folder does not grow without bound.
        if os.path.exists(local_file_path):
            os.remove(local_file_path)

    if upload_result[0]:
        # TODO WRONG STATUS UNTIL AUDIO PREP WORKFLOW EXISTS
        db_audioresource.status = DB_AudioStateEnum.AudioPrep_Success
    else:
        db_audioresource.status = DB_AudioStateEnum.AudioPrep_Failure

    db.session.add(db_audioresource)
    db.session.commit()

    print('Uploaded audio file to MinIO: ' + str(db_audioresource))
    front_audio = mapper.db_audio_to_front(db_audioresource)
    print(front_audio)
    return (front_audio, 201)
コード例 #5
0
def test_text_prep(redis_client, minio_client):
    """End-to-end check of the text preparation worker.

    Uploads one sample file per supported type into the resource bucket,
    queues one task per file on ``Text-Prep-Queue`` and then follows
    ``Status-Queue`` until every task reported success (exit code 0) or any
    task reported something other than "in progress"/"success"
    (exit code 999).

    :param redis_client: Redis client used for the task queue and the status
        channel
    :param minio_client: MinIO client used to create buckets and upload the
        sample files
    """
    # (resource_uuid, file_type, local sample file)
    test_files = (
        ("1", "txt", "test-files/txt/kafkatxt"),
        ("2", "pdf", "test-files/pdf/kafkapdf"),
        ("3", "docx", "test-files/word/kafkadocx"),
        ("4", "html", "test-files/html/gamestarhtml"),
        ("5", "jpg", "test-files/jpg/dokumentjpg"),
        ("6", "png", "test-files/png/dokumentpng"),
    )

    # Step 1: Create the resource and log buckets.
    #         Each bucket gets its own try so that an already existing first
    #         bucket does not prevent the second one from being created.
    for bucket_key in ("RESOURCE_BUCKET", "LOG_BUCKET"):
        try:
            minio_client.make_bucket(minio_buckets[bucket_key])
        except (minio.error.BucketAlreadyOwnedByYou,
                minio.error.BucketAlreadyExists):
            pass

    # Step 2: Upload needed files into the resource bucket
    for resource_uuid, _, local_path in test_files:
        upload_to_bucket(minio_client, minio_buckets["RESOURCE_BUCKET"],
                         "{}/source".format(resource_uuid), local_path)

    # Step 3: Create one redis task per file type
    tasks = [{"resource_uuid": resource_uuid, "file_type": file_type}
             for resource_uuid, file_type, _ in test_files]

    # Step 4: Subscribe to the status-queue channel
    pubsub = redis_client.pubsub(ignore_subscribe_messages=True)
    pubsub.subscribe('Status-Queue')

    # Step 5: Send all tasks for text-prep-worker into the queue
    for task in tasks:
        redis_client.rpush('Text-Prep-Queue', json.dumps(task))

    # Step 6: Listen to all incoming Status-queue messages
    count = 0
    for msg in pubsub.listen():
        print("Received the following message:")
        print(msg)
        data_part = json.loads(msg['data'])
        if data_part['id'] == 10:
            # task is still in progress
            pass
        elif data_part['id'] == 200:
            count += 1
        else:
            print("At least one task did NOT finish SUCCESSFULLY")
            print("Exiting with code 999")
            exit(999)
        if count == len(tasks):
            print("###############################")
            print("All tasks finished SUCCESSFULLY")
            print("Exiting with code 0")
            print("###############################")
            # listen() blocks forever, so leave the loop explicitly
            break
    exit(0)
コード例 #6
0
def _fail_data_prep_task(log_file_handler, minio_client, status_queue, task,
                         log_lines, message):
    """Log a failure, upload the log file and report FAILURE for the task.

    :param log_file_handler: open log file of the current task
    :param minio_client: client passed through to ``finish_logging``
    :param status_queue: queue the FAILURE status is submitted to
    :param task: task that failed; its ``training_id`` is reported
    :param log_lines: lines written to the log before it is uploaded
    :param message: short failure reason sent with the status
    """
    for line in log_lines:
        log_file_handler.write(line)

    # finish_logging finalises the log (presumably closing the handler --
    # TODO confirm), so the caller must stop processing the task afterwards.
    finish_logging(log_file_handler=log_file_handler,
                   minio_client=minio_client,
                   task=task)

    status_queue.submit(
        DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                       training_id=task.training_id,
                       message=message))


def infinite_loop():
    """Process tasks from the Data-Prep-Queue until the process is stopped.

    For every task the worker downloads the G2P model, the lexicon and all
    corpus files, merges the corpora, derives the word list and statistics,
    runs Phonetisaurus for the words missing in the lexicon and uploads the
    results into the training bucket.

    An expected failure in any step is written to the task log, reported as
    FAILURE on the status queue and the worker moves on to the next task.
    An unexpected exception is reported as FAILURE and re-raised.
    """
    _, task_queue, status_queue, minio_client = parse_args(
        'Data-Preparation-Worker Connector', task_queue='Data-Prep-Queue')

    separator = "############################################################################# \n"

    for data in task_queue.listen():
        print("Received the following task from Data-Prep-Queue: ")
        print(data)

        task = None
        log_file_handler = None

        try:
            print("Starting to process received data")
            task = DataPrepTask(**data)

            log_file_handler = open("/log.txt", "w")
            log_file_handler.write("Starting to process the received task \n")
            log_file_handler.write("{}\n".format(task))

            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.IN_PROGRESS,
                               training_id=task.training_id,
                               message="Task in progress"))

            print("All needed parameters are available. Processing continues.")
            print(task.resources)

            download_results = []

            log_file_handler.write("Starting to download all needed files. \n")
            # Step 1: Download all files which were created by the Text-Preparation-Worker for this task.
            #         In addition to that, download the G2P-graph and lexicon.txt files from the acoustic-bucket:

            # Download of the graph
            download_results.append(
                download_from_bucket(
                    minio_client, minio_buckets["ACOUSTIC_MODELS_BUCKET"],
                    "{}/g2p_model.fst".format(task.acoustic_model_id),
                    "/data_prep_worker/in/g2p_model.fst"))
            # Download of the lexicon.txt file
            download_results.append(
                download_from_bucket(
                    minio_client, minio_buckets["ACOUSTIC_MODELS_BUCKET"],
                    "{}/lexicon.txt".format(task.acoustic_model_id),
                    "/data_prep_worker/in/lexicon.txt"))
            corpus_list = []
            for resource in task.resources:
                # Download of all corpus files which were created within the TPW
                loc_corp_path = "/data_prep_worker/in/{}_corpus.txt".format(
                    resource)
                download_results.append(
                    download_from_bucket(
                        minio_client,
                        minio_buckets["TRAINING_RESOURCE_BUCKET"],
                        "{}/corpus.txt".format(resource), loc_corp_path))
                corpus_list.append(loc_corp_path)

            # If any download did not finish --> Set task status to: Failure
            # (reported once, no matter how many downloads failed)
            if not all(download[0] for download in download_results):
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While the task was processed, the following error has occurred: \n",
                        "############################################################### \n",
                        "At least one download failed. Task failed!\n",
                    ], "Download failed")
                continue
            log_file_handler.write(
                "All needed files were successfully downloaded. Processing continues \n"
            )

            # Step 2.1: Merge all corpus-files into one final corpus and save the file locally
            try:
                log_file_handler.write(
                    "Starting to merge all downloaded corpus-files. \n")
                corpus = merge_corpus_list(corpus_list, log_file_handler)
            except Exception as e:
                print(e)
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While all corpus files were merged into one, the following error occurred: \n",
                        separator,
                        "Either the given list is empty, or it was not possible to open a given corpus file. \n",
                    ], "Corpus merge failed")
                continue

            # Step 2.2: Save merged corpus file locally
            try:
                log_file_handler.write(
                    "Successfully merged all corpus-files. Continuing by saving the merged corpus locally \n"
                )
                save_txt_file("/data_prep_worker/out/corpus.txt", corpus)
            except Exception as e:
                print(e)
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While the merged corpus list was saved locally, the following error occurred: \n",
                        separator,
                        "It was not possible to save the file. \n",
                    ], "Saving merged corpus file locally failed")
                continue
            log_file_handler.write(
                "Successfully saved the merged corpus file \n")

            # Step 3.1: Create the final_word_list, using the combined corpus
            try:
                log_file_handler.write(
                    "Processing continues. Next step is to create the final_word_list \n"
                )
                lexicon = create_unique_word_list(
                    "/data_prep_worker/out/corpus.txt")
            except Exception as e:
                print(e)
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While trying to create the final_word_list, the following error occurred: \n",
                        separator,
                        "It was not possible to open the corpus-file correctly. Therefore, it was not possible to create the final_word_list \n",
                    ], "Final word list creation failed")
                continue
            log_file_handler.write(
                "Successfully created the final_word_list. \n")

            # Step 3.2: Save the final_word_list locally
            try:
                log_file_handler.write(
                    "Saving the word list locally, before the processing continues. \n"
                )
                save_txt_file("/data_prep_worker/out/final_word_list", lexicon)
            except Exception as e:
                print(e)
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While trying to save the final_word_list locally, the following error occurred: \n",
                        separator,
                        "It was not possible to save the file. \n",
                    ], "Saving unique word list locally failed")
                continue
            log_file_handler.write(
                "Successfully saved the final_word_list. \n")

            # Step 3.3: Gather all needed stats which are needed for the frontend and create a JSON-file
            try:
                log_file_handler.write(
                    "Processing continues by collecting all needed stats for the Frontend! \n"
                )
                number_of_words, number_of_lines = gather_corpus_information(
                    "/data_prep_worker/out/corpus.txt")
                number_of_unique_words = len(lexicon)
                number_of_processed_corpus_files = len(corpus_list)
                save_json_file(number_of_words, number_of_lines,
                               number_of_unique_words,
                               number_of_processed_corpus_files)
            except Exception as e:
                print(e)
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While trying to gather the stats for the frontend, the following error occurred: \n",
                        separator,
                        "It was not possible to retrieve all needed information!",
                    ], "Failed to retrieve needed stats for the frontend")
                continue
            log_file_handler.write(
                "Successfully retrieved all needed stats for the frontend! \n")

            # Step 4.1: Compare the final_word_list with the existing lexicon.txt file
            unique_word_list = compare_lexicon_with_word_list(
                final_word_list="/data_prep_worker/out/final_word_list",
                lexicon="/data_prep_worker/in/lexicon.txt")

            # Step 4.2: Save the unique_word_list locally
            save_txt_file(file_path="/data_prep_worker/out/unique_word_list",
                          content_list=unique_word_list)

            # Step 4.3: Execute Phonetisaurus and create phones for the unique_word_list
            try:
                log_file_handler.write(
                    "Processing continues by executing the Phonetisaurus which will create the lexicon-file for the Kaldi-Framework. \n"
                )
                execute_phonetisaurus()
            except Exception as e:
                print(e)
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While trying to execute the Phonetisaurus, the following error occurred: \n",
                        separator,
                        "It was either not possible to read the unique word list properly, or an error occured while executing the Phonetisaurus! \n",
                    ], "Failing while executing the Phonetisaurus")
                continue
            log_file_handler.write("Successfully created the lexicon-file. \n")
            print("Successfully created the lexicon-file")

            # Step 4.4: Append new word and phone list to the lexicon-file
            lexicon = combine_old_and_new_lexicon_files(
                old_lexicon="/data_prep_worker/in/lexicon.txt",
                new_lexicon="/data_prep_worker/out/lexicon.txt")

            # Step 4.5: Save final lexicon.txt file locally
            save_txt_file(file_path="/data_prep_worker/out/lexicon.txt",
                          content_list=lexicon)

            # Step 5: Upload of lexicon.txt, corpus.txt and unique_word_list files
            log_file_handler.write(
                "Processing continues by uploading the created lexicon-file and merged corpus to their corresponding MinIO-bucket \n"
            )
            lexicon_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/lexicon.txt".format(task.training_id),
                "/data_prep_worker/out/lexicon.txt")

            corpus_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/corpus.txt".format(task.training_id),
                "/data_prep_worker/out/corpus.txt")

            unique_word_list_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/unique_word_list.txt".format(task.training_id),
                "/data_prep_worker/out/final_word_list")

            json_result = upload_to_bucket(
                minio_client, minio_buckets["TRAINING_BUCKET"],
                "{}/stats.json".format(task.training_id),
                "/data_prep_worker/out/stats.json")

            all_uploaded = (lexicon_result[0] and corpus_result[0]
                            and unique_word_list_result[0] and json_result[0])
            if not all_uploaded:
                print(
                    "At least one upload failed. It is not possible to finish this task successfully."
                )
                _fail_data_prep_task(
                    log_file_handler, minio_client, status_queue, task, [
                        "While trying to upload the lexicon.txt, corpus.txt, unique_word_list.txt and stats.json files, the following error occurred: \n",
                        separator,
                        "It was not possible to upload at least one file. Please check your internet connection. \n",
                    ], "At least one upload failed")
                continue

            log_file_handler.write(
                "Successfully uploaded lexicon.txt, corpus.txt, unique_word_list.txt and stats.json \n"
            )

            # Step 6: Delete all files which were downloaded or created for this task
            remove_local_files("/data_prep_worker/in/")
            remove_local_files("/data_prep_worker/out/")

            finish_logging(log_file_handler=log_file_handler,
                           minio_client=minio_client,
                           task=task)

            # Step 7: Update status queue to: Successfull if this point is reached
            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.SUCCESS,
                               training_id=task.training_id,
                               message="Task finished successfully"))
        except Exception as e:
            # Unexpected error: do not leak the log file handle (closing an
            # already closed file is a no-op), report the failure and let the
            # exception stop the worker.
            if log_file_handler is not None:
                log_file_handler.close()
            status_queue.submit(
                DataPrepStatus(id=DataPrepStatusCode.FAILURE,
                               training_id=task.training_id if task else None,
                               message=str(e)))
            raise
コード例 #7
0
                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
                               task=task)

                status_queue.submit(
                    KaldiStatus(id=KaldiStatusCode.FAILURE,
                                training_id=task.training_id))
            log_file_handler.write(
                "Successfully compressed files. Ready for upload! \n")

            try:
                log_file_handler.write(
                    "Final step: Upload of the ZIP archive into its corresponding MinIO-bucket! \n"
                )
                upload_to_bucket(minio_client, training_bucket,
                                 "{}/graph.zip".format(task.training_id),
                                 new_graph_archive + "." + archive_format)
            except Exception as e:
                print(e)

                log_file_handler.write(
                    "While trying to upload the ZIP archive, the following error occurred: \n"
                )
                log_file_handler.write(
                    "############################################################################# \n"
                )
                log_file_handler.write(
                    "The upload failed! Therefore, the task failed too! \n")

                finish_logging(log_file_handler=log_file_handler,
                               minio_client=minio_client,
コード例 #8
0
def create_resource(upfile):  # noqa: E501
    """Create/Upload a new resource

     # noqa: E501

    A database entry owned by the current user is created, the upload is
    cached in ``TEMP_UPLOAD_FOLDER`` under the entry's UUID, pushed to the
    resource bucket as ``<uuid>/source`` and the local copy is removed again.
    After a successful upload a text preparation job is created.

    :param upfile: File object that needs to be uploaded

    :return: ``(payload, http_status)``; ``('Invalid input', 405)`` when no
        file was sent or its type is not recognised, otherwise the mapped
        resource with status 201
    :rtype: tuple
    """
    current_user = connexion.context['token_info']['user']

    print('Received new file: ' + str(upfile))

    # if user does not select file, browser also
    # submit an empty part without filename
    if upfile is None:
        return ('Invalid input', 405)

    filename = secure_filename(upfile.filename)
    filetype = get_filetype(filename)

    if filetype is None:
        return ('Invalid input', 405)

    # file is okay: create db entry, store to dfs and create textprep job

    print('Set ownership to user 1')

    db_resource = DB_Resource(name=filename,
                              status=DB_ResourceState.Upload_InProgress,
                              resource_type=filetype,
                              owner=current_user)
    db.session.add(db_resource)
    db.session.commit()

    print('Added database entry: ' + str(db_resource))

    # cache file in local file system, then upload to MinIO
    os.makedirs(TEMP_UPLOAD_FOLDER, exist_ok=True)

    local_file_path = os.path.join(TEMP_UPLOAD_FOLDER, str(db_resource.uuid))
    minio_file_path = str(db_resource.uuid) + '/source'

    try:
        upfile.save(local_file_path)

        status, _ = upload_to_bucket(minio_client=minio_client,
                                     bucket=minio_buckets["RESOURCE_BUCKET"],
                                     filename=minio_file_path,
                                     file_path=local_file_path)
    finally:
        # The local copy is only a cache; remove it even if saving or
        # uploading raised.
        if os.path.exists(local_file_path):
            os.remove(local_file_path)

    if status:
        db_resource.status = DB_ResourceState.TextPreparation_Ready
    else:
        db_resource.status = DB_ResourceState.Upload_Failure

    db.session.add(db_resource)
    db.session.commit()

    print('Uploaded file to MinIO: ' + str(db_resource))

    if db_resource.status == DB_ResourceState.TextPreparation_Ready:
        create_textprep_job(str(db_resource.uuid), db_resource.resource_type)

        db_resource.status = DB_ResourceState.TextPreparation_Pending
        db.session.add(db_resource)
        db.session.commit()

        print('Created TextPreparation job: ' + str(db_resource))

    return (mapper.db_resource_to_front(db_resource), 201)
コード例 #9
0
def _log_text_prep_error(
        log_file_handler,
        reason,
        headline="While processing the received task, the following error has occurred: \n"
):
    """Write a three-line error entry (headline, separator, reason) to the log."""
    log_file_handler.write(headline)
    log_file_handler.write(
        "###################################################################### \n"
    )
    log_file_handler.write(reason)


def process_file(file_type, resource_uuid, minio_client, log_file_handler):
    """Download a resource, extract its text and upload the resulting corpus.

    The source object ``<resource_uuid>/source`` is downloaded from the
    resource bucket to ``/text_prep_worker/in/<resource_uuid>``, parsed
    according to ``file_type``, turned into a corpus and uploaded as
    ``<resource_uuid>/corpus.txt`` into the same bucket. Local files are
    removed after a successful run.

    The following file types are supported:
        - pdf
        - docx
        - html
        - txt
        - png or jpg

    :param file_type: one of the supported file type names listed above
    :param resource_uuid: identifier of the resource, used in object names
        and local file names
    :type resource_uuid: str
    :param minio_client: client passed through to the bucket helpers
    :param log_file_handler: open, writable log file of the current task

    :return: ``(success, message)``
    :rtype: tuple
    """
    text_prep_input = "/text_prep_worker/in/"
    text_prep_output = "/text_prep_worker/out/"
    file_path = text_prep_input + resource_uuid

    # Step 1: Checks whether the requested bucket exist
    existance_result_in = does_bucket_exist(minio_client,
                                            minio_buckets["RESOURCE_BUCKET"])
    if not existance_result_in[0]:
        _log_text_prep_error(log_file_handler, existance_result_in[1] + "\n")
        return existance_result_in

    log_file_handler.write(
        "A request was send towards the MinIO-server to check whether the {} bucket exist. Response was positive. Processing continues. \n"
        .format(minio_buckets["RESOURCE_BUCKET"]))

    # Step 2: Downloads the needed file which is located within the resource bucket
    download_result = download_from_bucket(minio_client,
                                           minio_buckets["RESOURCE_BUCKET"],
                                           resource_uuid + "/source",
                                           file_path)
    if not download_result[0]:
        _log_text_prep_error(log_file_handler, download_result[1] + "\n")
        return download_result

    log_file_handler.write("Download of {}/{} was successfull \n".format(
        minio_buckets["RESOURCE_BUCKET"], resource_uuid + "/source"))

    # Step 3: Process the downloaded file
    #         After processing the corpus is ready
    full_text = ""
    try:
        if file_type == "pdf":
            full_text = pdf_parser(file_path, log_file_handler)
        elif file_type == "docx":
            full_text = word_parser(file_path, log_file_handler)
        elif file_type == "html":
            full_text = html_parser(file_path, log_file_handler)
        elif file_type == "txt":
            full_text = text_parser(file_path, log_file_handler)
        elif file_type in ("png", "jpg"):
            full_text = ocr_parser(file_path, log_file_handler)
        else:
            _log_text_prep_error(
                log_file_handler,
                "The given file type is not supported. Task failed. \n",
                headline=
                "While trying to parse the downloaded file, the following error has occurred: \n"
            )
            return (False, "Given file type is not supported. Task failed")

        log_file_handler.write("Parsing of the file finished successfully. \n")
        log_file_handler.write("Starting to create the corpus file. \n")
        # Generates the corpus
        corpus = generate_corpus(full_text)
        log_file_handler.write(
            "Creation of the corpus file finished successfully \n")

    except Exception as e:
        _log_text_prep_error(
            log_file_handler,
            "Failed to parse the received file. Task failed. \n")

        print(e)
        return (False, "Failed to parse the received file")

    # Step 4: Save corpus locally
    #TODO: Save lexicon locally

    corpus_name = resource_uuid + "_corpus"
    try:
        log_file_handler.write("Trying to save the corpus file locally \n")
        save_textfile(corpus, corpus_name)
        log_file_handler.write("Successfully saved the corpus file \n")
    except Exception as e:
        # Only ordinary errors are treated as a failed task; interpreter
        # exits and keyboard interrupts are no longer swallowed here.
        _log_text_prep_error(log_file_handler,
                             "Failed to save the corpus file \n")

        print(e)
        return (False, "Failed to save the corpus file")

    # Step 5: Upload corpus in bucket
    #TODO: Upload lexicon in bucket

    log_file_handler.write(
        "Trying to upload the corpus file into its corresponding MinIO-bucket \n"
    )
    corpus_path = text_prep_output + corpus_name
    corpus_upload_result = upload_to_bucket(minio_client,
                                            minio_buckets["RESOURCE_BUCKET"],
                                            resource_uuid + "/corpus.txt",
                                            corpus_path)

    if not corpus_upload_result[0]:
        _log_text_prep_error(log_file_handler,
                             corpus_upload_result[1] + "\n")
        return corpus_upload_result

    log_file_handler.write(
        "Successfully uploaded the corpus file. Corpus is located within {} in the {} MinIO-bucket.\n"
        .format(resource_uuid + "/corpus.txt",
                minio_buckets["RESOURCE_BUCKET"]))

    # Step 6: Remove local files
    remove_local_files(text_prep_input)
    remove_local_files(text_prep_output)

    return (True, "The task was successfully processed.")
コード例 #10
0
def infinite_loop():
    """Consume tasks from the Text-Prep-Queue and process them one by one.

    For every item yielded by ``task_queue.listen()`` the worker:

    1. builds a ``TextPrepTask`` from the received data,
    2. reports ``IN_PROGRESS`` on the status queue,
    3. runs ``process_file`` and reports ``SUCCESS`` or ``FAILURE``
       depending on the first element of its return value,
    4. uploads the per-task log file (``/log.txt``) to the log bucket.

    The log file is opened in ``"w"`` mode, so it is overwritten for every
    task.

    :raises Exception: any exception raised while handling a task is reported
        as ``FAILURE`` on the status queue (with the exception text as the
        message) and then re-raised, which ends the loop.
    """
    _, task_queue, status_queue, minio_client = parse_args(
        'Text-Preparation-Worker Connector', task_queue='Text-Prep-Queue')

    for data in task_queue.listen():
        print("Received the following task from Text-Prep-Queue: ")
        print(data)
        # Stays None if TextPrepTask(**data) itself fails, so the except
        # branch below can still report a failure without a resource UUID.
        task = None

        try:
            task = TextPrepTask(**data)

            # The context manager guarantees the log file is closed both on
            # the normal path (before it is uploaded) and when an exception
            # escapes, so the handle is never leaked.
            with open("/log.txt", "w") as log_file_handler:
                print("Starting to process the received task")
                log_file_handler.write(
                    "Starting to process the received task. \n")
                log_file_handler.write("{}\n".format(task))

                status_queue.submit(
                    TextPrepStatus(id=TextPrepStatusCode.IN_PROGRESS,
                                   resource_uuid=task.resource_uuid,
                                   message="Task in progress"))
                return_value = process_file(task.file_type,
                                            task.resource_uuid,
                                            minio_client, log_file_handler)

                # return_value is a (success_flag, message) pair: report
                # success or failure accordingly and log the message.
                if return_value[0]:
                    status_queue.submit(
                        TextPrepStatus(id=TextPrepStatusCode.SUCCESS,
                                       resource_uuid=task.resource_uuid,
                                       message="Task finished successfully"))
                    log_file_handler.write(return_value[1] + "\n")
                else:
                    status_queue.submit(
                        TextPrepStatus(id=TextPrepStatusCode.FAILURE,
                                       resource_uuid=task.resource_uuid,
                                       message="Task has failed"))
                    log_file_handler.write(
                        "Processing of the task failed, because of the following reason: \n"
                    )
                    log_file_handler.write(
                        "############################################################### \n"
                    )
                    log_file_handler.write(return_value[1] + "\n")

            # The log file is closed at this point, so its full content is on
            # disk before it is uploaded.
            upload_result = upload_to_bucket(
                minio_client, minio_buckets["LOG_BUCKET"],
                "{}/{}/log.txt".format("text_preparation_worker",
                                       task.resource_uuid), "/log.txt")
            # Only claim success when the upload actually succeeded.
            if not upload_result[0]:
                print("During the upload of log.txt, an error has occurred.")
            else:
                print("Upload of the log.txt-file finished successfully")

        except Exception as e:
            status_queue.submit(
                TextPrepStatus(
                    id=TextPrepStatusCode.FAILURE,
                    resource_uuid=task.resource_uuid if task else None,
                    message=str(e)))
            # Re-raise the original exception unchanged.
            raise