コード例 #1
0
def main(params):
    """Extract an e-mail message and its attachments into object storage.

    Looks up the message row for ``submission_id`` in
    ``EVERESTSCHEMA.EVRE_LEARNING_EMAIL_MSGS``, downloads the message document
    from object storage, stores a plain-text rendering of it (sender,
    recipients, subject, date, body), registers that text file in
    ``EVRE_LEARNING_EMAIL_ATTACHMENTS``, saves every attachment through
    ``attachment.save`` and finally sets the message status to
    ``CONVERT_TO_PDF``.

    Args:
        params: Mapping that must contain non-empty values for
            ``cos_everest_submission_bucket``, ``submission_id`` and ``mode``.

    Returns:
        ``{"result": [row], "status": "SUCCESS"}`` where ``row`` is the
        updated message row (the list is empty if no row came back), or
        ``{"error": <message text>, "status": "FAILURE"}`` when anything
        fails. Errors are reported through the returned dict, not raised.
    """
    logging.info('Calling fn_extract_email_msgs.')

    try:
        # Validate in the original order. The previous `x is None or ""`
        # test never rejected empty strings because `""` is always falsy.
        required = (
            ("cos_everest_submission_bucket", "Pass location of the bucket"),
            ("submission_id", "Pass submission_id "),
            ("mode", "Pass mode "),
        )
        for name, message in required:
            value = params.get(name, None)
            if value is None or value == "":
                raise Exception(message)

        cos_everest_submission_bucket = params["cos_everest_submission_bucket"]
        submission_id = params["submission_id"]
        mode = params["mode"]

        # Key prefix under which the message text and attachments are stored.
        object_storage_key_prefix = (
            OBJECT_STORAGE_EMAIL_ATTACHMENTS_ROOT_FOLDER + "/" + mode)

        # One connection is reused for every statement in this call.
        db_conn = db2utils.get_connection()
        print("db_conn: {}".format(db_conn))

        # Parameter markers keep caller-supplied values out of the SQL text.
        sql = ("SELECT ID, DOCUMENT_NAME, ENCODED_ID, "
               "HEX(ENCODED_ID) as MSG_DOCUMENT_ID "
               "FROM EVERESTSCHEMA.EVRE_LEARNING_EMAIL_MSGS where ID = ?")
        stmt = ibm_db.prepare(db_conn, sql)
        ibm_db.execute(stmt, (submission_id,))
        result = ibm_db.fetch_both(stmt)
        if not result:
            raise Exception("No email message document found")

        msg_id = result["ID"]
        msg_object_storage_key = result["DOCUMENT_NAME"]
        msg_encoded_id = result["ENCODED_ID"]
        msg_document_id = result["MSG_DOCUMENT_ID"]

        email_message_bytes = cosutils.get_item(cos_everest_submission_bucket,
                                                msg_object_storage_key)

        # Parse the message; attachments are wrapped in EmailAttachmentClass.
        msg = extract_msg.Message(email_message_bytes,
                                  attachmentClass=EmailAttachmentClass)

        # Plain-text rendering of the message headers and body.
        msg_file_content = """
        {msg_from}
        {msg_to}
        {msg_cc}
        {msg_subject}
        {msg_date}


        {msg_body}
        """.format(msg_from=msg.sender,
                   msg_to=msg.to,
                   msg_cc=msg.cc,
                   msg_subject=msg.subject,
                   msg_date=msg.date,
                   msg_body=msg.body)

        print(msg_file_content)
        object_storage_key = object_storage_key_prefix + "/" + str(
            msg_id) + "/" + (msg_document_id + "_message.txt")
        return_val = cosutils.save_file(cos_everest_submission_bucket,
                                        object_storage_key, msg_file_content)
        # Compare by value; `is` on a string only works by interning accident.
        if return_val != "SUCCESS":
            raise Exception("File upload to object storage failed")
        print("File Uploaded to object storage successfully")

        # Register the message text file in DB2.
        sql = ("SELECT ID FROM FINAL TABLE ("
               "INSERT INTO EVERESTSCHEMA.EVRE_LEARNING_EMAIL_ATTACHMENTS "
               "(EVRE_EMAIL_MSG_ID, DOCUMENT_NAME, DOCUMENT_TYPE, "
               "CLASSIFICATION_TYPE, STATUS, USED_FOR) "
               "VALUES (?, ?, '.txt', 'N/A', 'CONVERT_TO_PDF', ?))")
        stmt = ibm_db.prepare(db_conn, sql)
        ibm_db.execute(stmt, (msg_id, object_storage_key, mode))
        result = ibm_db.fetch_both(stmt)
        attachment_id = result["ID"] if result else None
        print(f'attachment_id: {attachment_id}')

        attachments = msg.attachments
        print("count_attachments: {}".format(len(attachments)))

        if not attachments:
            print('No Atatchments found for msg:: {}'.format(
                msg_object_storage_key))

        for attachment in attachments:
            attachment.save(
                object_storage_bucket_name=cos_everest_submission_bucket,
                object_storage_key_prefix=object_storage_key_prefix,
                save_to_object_storage=True,
                msg_id=msg_id,
                msg_encoded_id=msg_encoded_id,
                msg_document_id=msg_document_id,
                mode=mode)

        # Advance the message to the next pipeline status and return the row.
        sql = ("SELECT ID, STATUS, "
               "TO_CHAR(FIRST_UPDATED,'YYYY-MM-DD HH.MI.SS') as FIRST_UPDATED, "
               "TO_CHAR(LAST_UPDATED,'YYYY-MM-DD HH.MI.SS') as LAST_UPDATED "
               "FROM FINAL TABLE (UPDATE EVERESTSCHEMA.EVRE_LEARNING_EMAIL_MSGS "
               "SET STATUS = 'CONVERT_TO_PDF' where ID = ?)")
        print("sql: {}".format(sql))

        stmt = ibm_db.prepare(db_conn, sql)
        ibm_db.execute(stmt, (msg_id,))
        result = ibm_db.fetch_assoc(stmt)

        return {
            "result": [result] if result else [],
            "status": "SUCCESS",
        }

    except Exception as err:
        # ibm_db.conn_error / ibm_db.conn_errormsg are functions, not
        # exception classes; listing them in the except tuple made Python
        # raise TypeError instead of running this handler.
        logging.exception(err)
        return {"error": str(err), "status": "FAILURE"}
コード例 #2
0
ファイル: split_pdf.py プロジェクト: arunwagle/submission-git
def main(params):
    """Split every PDF of a submission into single-page PDFs.

    Lists the objects under ``<submissions_data_folder>/<mode>/<submission_id>/``
    whose key contains ``pdf``, splits each one page by page, uploads every
    page to the ``final_pdf_split`` sub-folder, records each page in
    ``EVERESTSCHEMA.EVRE_LEARNING_SPLIT_CONTENT`` and finally sets the message
    status to ``STANDARDIZE_TO_TXT``.

    Args:
        params: Mapping that must contain non-empty values for
            ``cos_everest_submission_bucket``, ``final_pdf_folder``,
            ``submissions_data_folder``, ``submission_id`` and ``mode``.
            ``final_pdf_folder`` is validated but not otherwise used here.

    Returns:
        ``{"result": [row], "status": "SUCCESS"}`` where ``row`` is the
        updated message row (the list is empty if no row came back), or
        ``{"error": <message text>, "status": "FAILURE"}`` when anything
        fails. Errors are reported through the returned dict, not raised.
    """
    logging.info('Calling fn_split_pdf.')

    try:
        # Validate in the original order. The previous `x is None or ""`
        # test never rejected empty strings because `""` is always falsy.
        required = (
            ("cos_everest_submission_bucket", "Pass location of the bucket"),
            ("final_pdf_folder", "Pass pdf folder to split files"),
            ("submissions_data_folder", "Pass submissions_data_folder"),
            ("submission_id", "Pass submission_id"),
            ("mode", "Pass mode"),
        )
        for name, message in required:
            value = params.get(name, None)
            if value is None or value == "":
                raise Exception(message)

        cos_everest_submission_bucket = params["cos_everest_submission_bucket"]
        submissions_data_folder = params["submissions_data_folder"]
        submission_id = params["submission_id"]
        mode = params["mode"]

        object_storage_key = submissions_data_folder + "/" + \
            mode + "/" + str(submission_id) + "/"

        extensions = ('pdf',)
        # The inline flag must lead the pattern: Python 3.11+ rejects a
        # global flag in the middle, and older versions applied it to the
        # whole pattern anyway. (Assumes cosutils matches with Python `re`.)
        regex = r"(?i)^" + object_storage_key + ".*(pdf).*$"

        file_keys = cosutils.get_bucket_contents(cos_everest_submission_bucket,
                                                 regex)

        print(file_keys)

        # Acquire the connection once instead of once per page.
        db_conn = db2utils.get_connection()
        print("db_conn: {}".format(db_conn))

        split_pdf_dir = "final_pdf_split"
        lookup_sql = ("SELECT ID FROM EVERESTSCHEMA.EVRE_LEARNING_EMAIL_ATTACHMENTS "
                      "where EVRE_EMAIL_MSG_ID = ? and DOCUMENT_NAME = ?")
        insert_sql = ("SELECT ID FROM FINAL TABLE ("
                      "INSERT INTO EVERESTSCHEMA.EVRE_LEARNING_SPLIT_CONTENT "
                      "(EVRE_EMAIL_MSG_ID, EVRE_LEARNING_EMAIL_ATTACHMENTS_ID, "
                      "DOCUMENT_NAME, DOCUMENT_TYPE, CLASSIFICATION_TYPE, "
                      "STATUS, USED_FOR, DESCRIPTION) "
                      "VALUES (?, ?, ?, '.pdf', 'N/A', 'N', 'RUNTIME', "
                      "'STANDARDIZE_TO_TXT'))")

        for key in file_keys:
            if not key.lower().endswith(extensions):
                continue

            file_name = os.path.basename(key)
            file_name_without_ext, _ = os.path.splitext(file_name)

            pdf_file_bytes = cosutils.get_item(cos_everest_submission_bucket,
                                               key)

            # Find the attachment row this PDF belongs to; -1 when unknown.
            print("sql: {}".format(lookup_sql))
            stmt = ibm_db.prepare(db_conn, lookup_sql)
            ibm_db.execute(stmt, (submission_id, key))
            result = ibm_db.fetch_both(stmt)
            pdf_id = result["ID"] if result else -1

            pdf = PdfFileReader(BytesIO(pdf_file_bytes))
            num_of_pages = pdf.getNumPages()
            print("num_of_pages:: {} ".format(num_of_pages))

            for page in range(num_of_pages):
                pdf_writer = PdfFileWriter()
                pdf_writer.addPage(pdf.getPage(page))

                # Page numbers in the key are 1-based.
                output_filename_key = '{}{}/{}_page_{}.pdf'.format(
                    object_storage_key, split_pdf_dir, file_name_without_ext,
                    page + 1)
                tmp = BytesIO()
                pdf_writer.write(tmp)
                output_page_bytes = tmp.getvalue()

                return_val = cosutils.save_file(cos_everest_submission_bucket,
                                                output_filename_key,
                                                output_page_bytes)
                # Compare by value, not identity.
                if return_val == "SUCCESS":
                    print(
                        "File Uploaded to object storage successfully:: {} "
                        .format(output_filename_key))
                else:
                    # The row is still recorded below, as before; only the
                    # failure is made visible.
                    logging.error("Upload of %s returned %s",
                                  output_filename_key, return_val)

                print("sql: {}".format(insert_sql))
                stmt = ibm_db.prepare(db_conn, insert_sql)
                ibm_db.execute(stmt,
                               (submission_id, pdf_id, output_filename_key))
                ibm_db.fetch_both(stmt)

        # Advance the message to the next pipeline status and return the row.
        sql = ("SELECT ID, STATUS, "
               "TO_CHAR(FIRST_UPDATED,'YYYY-MM-DD HH.MI.SS') as FIRST_UPDATED, "
               "TO_CHAR(LAST_UPDATED,'YYYY-MM-DD HH.MI.SS') as LAST_UPDATED "
               "FROM FINAL TABLE (UPDATE EVERESTSCHEMA.EVRE_LEARNING_EMAIL_MSGS "
               "SET STATUS = 'STANDARDIZE_TO_TXT' where ID = ?)")
        print("sql: {}".format(sql))

        stmt = ibm_db.prepare(db_conn, sql)
        ibm_db.execute(stmt, (submission_id,))
        result = ibm_db.fetch_assoc(stmt)

        return {
            "result": [result] if result else [],
            "status": "SUCCESS",
        }

    except Exception as err:
        # ibm_db.conn_error / ibm_db.conn_errormsg are functions, not
        # exception classes; listing them in the except tuple made Python
        # raise TypeError instead of running this handler.
        logging.exception(err)
        return {"error": str(err), "status": "FAILURE"}
コード例 #3
0
def main(params):
    """Convert a submission's Word/PowerPoint documents to PDF.

    Lists the objects under ``<submissions_data_folder>/<mode>/<submission_id>/``,
    converts every ``.docx``/``.doc``/``.pptx`` file to PDF through the
    Convertio API, uploads each PDF to the ``final_pdf`` sub-folder, records
    it in ``EVERESTSCHEMA.EVRE_LEARNING_EMAIL_ATTACHMENTS`` and finally sets
    the message status to ``SPLIT_PDF``. A document whose conversion fails is
    logged and skipped; it does not abort the run.

    Args:
        params: Mapping that must contain non-empty values for
            ``cos_everest_submission_bucket``, ``submission_id``,
            ``submissions_data_folder`` and ``mode``.

    Returns:
        ``{"result": [row], "status": "SUCCESS"}`` where ``row`` is the
        updated message row (the list is empty if no row came back), or
        ``{"error": <message text>, "status": "FAILURE"}`` when anything
        fails. Errors are reported through the returned dict, not raised.
    """
    logging.info('Calling fn_document_conversion_pdf.')

    try:
        # Validate in the original order. The previous `x is None or ""`
        # test never rejected empty strings because `""` is always falsy.
        required = (
            ("cos_everest_submission_bucket", "Pass location of the bucket"),
            ("submission_id", "Pass submission_id"),
            ("submissions_data_folder", "Pass submissions_data_folder"),
            ("mode", "Pass mode"),
        )
        for name, message in required:
            value = params.get(name, None)
            if value is None or value == "":
                raise Exception(message)

        cos_everest_submission_bucket = params["cos_everest_submission_bucket"]
        submission_id = params["submission_id"]
        submissions_data_folder = params["submissions_data_folder"]
        mode = params["mode"]

        object_storage_key = submissions_data_folder + "/" + mode + "/" + str(
            submission_id) + "/"

        regex = r"^" + object_storage_key + ".*$"

        file_keys = cosutils.get_bucket_contents(cos_everest_submission_bucket,
                                                 regex)

        # 'pptx' previously lacked its dot, unlike the other two entries.
        extensions = ('.docx', '.doc', '.pptx')

        insert_sql = ("SELECT ID FROM FINAL TABLE ("
                      "INSERT INTO EVERESTSCHEMA.EVRE_LEARNING_EMAIL_ATTACHMENTS "
                      "(EVRE_EMAIL_MSG_ID, DOCUMENT_NAME, DOCUMENT_TYPE, "
                      "CLASSIFICATION_TYPE, STATUS, USED_FOR, DESCRIPTION) "
                      "VALUES (?, ?, '.pdf', 'N/A', 'N', 'RUNTIME', "
                      "'SPLIT_PDF'))")

        for key in file_keys:
            if not key.lower().endswith(extensions):
                continue

            file_name = os.path.basename(key)
            file_name_without_ext, _ = os.path.splitext(file_name)

            # Public URL from which the conversion service fetches the file.
            url = OBJECT_STORAGE_PUBLIC_URL + "/" + object_storage_key + quote(
                file_name)
            print(url)

            content = _convertio_convert_to_pdf(url)
            if content is None:
                # Failure already logged by the helper; move to the next file.
                continue

            pdf_object_storage_key = (object_storage_key + "final_pdf" + "/" +
                                      file_name_without_ext + ".pdf")
            print(
                "cos_everest_submission_bucket: {}: pdf_object_storage_key: {} "
                .format(cos_everest_submission_bucket,
                        pdf_object_storage_key))

            # Write the converted PDF to object storage.
            return_val = cosutils.save_file(cos_everest_submission_bucket,
                                            pdf_object_storage_key,
                                            base64.b64decode(content))
            if return_val != "SUCCESS":
                # The row is still recorded below, as before; only the
                # failure is made visible.
                logging.error("Upload of %s returned %s",
                              pdf_object_storage_key, return_val)

            # Connection is fetched per file because a conversion can take
            # a long time before this point is reached.
            db_conn = db2utils.get_connection()
            print("db_conn: {}".format(db_conn))
            print("sql: {}".format(insert_sql))

            stmt = ibm_db.prepare(db_conn, insert_sql)
            ibm_db.execute(stmt, (submission_id, pdf_object_storage_key))
            ibm_db.fetch_both(stmt)

        # Advance the message to the next pipeline status and return the row.
        db_conn = db2utils.get_connection()
        sql = ("SELECT ID, STATUS, "
               "TO_CHAR(FIRST_UPDATED,'YYYY-MM-DD HH.MI.SS') as FIRST_UPDATED, "
               "TO_CHAR(LAST_UPDATED,'YYYY-MM-DD HH.MI.SS') as LAST_UPDATED "
               "FROM FINAL TABLE (UPDATE EVERESTSCHEMA.EVRE_LEARNING_EMAIL_MSGS "
               "SET STATUS = 'SPLIT_PDF' where ID = ?)")
        print("sql: {}".format(sql))

        stmt = ibm_db.prepare(db_conn, sql)
        ibm_db.execute(stmt, (submission_id,))
        result = ibm_db.fetch_assoc(stmt)

        return {
            "result": [result] if result else [],
            "status": "SUCCESS",
        }

    except Exception as err:
        # ibm_db.conn_error / ibm_db.conn_errormsg are functions, not
        # exception classes; listing them in the except tuple made Python
        # raise TypeError instead of running this handler.
        logging.exception(err)
        return {"error": str(err), "status": "FAILURE"}


def _convertio_convert_to_pdf(source_url,
                              poll_interval=2,
                              max_polls=300,
                              request_timeout=60):
    """Convert the document at *source_url* to PDF through the Convertio API.

    Starts a conversion, polls its status every ``poll_interval`` seconds
    (at most ``max_polls`` times) and downloads the finished result.

    Returns:
        The base64-encoded PDF content, or ``None`` when an API call reports
        a non-ok response or the conversion does not finish in time.
    """

    def _is_ok(response):
        return response["code"] == 200 and response["status"] == "ok"

    request_body = {
        "apikey": convertio_api_key,
        "input": "url",
        "file": source_url,
        "outputformat": "pdf"
    }
    r = requests.post(url=CONVERT_IO_URL,
                      data=json.dumps(request_body),
                      stream=True,
                      timeout=request_timeout)
    return_val = json.loads(r.text)
    print("1......return_val::{}".format(return_val))
    if not _is_ok(return_val):
        logging.error("Conversion request for %s failed: %s", source_url,
                      return_val)
        return None

    conversion_id = return_val["data"]["id"]
    print("converted document id::", conversion_id)
    check_status_url = CONVERT_IO_URL + "/" + conversion_id + "/status"
    print("check_status_url::", check_status_url)

    # Bounded polling: the old `while True` never slept or exited when the
    # status call returned a non-ok response, spinning forever.
    for _ in range(max_polls):
        r = requests.get(url=check_status_url, timeout=request_timeout)
        return_val = json.loads(r.text)
        print("2.......status::return_val::", return_val)
        if not _is_ok(return_val):
            logging.error("Status check for conversion %s failed: %s",
                          conversion_id, return_val)
            return None
        data = return_val["data"]
        if data["step"] == "finish" and data["step_percent"] == 100:
            conversion_id = data["id"]
            break
        time.sleep(poll_interval)
    else:
        logging.error("Conversion %s did not finish after %s status checks",
                      conversion_id, max_polls)
        return None

    get_result_url = CONVERT_IO_URL + "/" + conversion_id + "/dl/base64"
    print("status::get_result_url::", get_result_url)
    r = requests.get(url=get_result_url, timeout=request_timeout)
    return_val = json.loads(r.text)
    if not _is_ok(return_val):
        logging.error("Download of conversion %s failed: code=%s status=%s",
                      conversion_id, return_val["code"], return_val["status"])
        return None
    return return_val["data"]["content"]
コード例 #4
0
    def save(self,
             contentId=False,
             json=False,
             useFileName=False,
             raw=False,
             customPath=None,
             customFilename=None,
             object_storage_bucket_name=None,
             object_storage_key_prefix=None,
             save_to_object_storage=None,
             msg_id=None,
             msg_encoded_id=None,
             msg_document_id=None,
             mode=None):
        """Save this attachment locally or to object storage.

        For a ``"data"`` attachment the bytes are written to a local file
        when ``save_to_object_storage`` is falsy. Otherwise they are uploaded
        under ``<object_storage_key_prefix>/<msg_id>/<msg_document_id>_<filename>``
        and a row is inserted into
        ``EVERESTSCHEMA.EVRE_LEARNING_EMAIL_ATTACHMENTS``. Any other
        attachment type is delegated to ``saveEmbededMessage``.

        Args:
            contentId: Use the content id as file name when no custom name
                is given.
            json, useFileName, raw: Only forwarded to ``saveEmbededMessage``.
            customPath: Optional directory prefix for the file name; a
                trailing separator is added when missing.
            customFilename: Optional file name overriding all other choices.
            object_storage_bucket_name: Target bucket for the upload.
            object_storage_key_prefix: Key prefix for the upload.
            save_to_object_storage: Truthy to upload instead of writing a
                local file.
            msg_id: Id of the owning message row.
            msg_encoded_id: Accepted for caller compatibility; not used.
            msg_document_id: Prefix placed in front of the stored file name.
            mode: Optional value stored in the ``USED_FOR`` column; the
                column is left out of the insert when this is ``None``.

        Returns:
            The file name that was chosen (including ``customPath``).
        """
        if customFilename is not None and customFilename != '':
            filename = customFilename
        else:
            # Preference order: content id (if requested), long file name,
            # short file name, then a random placeholder name.
            filename = self.__cid if contentId else None
            if filename is None:
                filename = self.__longFilename
            if filename is None:
                filename = self.__shortFilename
            if filename is None:
                filename = 'UnknownFilename ' + \
                           ''.join(random.choice(string.ascii_uppercase + string.digits)
                                   for _ in range(5)) + '.bin'

        if customPath is not None and customPath != '':
            # The old `!= '/' or != '\\'` test was always true and appended a
            # separator even when one was already present.
            if not customPath.endswith(('/', '\\')):
                customPath += '/'
            filename = customPath + filename

        print("######filename::{}".format(filename))

        if self.__type != "data":
            self.saveEmbededMessage(contentId, json, useFileName, raw,
                                    customPath, customFilename)
            return filename

        if not save_to_object_storage:
            # Write to the local file system.
            with open(filename, 'wb') as f:
                f.write(self.__data)
            return filename

        # Write to object storage.
        _, file_extension = os.path.splitext(filename)

        print("######msg_document_id::{}".format(msg_document_id))
        object_storage_key = object_storage_key_prefix + "/" + str(
            msg_id) + "/" + (msg_document_id + "_" + filename)
        print("object_storage_bucket_name: {}: object_storage_key: {} ".format(
            object_storage_bucket_name, object_storage_key))

        return_val = cosutils.save_file(object_storage_bucket_name,
                                        object_storage_key, self.__data)
        # Compare by value, not identity.
        if return_val == "SUCCESS":
            print("File Uploaded to object storage successfully")
        else:
            # The row is still recorded below, as before; only the failure
            # is made visible.
            print("File upload to object storage returned: {}".format(
                return_val))

        # Record the attachment in DB2. Attachment file names come from the
        # e-mail itself, so values are bound through parameter markers.
        db_conn = db2utils.get_connection()
        print("db_conn: {}".format(db_conn))
        columns = ("EVRE_EMAIL_MSG_ID, DOCUMENT_NAME, DOCUMENT_TYPE, "
                   "CLASSIFICATION_TYPE, STATUS")
        markers = "?, ?, ?, 'N/A', 'N'"
        values = [msg_id, object_storage_key, file_extension]
        if mode is not None:
            columns += ", USED_FOR"
            markers += ", ?"
            values.append(mode)
        sql = ("SELECT ID FROM FINAL TABLE (INSERT INTO "
               "EVERESTSCHEMA.EVRE_LEARNING_EMAIL_ATTACHMENTS "
               "(" + columns + ") VALUES (" + markers + "))")

        stmt = ibm_db.prepare(db_conn, sql)
        ibm_db.execute(stmt, tuple(values))
        result = ibm_db.fetch_both(stmt)
        attachment_id = result["ID"] if result else None
        print(f'attachment_id: {attachment_id}')

        return filename
コード例 #5
0
def main(params):
    """Run NLU over a submission's text files and store the combined result.

    Lists the ``txt`` objects under
    ``<submissions_data_folder>/<mode>/<submission_id>``, sends each non-blank
    text to ``watson_nlu_utils.get_result`` with ``model_id``, passes the
    collected results through ``get_clean_results``, uploads the outcome to
    ``.../nlu_results/output.json`` and sets the message status to the value
    returned by ``get_validation_status``.

    Args:
        params: Mapping that must contain non-empty values for
            ``cos_everest_submission_bucket``, ``submission_id``,
            ``submissions_data_folder``, ``standardized_txt_dir``, ``mode``
            and ``model_id``. ``standardized_txt_dir`` is validated but not
            otherwise used here.

    Returns:
        ``{"result": [row], "status": "SUCCESS"}`` where ``row`` is the
        updated message row (the list is empty if no row came back), or
        ``{"error": <message text>, "status": "FAILURE"}`` when anything
        fails. Errors are reported through the returned dict, not raised.
    """
    logging.info('Calling fn_get_submission_results')

    try:
        # Validate in the original order. The previous `x is None or ""`
        # test never rejected empty strings because `""` is always falsy.
        required = (
            ("cos_everest_submission_bucket", "Pass location of the bucket"),
            ("submission_id", "Pass submission_id"),
            ("submissions_data_folder", "Pass submissions_data_folder"),
            ("standardized_txt_dir", "Pass standardized_txt_dir"),
            ("mode", "Pass mode"),
            ("model_id", "Pass model_id"),
        )
        for name, message in required:
            value = params.get(name, None)
            if value is None or value == "":
                raise Exception(message)

        cos_everest_submission_bucket = params["cos_everest_submission_bucket"]
        submission_id = params["submission_id"]
        submissions_data_folder = params["submissions_data_folder"]
        mode = params["mode"]
        model_id = params["model_id"]

        object_storage_key = submissions_data_folder + "/" + \
            mode + "/" + str(submission_id)

        output_object_storage_key = object_storage_key + "/" + \
            "nlu_results" + "/" + "output.json"

        extensions = ('txt',)
        regex = r"^" + object_storage_key + ".*txt$"

        file_keys = cosutils.get_bucket_contents(cos_everest_submission_bucket,
                                                 regex)

        nlu_service = watson_nlu_utils.inst()
        nlu_results_list = []
        for key in file_keys:
            # Report the current key; the old call dumped the whole list.
            print("Processing file:: {}".format(key))
            if not key.endswith(extensions):
                continue

            txt_file_bytes = cosutils.get_item(cos_everest_submission_bucket,
                                               key)
            text = txt_file_bytes.decode("utf-8")
            print("text:: ", len(text.strip()))

            # Blank files contribute a None entry, as before.
            nlu_results = None
            if text.strip():
                nlu_results = watson_nlu_utils.get_result(
                    nlu_service, model_id, text)
            nlu_results_list.append(nlu_results)

        # Final cleaned results.
        nlu_response = get_clean_results(nlu_results_list)
        nlu_results_dict = {"result": nlu_response}

        # NOTE(review): this stores the Python repr of the dict, not JSON,
        # in a file named output.json. Left unchanged because consumers of
        # the file are not visible here; confirm before switching to
        # json.dumps.
        res_bytes = str(nlu_results_dict).encode('utf-8')
        print("res_bytes::", res_bytes)

        # Store in object storage.
        return_val = cosutils.save_file(cos_everest_submission_bucket,
                                        output_object_storage_key, res_bytes)
        # Compare by value, not identity.
        if return_val == "SUCCESS":
            print("File Uploaded to object storage successfully:: {} ".format(
                output_object_storage_key))
        else:
            # Processing continues as before; only the failure is made
            # visible.
            logging.error("Upload of %s returned %s",
                          output_object_storage_key, return_val)

        validation_status = get_validation_status(nlu_response)
        print("validation_status", validation_status["status"])

        # Parameter markers keep computed and caller-supplied values out of
        # the SQL text.
        db_conn = db2utils.get_connection()
        sql = ("SELECT ID, STATUS, "
               "TO_CHAR(FIRST_UPDATED,'YYYY-MM-DD HH.MI.SS') as FIRST_UPDATED, "
               "TO_CHAR(LAST_UPDATED,'YYYY-MM-DD HH.MI.SS') as LAST_UPDATED "
               "FROM FINAL TABLE (UPDATE EVERESTSCHEMA.EVRE_LEARNING_EMAIL_MSGS "
               "SET STATUS = ? where ID = ?)")
        print("sql: {}".format(sql))

        stmt = ibm_db.prepare(db_conn, sql)
        ibm_db.execute(stmt, (validation_status["status"], submission_id))
        result = ibm_db.fetch_assoc(stmt)

        result_dict = {
            "result": [result] if result else [],
            "status": "SUCCESS",
        }
        print(result_dict)
        return result_dict

    except Exception as err:
        # ibm_db.conn_error / ibm_db.conn_errormsg are functions, not
        # exception classes; listing them in the except tuple made Python
        # raise TypeError instead of running this handler.
        logging.exception(err)
        return {"error": str(err), "status": "FAILURE"}