def lambda_handler(event, context):
    """Extract text bounding boxes for the PDF referenced by ``event``.

    The incoming event is expected to carry at least ``status``,
    ``objectName`` and ``s3Url``.  Configuration is read from the
    ``DOCUMENTS_BUCKET``, ``TEXTRACT_ONLY``, ``MIN_CHAR_NEEDED`` and
    ``EXTRACT_PDF_LINES`` environment variables.

    Flow:
      * A non-positive ``status`` short-circuits: the event is returned
        unchanged apart from an added ``errorMessage``.
      * When ``TEXTRACT_ONLY`` is ``"false"`` and the PDF holds enough
        characters, bounding boxes are extracted locally, either per line
        (``EXTRACT_PDF_LINES == "true"``) or per word, and written to S3.
      * Otherwise the Textract branch is taken (the actual call is
        currently disabled in this handler).

    On success the result describes the generated ``.txt`` output
    (``size``, ``s3Url``, ``contentType``, ``objectName``, ``sourceUrl``)
    with ``status`` 1.  When word extraction reports an error the result
    keeps ``status`` -1 and ``errorMessage`` "PDF format not supported."
    and the output metadata is left untouched, because no output file was
    written in that case.

    Args:
        event: Mapping describing the document to process.
        context: Lambda context object; unused.

    Returns:
        The early-return dict described above, or whatever
        ``update_event(aws_env, event)`` produces.

    Raises:
        TypeError: If ``MIN_CHAR_NEEDED`` is not set (``int(None)``).
    """
    if event['status'] <= 0:
        return {**event, "errorMessage": "Status isnt positive"}

    aws_env = {
        **event,
        "bucketName": os.environ.get('DOCUMENTS_BUCKET'),
        "awsRegion": 'eu-west-1',
        "tmpJsonOutput": "/tmp/tmp_result.json",
        "tmpTxtOutput": "/tmp/tmp_result.txt",
        "outputBucket": os.environ.get('DOCUMENTS_BUCKET'),
        "outputNameJson": get_bbox_filename(event['objectName'], ".json"),
        "outputNameTxt": get_bbox_filename(event['objectName'], ".txt"),
        "textractOnly": os.environ.get('TEXTRACT_ONLY'),
        "minCharNeeded": int(os.environ.get('MIN_CHAR_NEEDED')),
        "extract_pdf_lines": os.environ.get('EXTRACT_PDF_LINES'),
    }
    extract_pdf_lines = aws_env['extract_pdf_lines']
    textract_only = aws_env['textractOnly']
    tmp_folder = "/tmp/pdfToBbox"
    pdf_tmp_path = copy_pdf_to_tmp(tmp_folder, aws_env)
    extraction_failed = False

    try:
        print("==> aws_env: ", aws_env)
        if textract_only == "false" and is_pdf_has_enough_characters(
                pdf_tmp_path, aws_env['minCharNeeded']) is True:
            print("=> Extracting bounding box with pdfplumber")
            if extract_pdf_lines == "true":
                print("=> Extracting pdf lines bbox")
                pdf = Pdf(pdf_tmp_path, aws_env['tmpJsonOutput'],
                          aws_env['tmpTxtOutput'])
                pdf.parse_pdf()
                pdf.save_in_json()
                pdf.save_in_txt()
                write_bbox_to_s3(aws_env)
            else:
                print("=> Extracting pdf words bbox")
                # A truthy return value signals an extraction error.
                if execute_pdf_to_bbox(pdf_tmp_path, aws_env['tmpJsonOutput']):
                    print("=> Error while trying to get pdf information")
                    extraction_failed = True
                else:
                    write_bbox_to_s3(aws_env)
        else:
            print("Extracting bounding box with textract")
            # NOTE: the Textract call is currently disabled.
            # send_to_textract(aws_env)

        if extraction_failed:
            # Keep the failure visible to the next step instead of
            # overwriting it with the success fields below.
            aws_env["status"] = -1
            aws_env["errorMessage"] = "PDF format not supported."
        else:
            aws_env['size'] = S3Helper.getS3FileSize(
                aws_env['bucketName'],
                aws_env['outputNameTxt'],
                aws_env['awsRegion'],
            )
            aws_env["s3Url"] = get_new_s3_url(aws_env['s3Url'], "txt")
            aws_env["status"] = 1
            aws_env["errorMessage"] = None
            aws_env["contentType"] = "text/txt"
            aws_env['objectName'] = aws_env['outputNameTxt']
            aws_env["sourceUrl"] = aws_env["s3Url"]
    finally:
        # Run the scratch-folder cleanup even when a step above raises, so
        # leftovers do not survive into the next invocation of a reused
        # execution environment.
        AwsHelper.refreshTmpFolder(tmp_folder)

    return update_event(aws_env, event)
def lambda_handler(event, context):
    """Convert the HTML document referenced by ``event`` into a PDF.

    The HTML is read from S3, sanitized and converted; the resulting event
    fields (``size``, ``s3Url``, ``status``, ``errorMessage``,
    ``contentType``, ``objectName``, ``sourceUrl``) are then pointed at the
    generated PDF.  The bucket name comes from the ``DOCUMENTS_BUCKET``
    environment variable.

    Args:
        event: Mapping describing the document; ``objectName`` and
            ``s3Url`` are read from it.
        context: Lambda context object; unused.

    Returns:
        Whatever ``update_event(aws_env, event)`` produces.

    Raises:
        KeyError: If ``DOCUMENTS_BUCKET`` is not set or ``event`` lacks
            ``objectName``.
    """
    # Start from a copy of the event, then layer the conversion settings on top.
    aws_env = {**event}
    documents_bucket = os.environ['DOCUMENTS_BUCKET']
    aws_env["bucketName"] = documents_bucket
    aws_env["awsRegion"] = 'eu-west-1'
    aws_env["outputBucket"] = documents_bucket
    aws_env["outputName"] = get_pdf_filename(event["objectName"], "")

    print("==> Aws Env: " + json.dumps(aws_env))

    raw_html = read_from_s3(aws_env)
    clean_html = sanitize_html_content(raw_html)
    conversion = convert_html_to_pdf(clean_html, aws_env)

    aws_env['size'] = S3Helper.getS3FileSize(
        aws_env['bucketName'],
        aws_env['outputName'],
        aws_env['awsRegion'],
    )
    pdf_url = get_new_s3_url(aws_env['s3Url'], "pdf")
    aws_env["s3Url"] = pdf_url
    aws_env["status"] = conversion['status']
    aws_env["errorMessage"] = conversion["errorMessage"]
    aws_env["contentType"] = "text/pdf"
    aws_env["objectName"] = aws_env["outputName"]
    aws_env["sourceUrl"] = pdf_url

    return update_event(aws_env, event)