Ejemplo n.º 1
0
def digitalpdf_data_extraction(paths, digital_text, scanned):
    """Find the first page with a known content type and trim the PDF to start there.

    Every page text is passed to ``discover_content_type``. At the first page
    whose type is not "UNKNOWN", the preceding pages are dropped from the text
    list and, if any were dropped, the PDF at ``paths["filepath"]`` is
    rewritten in place so that it starts at that page.

    :param paths: dict of working paths; reads "filepath" and "inputpath".
    :param digital_text: list with one text entry per page; ``None`` entries
        are treated as empty strings.
    :param scanned: when truthy and the resulting PDF has more than one page,
        ``create_tiff(paths)`` is called.
    :return: ``[content_type, module_name, digital_text]``; ``digital_text``
        starts at the matching page, or is the full list when nothing matched.
    """
    digital_text = ["" if page_text is None else page_text for page_text in digital_text]
    # Remember the page count before the list is trimmed below; the trimmed
    # length must not be used as the last page number of the original file.
    total_pages = len(digital_text)
    content_type = "UNKNOWN"
    module_name = ""
    for index, page_text in enumerate(digital_text):
        disc_con_typ = discover_content_type(page_text)
        logger.info("discovered content type" + disc_con_typ[0])
        if disc_con_typ[0] == "UNKNOWN":
            continue

        logger.info("inside not unknown")
        content_type = disc_con_typ[0]
        module_name = disc_con_typ[1]
        digital_text = digital_text[index:]

        # removing pages that did not have usable data
        if index > 0:
            temp_pdf = paths["inputpath"] + "/temp.pdf"
            mergeObj = PdfFileMerger()
            pdfObj = convert_from_path(paths["filepath"], first_page=index + 1, last_page=total_pages)
            for page in pdfObj:
                # Each converted page is saved as a one-page PDF and appended.
                page.save(temp_pdf)
                mergeObj.append(PdfFileReader(temp_pdf))
            os.remove(temp_pdf)
            mergeObj.write(paths["filepath"])

        with pdfplumber.open(paths["filepath"]) as pdf:
            page_count = len(pdf.pages)
        if scanned and page_count > 1:
            create_tiff(paths)
        break
    return [content_type, module_name, digital_text]
Ejemplo n.º 2
0
def run_ml_api():
    """Validate an uploaded document and run form processing on it.

    The first entry of ``request.files`` is handed to ``file_type`` together
    with a fresh set of working paths from ``init_dir``. If ``file_type``
    returns a dict it is sent back as-is; otherwise its result is used as the
    document type, the file size is checked with ``verify_file_size`` and
    ``process_forms`` is called.

    :return: ``jsonify``-ed dict. ``code`` is -1 when no file was received or
        the size check failed, and 1 when an unexpected exception occurred.
    """
    try:
        x = {}
        upload_folder = config.get('api', 'upload_folder')
        work_dir = upload_folder + str(uuid.uuid4())

        if len(request.files) == 0:
            x["code"] = -1
            x["message"] = "no file received"
            return jsonify(x)
        logger.debug("identified")
        upload = request.files[(list(request.files))[0]]
        paths = init_dir(work_dir)
        doc_type_or_x = file_type(paths, upload)
        logger.info(doc_type_or_x)
        logger.debug("file type done")
        if isinstance(doc_type_or_x, dict):
            return jsonify(doc_type_or_x)
        doc_type = doc_type_or_x
        logger.debug("doc type is identified")
        file_size_validity = verify_file_size(paths["filepath"])
        logger.debug("before size")
        if not file_size_validity:
            # NOTE(review): the working directories are not removed on this
            # path — confirm whether a cleanup call is wanted here.
            x["code"] = -1
            x["message"] = "File size exceeded"
        else:
            params = {"spam_flag": "", "structuredParsing_flag": "", "unstructuredParsing_flag": "",
                      "doc_type": doc_type, "pass_header": ""}
            params = processing_headers(request.headers, params)
            x = process_forms(paths, params)
            logger.info(x)

        return jsonify(x)
    except Exception:
        # Top-level boundary: keep the generic response, but record the cause.
        logger.exception("run_ml_api failed")
        return jsonify({"code":1, "message":"file not suported"})
Ejemplo n.º 3
0
def verify_file_size(filepath):
    """Tell whether the file at ``filepath`` is within the configured size limit.

    The limit is read from the ``max_file_size`` option of the ``api`` config
    section, expressed in megabytes.

    :param filepath: path of the file to measure.
    :return: ``True`` when the file size does not exceed the limit, else ``False``.
    """
    limit_mb = int(config.get('api', 'max_file_size'))
    max_size = limit_mb * 1024 * 1024  # megabytes -> bytes
    file_size = path.getsize(filepath)
    logger.info("Size of the file is " + str(file_size))
    return file_size <= max_size
Ejemplo n.º 4
0
def flush_dir(paths):
    """Delete the working directory trees named in ``paths``.

    :param paths: dict holding the "inputpath", "imagepath", "deskewpath",
        "ocrpath", "cbimagepath" and "tmpdirpath" directories.
    """
    logger.info("now cleaning temp directories")
    # Removal order is kept: the sub-directories first, the root last.
    removal_order = (
        "inputpath",
        "imagepath",
        "deskewpath",
        "ocrpath",
        "cbimagepath",
        "tmpdirpath",
    )
    for key in removal_order:
        rmtree(paths[key])
Ejemplo n.º 5
0
def decode(password):
    """Decode an encoded password by running the bundled ``RXCODEC.jar``.

    The jar is looked up in the ``extdata`` directory next to this module and
    is invoked as ``java -jar <jar> <password>``.

    :param password: encoded password, passed as the jar's only argument.
    :return: the process output decoded as ASCII, without its trailing newline.
    :raises subprocess.CalledProcessError: if the Java process exits non-zero.
    """
    # The secret itself is deliberately kept out of the log.
    logger.info("inside decode pass")
    logger.info("init java done: ")
    jar_path = path.join(path.dirname(path.abspath(__file__)), "extdata", "RXCODEC.jar")
    output = subprocess.check_output(["java", "-jar", jar_path, password]).decode("ascii")
    # Strip the newline only when it is there, so that output without one does
    # not lose its last real character.
    return output[:-1] if output.endswith("\n") else output
Ejemplo n.º 6
0
def get_passwd(pass_header):
    """Return the decoded password for ``pass_header``.

    :param pass_header: encoded password taken from the request; may be empty
        or ``None``.
    :return: the result of ``decode(pass_header)``, or ``""`` when the header
        is empty or decoding raises.
    """
    if not pass_header:
        return ""
    try:
        passw = decode(pass_header)
        # The decoded value is a secret and is not written to the log.
        logger.info("decoded pass header")
    except Exception:
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        passw = ""
    return passw
Ejemplo n.º 7
0
def pdf_server(status, name):
    """
    HTML-to-PDF service: accepts an HTML file, a URL or a string and returns
    the compressed PDF file.

    When ``status`` is ``'file'``, ``name`` is base64-decoded and written to
    ``Config.html_file`` before ``html_pdf_main(status, name)`` is called.

    :param status: kind of input; ``'file'`` triggers the decode-and-save step.
    :param name: the input itself (base64 data when ``status`` is ``'file'``).
    :return: JSON result — the value of ``html_pdf_main`` on success, or
        ``{'status': 1, 'error': <message>}`` when an exception is raised.
    """
    try:
        logging.info('get_task')
        if status == 'file':
            html_data = base64.b64decode(name)
            with open(Config.html_file, 'wb') as f:
                f.write(html_data)
        result = html_pdf_main(status, name)
        logging.info('result success')
        return result
    except Exception as e:
        logging.exception('%s', e)
        # Return the message text: an exception object cannot be serialised
        # to JSON.
        return {'status': 1, 'error': str(e)}
Ejemplo n.º 8
0
def process_json(x):
    """Convert ``x`` with ``pviJSON`` and drop empty "products"/"events" entries.

    For each of the two keys that is present, the items are filtered with
    ``count_nan(item, 0, 0)``: an item is kept when elements 1 and 2 of that
    result differ. A key whose list is empty, before or after filtering, is
    removed from the result.

    :param x: input handed to ``pviJSON.__addWithValuesToDictObject__``.
    :return: the converted object with the filtering applied.
    """
    y = pviJSON.__addWithValuesToDictObject__(x)
    for section in ("products", "events"):
        if section not in y.keys():
            continue
        if len(y[section]) >= 1:
            nan_counts = [count_nan(entry, 0, 0) for entry in y[section]]
            y[section] = [
                entry for entry, counts in zip(y[section], nan_counts)
                if counts[1] != counts[2]
            ]
        # Covers both "was empty" and "became empty after filtering".
        if len(y[section]) < 1:
            logger.info("no %s exist" % section)
            del y[section]
    return y
Ejemplo n.º 9
0
def scannedpdf_data_extraction(paths,scan):
    """Search the OCR output page by page for a known content type.

    ``create_tiff(paths)`` is called first, then ``form.pdf`` in
    ``paths["ocrpath"]`` is read page by page. At the first page whose type is
    not "UNKNOWN" the scan stops; if that page is page 2 or later, the PDF at
    ``paths["filepath"]`` is rewritten in place to start at that page.

    :param paths: dict of working paths; reads "ocrpath", "filepath" and
        "inputpath".
    :param scan: not used by this function; kept for the callers' signature.
    :return: ``[content_type, module_name, complete_ocr_text]`` where
        ``complete_ocr_text`` holds the text of every page visited, each
        preceded by a newline.
    """
    logger.info("deskew and hocr generator for unknown content type")
    create_tiff(paths)
    content_type = "UNKNOWN"
    module_name = ""
    complete_ocr_text = ""
    with pdfplumber.open(paths["ocrpath"] + "/form.pdf") as pdf:
        for page in pdf.pages:
            # A page without text can yield None; treat it as empty text.
            ocr_text = page.extract_text() or ""
            complete_ocr_text = complete_ocr_text + "\n" + ocr_text
            content_type_module_name = discover_content_type(ocr_text)
            logger.info("discovered content type " + content_type_module_name[0])
            if content_type_module_name[0] == "UNKNOWN":
                continue

            content_type = content_type_module_name[0]
            module_name = content_type_module_name[1]
            logger.info("OCR_text_content_type: " + content_type)
            if page.page_number >= 2:
                out_pdf = paths["inputpath"] + "/out_form.pdf"
                mergeObj = PdfFileMerger()
                pdfObj = convert_from_path(paths["filepath"], first_page=page.page_number)
                for tmp_page in pdfObj:
                    # Each converted page is saved as a one-page PDF and appended.
                    tmp_page.save(out_pdf)
                    mergeObj.append(PdfFileReader(out_pdf))
                os.remove(out_pdf)
                mergeObj.write(paths["filepath"])
            break
    return [content_type, module_name, complete_ocr_text]
Ejemplo n.º 10
0
def validate_password(filepath, passw):
    """Check that the PDF at ``filepath`` can be opened with ``passw``.

    :param filepath: path of the PDF file.
    :param passw: password handed to ``pdfplumber.open``.
    :return: dict with "code" and "message". Both are ``None`` when the file
        opened; code 6 when the password was rejected; code 1 for any other
        failure to open the file.
    """
    # The password is a secret and is not written to the log.
    logger.info("inside if_passwd_works")
    x = {"code": None, "message": None}
    try:
        temp = pdfplumber.open(filepath, password=passw)
    except PDFPasswordIncorrect as err:
        logger.error(err)
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x["code"] = 6
        x["message"] = "pdf file is password protected. Kindly provide right password."
        return x
    except Exception as err:
        logger.error(err)
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x["code"] = 1
        x["message"] = "Can not open file. Please check password or file type."
        return x
    # The file was opened only as a check; release the handle right away.
    temp.close()
    return x
Ejemplo n.º 11
0
def zip_server():
    """
    PNG compression service: upload a PNG file, get the compressed PNG back
    (original author's description; the code visible here only validates and
    saves the upload).

    :return: JSON result — ``{'status': 'no task'}`` when no acceptable file
        was uploaded, ``{'status': 1, 'error': <message>}`` when an exception
        is raised.
    """
    try:
        file = request.files['img']  # the uploaded file
        # Proceed only if a file is present and its name passes the check.
        if not (file and allowed_file(file.filename,
                                      Config.PNG_ALLOWED_EXTENSIONS)):
            logging.info('result' + 'no task')
            return {'status': 'no task'}

        logging.info('get_task' + str(file.filename))
        filename = secure_filename(file.filename)  # sanitised upload name
        file.save(os.path.join(Config.image_save_path, filename))  # save the file
        logging.info('result success')
        # NOTE(review): `result` is not assigned anywhere in this function.
        # Unless it is defined at module level, this line raises NameError and
        # the handler below reports it as an error — confirm the intended
        # success payload.
        return result
    except Exception as e:
        logging.exception('%s', e)
        # Return the message text: an exception object cannot be serialised
        # to JSON.
        return {'status': 1, 'error': str(e)}
Ejemplo n.º 12
0
def init_dir(path):
    """Create the working directory tree rooted at ``path``.

    A failure to create any directory is logged and stops the remaining
    creations, but does not raise; the path mapping is returned either way.

    :param path: root directory for this request's temporary files.
    :return: dict with the keys "tmpdirpath", "inputpath", "ocrpath",
        "deskewpath", "imagepath" and "cbimagepath".
    """
    logger.info("creating temp directories")
    paths = {
        "tmpdirpath": path,
        "inputpath": path + "/input",
        "ocrpath": path + "/ocr",
        "deskewpath": path + "/deskew",
        "imagepath": path + "/image",
        "cbimagepath": path + "/cb_images",
    }
    # Parents come before their children so that plain mkdir is enough.
    creation_order = (
        paths["tmpdirpath"],
        paths["inputpath"],
        paths["inputpath"] + "/sample_dir",
        paths["imagepath"],
        paths["cbimagepath"],
        paths["deskewpath"],
        paths["ocrpath"],
        paths["ocrpath"] + "/ext_sec_data",
    )
    try:
        for directory in creation_order:
            mkdir(directory)
    except OSError:
        logger.info("Creation of the directory %s failed" % path)
        trace = "\nstartTrace::::" + traceback.format_exc().strip() + "::::endTrace"
        logger.critical(trace.replace("\n", "\n$"))
    else:
        logger.info("Successfully created the directory %s " % path)
    return paths
Ejemplo n.º 13
0
def spam_identification(x, y, spam_flag, text, paths):
    """Run spam detection when the parsed result has no usable product/event data.

    Nothing is done when ``y`` has both "products" and "events" and is not the
    "single product without a licence value" case. Otherwise ``text`` is
    written to ``spam.txt`` in ``paths["ocrpath"]``, the spam detector is run
    on it and its ``result.json`` is read back.

    :param x: result dict; updated in place when the form is reported as spam.
    :param y: parsed data, possibly holding "products" and "events".
    :param spam_flag: when falsy, detection is skipped and a code 2 result is
        returned instead.
    :param text: text handed to the spam detector.
    :param paths: dict of working paths; reads "ocrpath".
    :return: ``x`` (possibly updated with code 4 / "spam" / "spam_acc"), or a
        new code 2 dict when ``spam_flag`` is falsy.
    """

    def single_unlicensed_product():
        # Checks for the key first, so it is safe when "products" is absent.
        return ("products" in y.keys() and len(y["products"]) == 1
                and y["products"][0].license_value is None)

    has_both = "products" in y.keys() and "events" in y.keys()
    if has_both and not single_unlicensed_product():
        return x

    logger.info("no product or event exists")
    if not spam_flag:
        return {
            "code": 2,
            "message":
            "Spam detection is set False. Form may or may not contain valid case data."
        }

    spam_txt = paths["ocrpath"] + "/spam.txt"
    result_json = paths["ocrpath"] + "/result.json"
    if os.path.exists(spam_txt):
        os.remove(spam_txt)
    with open(spam_txt, "w") as handle:
        handle.write(str(text))
    logger.info("text going in spam is : %s", text)
    py_comm_comp.spam_detect.spam_detector(spam_txt, spam_model, result_json)
    with open(result_json, "r") as data:
        t = json.load(data)
    logger.info(t)

    if t["label"] == "spam" or single_unlicensed_product():
        x["code"] = 4
        x["message"] = "spam"
        try:
            # Keep two decimals of the reported accuracy.
            x["spam_acc"] = float("%.2f" % t["label_accu"])
        except (KeyError, TypeError, ValueError):
            x["spam_acc"] = None
    return x
Ejemplo n.º 14
0
def check_pqc(paths, text):
    """Classify ``text`` with the PQC/AE detector and derive the category flag.

    ``text`` is written to ``pqc-ae.txt`` in ``paths["ocrpath"]``, the detector
    is run on it and its ``result_pqc_ae.json`` is read back. The predicted
    label is split on "-" into individual categories.

    :param paths: dict of working paths; reads "ocrpath".
    :param text: text to classify.
    :return: ``(case_category, case_category_accu, category_flag)`` —
        ``case_category`` is the list of categories with "AE" and "MI"
        replaced by "AE Case" and "Medical Inquiry"; ``case_category_accu`` is
        the detector's "label_accu"; ``category_flag`` is ``True`` when the
        only category is "MI" or ``checkMalfunctions`` returns a truthy value.
    """
    py_path = path.join(
        path.dirname(path.dirname(path.abspath(__file__))),
        "unstructured", "extdata")
    input_txt = paths["ocrpath"] + "/pqc-ae.txt"
    result_json = paths["ocrpath"] + "/result_pqc_ae.json"

    if os.path.exists(input_txt):
        os.remove(input_txt)
    with open(input_txt, "w") as handle:
        handle.write(str(text))
    logger.info("text going in pqc-ae is : %s", text)
    py_comm_comp.pqc_ae_detect.pqc_ae_detector(input_txt, pqc_ae_model, result_json)
    with open(result_json, "r") as data:
        z = json.load(data)
    logger.info(z)

    case_category_accu = z["label_accu"]
    logger.info("case_category_accu is -------------------- %s",
                case_category_accu)
    case_category = z["label"].split("-")
    logger.info("case_category value is ")
    logger.info(case_category)

    # Must be evaluated on the raw labels, before they are renamed below.
    category_flag = False
    if case_category == ["MI"] or checkMalfunctions(text, py_path):
        category_flag = True
    logger.info("category flag value is ")
    logger.info(category_flag)

    display_names = {"AE": "AE Case", "MI": "Medical Inquiry"}
    case_category = [display_names.get(label, label) for label in case_category]
    return case_category, case_category_accu, category_flag
Ejemplo n.º 15
0
def _ocr_extended_page(page, paths):
    """Render one PDF page to TIFF, OCR it and return the recognised text."""
    logger.info("unstructured: doing ocr for extended pages, page_num: " +
                str(page.page_number))
    tiff_path = paths["imagepath"] + "/form.tiff"
    page.to_image(resolution=300).save(tiff_path, format="tiff")
    with Image.open(tiff_path) as image:
        page_text = str(pytesseract.image_to_string(image))
    with open(paths["ocrpath"] + "/ocred.txt", "w") as handle:
        handle.write(page_text)
    return page_text


def unstructured_form_parsing(paths, text, unstructuredParsing_flag,
                              spam_flag):
    """Parse a document that matched no known form through the unstructured API.

    When ``text`` is blank, the text is rebuilt from OCR: page 1 is read from
    ``ocred_text.txt`` in ``paths["ocrpath"]`` (the file is removed afterwards)
    and every further page is OCR-ed here and appended. The text then goes
    through ``process_text`` and ``check_pqc``, ``test.txt`` from
    ``paths["ocrpath"]`` is posted to the unstructured service, and the
    response is passed to ``process_json`` and ``spam_identification``.

    :param paths: dict of working paths; reads "filepath", "ocrpath" and
        "imagepath".
    :param text: document text; blank text triggers the OCR route.
    :param unstructuredParsing_flag: not used by this function.
    :param spam_flag: forwarded to ``spam_identification``.
    :return: result dict. Code 6 / "Non Form AE Case" on success (with
        "categories" when the category accuracy is at least 0.98), the spam
        result when the form is spam or detection is disabled, and code 3 when
        any step raises.
    """
    logger.info("unstructured parsing in progress")

    x = {"code": None, "message": None, "spam_acc": None}

    try:
        if text.strip() == "":
            with pdfplumber.open(paths["filepath"]) as pdfObj:
                for page in pdfObj.pages:
                    if page.page_number == 1:
                        fileName = paths["ocrpath"] + "/ocred_text.txt"
                        with open(fileName, "rb") as handle:
                            # NOTE(review): non-ASCII bytes make this raise,
                            # which ends in the code 3 result — confirm the
                            # encoding used by the writer of this file.
                            text = handle.read().decode("ASCII")
                        os.remove(fileName)
                        logger.info(text)
                    else:
                        # do ocr and append text
                        text = text + "\n" + _ocr_extended_page(page, paths)

        text = process_text(paths, text)
        case_category, case_category_accu, category_flag = check_pqc(
            paths, text)

        cioms_flag = None
        url_unst1 = config.get('unstructured', 'unstructure_api_url')
        logger.debug("----------------------------------------" + url_unst1)
        # NOTE(review): the request goes to the URL built from this hard-coded
        # config file, not to `url_unst1` read above — confirm which is meant.
        config_path_uns = "/home/ubuntu/pvi-form-engine/structuredForms/py_generic/extdata/config/config.json"
        with open(config_path_uns) as handle:
            config_json_uns = json.load(handle)
        url_unst = config_json_uns[
            "base-generic-url"] + ":" + "9888/unstruct/live"
        logger.debug("----------------------------------------" + url_unst)

        with open(paths["ocrpath"] + "/test.txt", 'rb') as upload:
            response = requests.post(url_unst,
                                     files={'file2': upload},
                                     headers={
                                         "PQC_FLAG": str(category_flag),
                                         "cioms_flag": str(cioms_flag)
                                     })
        x = response.json()
        logger.info("x from JSON")
        logger.info(x)

        y = process_json(x)
        x = spam_identification(x, y, spam_flag, text, paths)
        if x["message"] == "spam" or x["code"] == 2:
            return x
        x["code"] = 6
        x["message"] = "Non Form AE Case"
        if case_category_accu >= 0.98:
            x["categories"] = case_category
        return x

    except Exception:
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x = {
            "code": 3,
            "message":
            "error came parsing unstructured AE or checking for spam"
        }
        return x
Ejemplo n.º 16
0
def structured_form_parsing(paths, content_type, module_name):
    """Parse a recognised form with its dedicated ``py_<module_name>`` module.

    The module is looked up among the importable packages first and, failing
    that, in the local ``structuredForms`` directory (easier for development).
    Its ``main.parseFromModule`` is then called with the temporary directory
    path (plus ``content_type`` for the "generic" module).

    :param paths: dict of working paths; reads "tmpdirpath".
    :param content_type: recognised form type; also the option name looked up
        in the "forms" config section.
    :param module_name: module suffix; the package is ``"py_" + module_name``.
    :return: result dict. Code 1 when the form is not allowed or parsing
        reported "error in Parsing", code 5 when the module is missing or
        parsing succeeded, code 3 when loading or parsing raised.
    """
    logger.info("parsing structured form")
    x = {"code": None, "content_type": None, "module_name": None, "message": None}
    try:
        logger.info("Content type is " + content_type)
        logger.info("Module name is " + module_name)
        logger.info("going for if condition")
        allowed_form = config.getboolean("forms", content_type)
        if not allowed_form:
            return {"code": 1, "content_type": content_type, "module_name": module_name,
                    "message": "Form is recognized from the config file but is not allowed"}

        # loads python modules for specific forms based on identification
        ImportModuleName = "py_" + module_name
        ImportModulePath = "/".join(
            path.dirname(path.abspath(__file__)).split("/")[:-1] + ["structuredForms", ImportModuleName, "__init__.py"])

        # checks for module in site-packages first if not found then searches
        # local git repository for module (easier for development purposes)
        spec = importlib.util.find_spec(ImportModuleName)
        if spec is None and path.isfile(ImportModulePath):
            # spec_from_loader builds a spec even for a missing file, so the
            # file is checked first; otherwise the "module is missing" result
            # below could never be produced.
            spec = importlib.util.spec_from_loader(
                ImportModuleName,
                importlib.machinery.SourceFileLoader(ImportModuleName, ImportModulePath))
        logger.info(spec)

        if spec is None:
            logger.info("case where no module dir exists")
            x["code"] = 5
            x["module_name"] = module_name
            x["message"] = "Form is recognized from the config file but the module is missing"
            return x

        try:
            # if module is found then loads the module and executes parseFromModule() in main.py
            # new modules created must have parseFromModule() in main.py with one argument for temporary directory path
            imported_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(imported_module)
            if module_name == "generic":
                x = imported_module.main.parseFromModule(paths["tmpdirpath"], content_type)
            else:
                x = imported_module.main.parseFromModule(paths["tmpdirpath"])
                x["model_type"] = content_type
                x["module_name"] = module_name
            logger.info("completed parsing in " + module_name)
            if x["message"] == "error in Parsing":
                x["code"] = 1
                x["message"] = "This AE form is not configure"
                x["model_type"] = None
                x["module_name"] = None
            else:
                x["code"] = 5
                x["message"] = "Form is recognized from the config file and parsed using module"
        except Exception as err:
            logger.info("parseFromModule missing in the " + module_name + " package")
            logger.error(err)
            logger.error(
                ("\nstartTrace::::" + traceback.format_exc().strip() + "::::endTrace").replace("\n", "\n$"))
            x["code"] = 3
            x["module_name"] = module_name
            x["message"] = "generic-form-parser api is off"
        return x
    except Exception as err:
        logger.error(err)
        logger.error(("\nstartTrace::::" + traceback.format_exc().strip() + "::::endTrace").replace("\n", "\n$"))
        x["code"] = 3
        x["message"] = "AE form failed to Parse"
        return x
Ejemplo n.º 17
0
def _classify_plain_text(digital_text):
    """Return ``[content_type, module_name, digital_text]`` for already-extracted text."""
    logger.info(digital_text)
    disc_con_typ = discover_content_type(digital_text)
    logger.info("discovered content type")
    logger.info(disc_con_typ)
    if disc_con_typ[0] != "UNKNOWN":
        logger.info("inside not unknown")
        return [disc_con_typ[0], disc_con_typ[1], digital_text]
    # Same three-item shape as the PDF route, so callers can index the result.
    return ["UNKNOWN", "", digital_text]


def find_content_type(paths, doc_type):
    """Extract the document text and work out which known form it is, if any.

    docx, txt and csv inputs are read from ``paths["inputpath"]`` and
    classified directly. Any other type is treated as a PDF: the embedded text
    is tried first (``digitalpdf_data_extraction``) and the OCR route
    (``scannedpdf_data_extraction``) is the fallback.

    :param paths: dict of working paths; reads "inputpath" and "filepath".
    :param doc_type: MIME type of the uploaded document.
    :return: ``[content_type, module_name, text]``, or a dict with "code" and
        "message" when the PDF cannot be opened (code 1) or is scanned while
        scanned PDFs are disabled in the config (code -1).
    """
    x = {"code": None, "message": None}

    if doc_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        logger.info("file type is docx")
        return _classify_plain_text(get_text(paths["inputpath"] + "/form.docx"))

    if doc_type == "text/plain":
        logger.info("file type is txt")
        return _classify_plain_text(get_text_from_txt(paths["inputpath"] + "/form.txt"))

    if doc_type == "text/csv":
        logger.info("file type is csv")
        return _classify_plain_text(get_text_from_csv(paths["inputpath"] + "/form.csv"))

    logger.info("file type is pdf")
    try:
        pdf = pdfplumber.open(paths["filepath"])
    except Exception:
        return {"code": 1, "message": "password protected switch is off"}
    with pdf:
        digital_text = [page.extract_text() for page in pdf.pages]
    # A page with no extractable text marks the file as scanned.
    scanned = None in digital_text

    out_digital = digitalpdf_data_extraction(paths, digital_text, scanned)
    if out_digital[0] != "UNKNOWN":
        return out_digital

    if not config.getboolean('document_type', 'scanned_pdf') and scanned:
        x["code"] = -1
        x["message"] = "scanned pdf not allowed"
        return x

    out_scanned = scannedpdf_data_extraction(paths, scanned)
    # NOTE(review): this compares the length of the OCR text with the length
    # of the digital result's third item, which is built as a per-page list —
    # confirm the intended comparison.
    if len(out_scanned[2]) < len(out_digital[2]):
        out_scanned[2] = out_digital[2]

    return out_scanned
Ejemplo n.º 18
0
def process_forms(paths, params):
    """Run the whole pipeline for one uploaded document.

    Steps: resolve the flags and the password, validate and (for PDFs) trim
    the file to the configured page limit, detect the content type, then parse
    the document as a structured or an unstructured form. The working
    directories are removed with ``flush_dir`` before returning, also when a
    step raises.

    :param paths: dict of working paths; reads "filepath" and "inputpath".
    :param params: dict with "spam_flag", "structuredParsing_flag",
        "unstructuredParsing_flag", "doc_type" and "pass_header".
    :return: the result dict of whichever step finished the request.
    """
    logger.info("list of all paths is %s", paths)
    # The password header is a secret and is left out of the log.
    logger.info("list of all params is %s",
                {key: value for key, value in params.items() if key != "pass_header"})

    try:
        # setting flag value
        flags = flag_setter(params["spam_flag"], params["structuredParsing_flag"],
                            params["unstructuredParsing_flag"])
        spam_flag = flags[0]
        structuredParsing_flag = flags[1]
        unstructuredParsing_flag = flags[2]

        # checking password
        logger.info("Pass header provided: %s", bool(params["pass_header"]))
        passwd = get_passwd(params["pass_header"])

        if params["doc_type"] == "application/pdf":
            response = validate_password(paths["filepath"], passwd)
            logger.info(response)
            if response['code'] is not None and response['message'] is not None:
                return response

            try:
                pdf_info = pdfinfo_from_path(paths["filepath"], userpw=passwd)
                decrypt_pdf(paths, pdf_info, passwd)

                max_pages = int(config.get('api', 'max_file_pages'))
                if pdf_info["Pages"] > max_pages:
                    # Keep the first `max_pages` pages only.
                    # NOTE(review): the second positional argument is kept
                    # from the original call — confirm what `pfr` expects there.
                    reader = pfr(paths["filepath"], 'rb')
                    temp_var = pfw()
                    for i in range(max_pages):
                        temp_var.addPage(reader.getPage(i))
                    trimmed_path = paths["inputpath"] + "/trimmed_form.pdf"
                    with open(trimmed_path, 'wb') as f:
                        temp_var.write(f)
                    os.rename(trimmed_path, paths["filepath"])
            except Exception:
                # Best effort: carry on with the file as it is, but leave a trace.
                logger.exception("pdf inspection or trimming failed")

        result = find_content_type(paths, params["doc_type"])
        logger.info(result)
        if not isinstance(result, list):
            return result

        content_type = result[0]
        module_name = result[1]
        text = result[2]

        # parsing
        if content_type in ["UNKNOWN", "", " "]:
            # NOTE(review): only the first element of `text` is passed on (the
            # first page of a list, the first character of a string) —
            # confirm this is intended.
            first_text = text[0] if text else ""
            return unstructured_form_parsing(paths, first_text,
                                             unstructuredParsing_flag,
                                             spam_flag)

        # code for structured form parsing
        if not structuredParsing_flag:
            return {
                "model_type": content_type,
                "code": 5,
                "message": "Form is medwatch or CIOMS.",
            }
        return structured_form_parsing(paths, content_type, module_name)
    finally:
        flush_dir(paths)