def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech). Either a path string or
        an object exposing ``read`` (an ``io.IOBase`` instance or, e.g., a
        Django UploadedFile). The object is used as a context manager, so it
        is closed once parsing finishes.
    :param password: CAS pdf password
    :return: partial cas data with FileType, InvestorInfo and lines of data
    :raises CASParseError: if ``filename`` is neither a string nor file-like,
        if the PDF cannot be opened, or if the password is rejected.
    """
    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif isinstance(filename, io.IOBase) or hasattr(filename, "read"):
        # Anything with ``read`` is accepted (compatibility for Django UploadedFile).
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object"
        )

    file_type: FileType = FileType.UNKNOWN
    pages = []
    investor_info = None

    with fp:
        try:
            doc = fitz.open(stream=fp.read(), filetype="pdf")
        except Exception as e:
            # Keep the original failure attached as __cause__ for debugging.
            raise CASParseError(
                "Unhandled error while opening file :: %s" % (str(e))
            ) from e

        try:
            if doc.needsPass:
                rc = doc.authenticate(password)
                if not rc:
                    raise CASParseError("Incorrect PDF password!")

            for page in doc:
                text_page = page.getTextPage()
                page_dict = text_page.extractDICT()
                blocks = extract_blocks(page_dict)
                # Only probe for the file type until a page yields one.
                if file_type == FileType.UNKNOWN:
                    file_type = parse_file_type(blocks)
                # Sort on block items 1 then 0 before grouping rows.
                sorted_blocks = sorted(blocks, key=itemgetter(1, 0))
                # Investor details are taken from the first page that yields them.
                if investor_info is None:
                    investor_info = parse_investor_info(page_dict)
                pages.append(sorted_blocks)
        finally:
            # Release the document on success and on every error path.
            doc.close()

    lines = group_similar_rows(pages)
    return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines)
if current_folio is None or current_folio != folio: if curr_scheme_data and current_folio is not None: folios[current_folio]["schemes"].append(curr_scheme_data) curr_scheme_data = {} current_folio = folio folios[folio] = { "folio": current_folio, "amc": current_amc, "PAN": "", "KYC": "", "PANKYC": "", "schemes": [], } elif m := re.search(SCHEME_RE, line, re.DOTALL | re.MULTILINE | re.I): if current_folio is None: raise CASParseError( "Layout Error! Scheme found before folio entry.") scheme = re.sub(r"\(formerly.+?\)", "", m.group(2), flags=re.I | re.DOTALL).strip() if curr_scheme_data.get("scheme") != scheme: if curr_scheme_data: folios[current_folio]["schemes"].append(curr_scheme_data) advisor = m.group(3) if advisor is not None: advisor = advisor.strip() rta = m.group(4).strip() rta_code = m.group(1).strip() isin, amfi = isin_search(scheme, rta, rta_code) curr_scheme_data = { "scheme": scheme,
email_found = True continue if name is None: name = txt else: if m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I): mobile = m.group(1).strip() address_lines.append(txt) if mobile is not None: return InvestorInfo(email=email, name=name, mobile=mobile, address="\n".join(address_lines)) if email is None or mobile is None: raise CASParseError("Unable to parse investor data") def group_similar_rows(elements_list: List[Iterator[Any]]): """ Group elements having similar rows, with a tolerance. :param elements_list: List of elements from each page """ lines = [] for elements in elements_list: sorted_elements = list(sorted(elements, key=itemgetter(3, 0))) if len(sorted_elements) == 0: continue y0, y1 = sorted_elements[0][1], sorted_elements[0][3] items = []
def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech). Either a path string or
        an object exposing both ``read`` and ``close``. The object is used as
        a context manager, so it is closed once parsing finishes.
    :param password: CAS pdf password
    :return: partial cas data with the detected FileType (``None`` when no
        marker was found), InvestorInfo and lines of data
    :raises CASParseError: if ``filename`` is neither a string nor file-like,
        if the password is incorrect, or if the PDF has a syntax error.
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif hasattr(filename, "read") and hasattr(filename, "close"):  # file-like object
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object"
        )

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect as err:
            # Chain the cause so the underlying pdfminer error stays visible.
            raise CASParseError("Incorrect PDF password!") from err
        except PDFSyntaxError as err:
            raise CASParseError("Unhandled error while opening file") from err

        # Layout line margin depends on the detected PDF source; 0.2 is the fallback.
        line_margin = {FileType.KFINTECH: 0.1, FileType.CAMS: 0.2}.get(
            detect_pdf_source(document), 0.2
        )

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []
        investor_info = None

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Lazy iterator over this page's horizontal text boxes; it is passed
            # on unconsumed to group_similar_rows.
            text_elements = filter(lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                # The source marker is looked for in vertical text boxes. Every
                # vertical box on the page is checked, so a later match wins.
                for el in filter(lambda x: isinstance(x, LTTextBoxVertical), layout):
                    el_text = el.get_text()
                    if re.search("CAMSCASWS", el_text):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", el_text):
                        file_type = FileType.KFINTECH
            # Investor details are taken from the first page that yields them.
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines)