예제 #1
0
파일: mupdf.py 프로젝트: nirmanp6/casparser
def cas_pdf_to_text(filename: Union[str, io.IOBase],
                    password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech). Either a path string or
        a binary file-like object exposing ``read``. The object is used as a
        context manager, so it is closed by the time this function returns.
    :param password: CAS pdf password; only consulted when the document
        reports that it needs one
    :return: partial cas data with FileType, InvestorInfo and lines of data
    :raises CASParseError: if ``filename`` is neither a string nor a file-like
        object, if the PDF cannot be opened, or if the password is rejected
    """
    file_type: FileType = FileType.UNKNOWN

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif isinstance(filename, io.IOBase) or hasattr(filename, "read"):
        # The hasattr check keeps compatibility with Django UploadedFile,
        # which is file-like without deriving from io.IOBase.
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        try:
            doc = fitz.open(stream=fp.read(), filetype="pdf")
        except Exception as e:
            # Chain the original error so the root cause stays in tracebacks.
            raise CASParseError(
                f"Unhandled error while opening file :: {e}") from e

        try:
            if doc.needsPass and not doc.authenticate(password):
                raise CASParseError("Incorrect PDF password!")

            pages = []
            investor_info = None

            for page in doc:
                page_dict = page.getTextPage().extractDICT()
                blocks = extract_blocks(page_dict)
                # Keep probing page by page until a file type is recognised.
                if file_type == FileType.UNKNOWN:
                    file_type = parse_file_type(blocks)
                sorted_blocks = sorted(blocks, key=itemgetter(1, 0))
                # Investor info is parsed only until a non-None result is
                # obtained.
                if investor_info is None:
                    investor_info = parse_investor_info(page_dict)
                pages.append(sorted_blocks)
            lines = group_similar_rows(pages)
        finally:
            # Release the document on every path, including parse failures.
            doc.close()

        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)
예제 #2
0
     if current_folio is None or current_folio != folio:
         if curr_scheme_data and current_folio is not None:
             folios[current_folio]["schemes"].append(curr_scheme_data)
             curr_scheme_data = {}
         current_folio = folio
         folios[folio] = {
             "folio": current_folio,
             "amc": current_amc,
             "PAN": "",
             "KYC": "",
             "PANKYC": "",
             "schemes": [],
         }
 elif m := re.search(SCHEME_RE, line, re.DOTALL | re.MULTILINE | re.I):
     if current_folio is None:
         raise CASParseError(
             "Layout Error! Scheme found before folio entry.")
     scheme = re.sub(r"\(formerly.+?\)",
                     "",
                     m.group(2),
                     flags=re.I | re.DOTALL).strip()
     if curr_scheme_data.get("scheme") != scheme:
         if curr_scheme_data:
             folios[current_folio]["schemes"].append(curr_scheme_data)
         advisor = m.group(3)
         if advisor is not None:
             advisor = advisor.strip()
         rta = m.group(4).strip()
         rta_code = m.group(1).strip()
         isin, amfi = isin_search(scheme, rta, rta_code)
         curr_scheme_data = {
             "scheme": scheme,
예제 #3
0
파일: mupdf.py 프로젝트: nirmanp6/casparser
                        email_found = True
                    continue
                if name is None:
                    name = txt
                else:
                    if m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt,
                                      re.I):
                        mobile = m.group(1).strip()
                    address_lines.append(txt)
                    if mobile is not None:
                        return InvestorInfo(email=email,
                                            name=name,
                                            mobile=mobile,
                                            address="\n".join(address_lines))
    if email is None or mobile is None:
        raise CASParseError("Unable to parse investor data")


def group_similar_rows(elements_list: List[Iterator[Any]]):
    """
    Group elements having similar rows, with a tolerance.

    :param elements_list: List of elements from each page
    """
    lines = []
    for elements in elements_list:
        sorted_elements = list(sorted(elements, key=itemgetter(3, 0)))
        if len(sorted_elements) == 0:
            continue
        y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
        items = []
예제 #4
0
def cas_pdf_to_text(filename: Union[str, io.IOBase],
                    password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech). Either a path string or
        a file-like object exposing both ``read`` and ``close``. The object is
        used as a context manager, so it is closed by the time this function
        returns.
    :param password: CAS pdf password
    :return: partial cas data holding the detected file type (``None`` when no
        marker text is found), the investor info and the grouped lines
    :raises CASParseError: if ``filename`` is neither a string nor a file-like
        object, if the password is incorrect, or if the PDF has a syntax error
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif hasattr(filename, "read") and hasattr(filename,
                                               "close"):  # file-like object
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect as err:
            # Chain the cause so the underlying pdfminer error is not lost.
            raise CASParseError("Incorrect PDF password!") from err
        except PDFSyntaxError as err:
            raise CASParseError("Unhandled error while opening file") from err

        # Line margin depends on the detected PDF source; 0.2 is the fallback
        # for any source not listed here.
        line_margin = {
            FileType.KFINTECH: 0.1,
            FileType.CAMS: 0.2
        }.get(detect_pdf_source(document), 0.2)

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []

        investor_info = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Lazy filter: horizontal text boxes are the row data of a page.
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                # The marker strings are looked for in vertical text boxes.
                for el in filter(lambda x: isinstance(x, LTTextBoxVertical),
                                 layout):
                    text = el.get_text()
                    if "CAMSCASWS" in text:
                        file_type = FileType.CAMS
                    if "KFINCASWS" in text:
                        file_type = FileType.KFINTECH
            if investor_info is None:
                # mediabox[2:] passes the trailing entries of the media box
                # (presumably page width and height; confirm against
                # parse_investor_info).
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)