Example #1
0
    def __init__(self, pdf, page_obj, page_number=None, initial_doctop=0, clean_unicode=True):
        """Record page geometry and optionally normalize non-ASCII char text.

        Args:
            pdf: Owning document object; stored as ``self.pdf``.
            page_obj: Page object exposing an ``attrs`` mapping, read for
                the "Rotate", "CropBox" and "MediaBox" entries. Its
                ``rotate`` attribute is overwritten with the normalized
                rotation.
            page_number: Stored as-is on ``self.page_number``.
            initial_doctop: Passed through ``self.decimalize`` and stored
                as ``self.initial_doctop``.
            clean_unicode: When true, every entry of ``self.chars`` whose
                ``'text'`` contains a code point above 127 has that text
                replaced by its Unicode-normalized form.
        """
        self.pdf = pdf
        self.page_obj = page_obj
        self.page_number = page_number
        # Fold the declared rotation into the range [0, 360).
        self.rotation = self.page_obj.attrs.get("Rotate", 0) % 360
        self.page_obj.rotate = self.rotation
        self.initial_doctop = self.decimalize(initial_doctop)

        cropbox = page_obj.attrs.get("CropBox")
        mediabox = page_obj.attrs.get("MediaBox")

        self.cropbox = self.decimalize(resolve_all(cropbox)) if cropbox is not None else None
        # Fall back to the crop box when the media box resolves to a falsy value.
        self.mediabox = self.decimalize(resolve_all(mediabox) or self.cropbox)
        m = self.mediabox

        # Order each axis as (low, high) regardless of how the box was written.
        x_lo, x_hi = min(m[0], m[2]), max(m[0], m[2])
        y_lo, y_hi = min(m[1], m[3]), max(m[1], m[3])

        if self.rotation in (90, 270):
            # Quarter-turn rotations swap the x and y extents.
            self.bbox = self.decimalize((y_lo, x_lo, y_hi, x_hi))
        else:
            self.bbox = self.decimalize((x_lo, y_lo, x_hi, y_hi))

        if clean_unicode:
            # Update each char dict in place: dict.update() returns None, so
            # collecting its results would replace the chars with Nones.
            # NOTE(review): self.chars is not assigned in this method; it is
            # assumed to be provided by the class -- confirm.
            for char in self.chars:
                text = char['text']
                # Inspect every code point; text may hold more than one
                # character, in which case ord(text) would raise.
                if any(ord(ch) > 127 for ch in text):
                    # NOTE(review): the normalization form was missing in the
                    # original call (a TypeError); NFKC is chosen here to fold
                    # compatibility characters -- confirm the intended form.
                    char['text'] = unicodedata.normalize("NFKC", text)
Example #2
0
    def __init__(self, pdf, page_obj, page_number=None, initial_doctop=0):
        """Record the page's owner, rotation and bounding boxes.

        Args:
            pdf: Owning document object; stored as ``self.pdf``.
            page_obj: Page object exposing an ``attrs`` mapping, read for
                the "Rotate", "CropBox" and "MediaBox" entries. Its
                ``rotate`` attribute is overwritten with the normalized
                rotation.
            page_number: Stored as-is on ``self.page_number``.
            initial_doctop: Passed through ``self.decimalize`` and stored
                as ``self.initial_doctop``.
        """
        self.pdf = pdf
        self.page_obj = page_obj
        self.page_number = page_number

        # Fold the declared rotation into the range [0, 360).
        self.rotation = self.page_obj.attrs.get("Rotate", 0) % 360
        self.page_obj.rotate = self.rotation
        self.initial_doctop = self.decimalize(initial_doctop)

        raw_cropbox = page_obj.attrs.get("CropBox")
        raw_mediabox = page_obj.attrs.get("MediaBox")

        if raw_cropbox is None:
            self.cropbox = None
        else:
            self.cropbox = self.decimalize(resolve_all(raw_cropbox))

        # Fall back to the crop box when the media box resolves to a falsy value.
        self.mediabox = self.decimalize(resolve_all(raw_mediabox) or self.cropbox)

        box = self.mediabox
        # Order each axis as (low, high) regardless of how the box was written.
        x_lo, x_hi = min(box[0], box[2]), max(box[0], box[2])
        y_lo, y_hi = min(box[1], box[3]), max(box[1], box[3])

        # Quarter-turn rotations swap the x and y extents.
        if self.rotation in (90, 270):
            corners = (y_lo, x_lo, y_hi, x_hi)
        else:
            corners = (x_lo, y_lo, x_hi, y_hi)
        self.bbox = self.decimalize(corners)
Example #3
0
def _decimalize(v, q=None):
    # If already a decimal, just return itself
    if type(v) == Decimal:
        return v

    # If tuple/list passed, bulk-convert
    elif isinstance(v, (tuple, list)):
        return type(v)(decimalize(x, q) for x in v)

    # If PDFObjRef passed, resolve it
    elif isinstance(v, PDFObjRef):
        return decimalize(resolve_all(v), q)

    # Convert int-like
    elif isinstance(v, numbers.Integral):
        return Decimal(int(v))

    # Convert float-like
    elif isinstance(v, numbers.Real):
        if q != None:
            return Decimal(repr(v)).quantize(Decimal(repr(q)),
                                             rounding=ROUND_HALF_UP)
        else:
            return Decimal(repr(v))
    else:
        raise ValueError("Cannot convert {0} to Decimal.".format(v))
Example #4
0
        def process_object(obj):
            """Record ``obj`` and, recursively, its children in ``objects``.

            Builds a dict from the instance attributes of ``obj`` (skipping
            names in ``IGNORE``), passing each resolved value through the
            matching ``CONVERSIONS`` entry, adds derived keys, and appends
            the dict to ``objects`` under the object's kind.
            """
            attr = {
                k: CONVERSIONS[k](resolve_all(v))
                for k, v in obj.__dict__.items()
                if k not in IGNORE
            }

            # The kind is the lowercased class name with ``lt_pat`` removed.
            kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()
            attr["object_type"] = kind
            attr["page_number"] = pno

            if hasattr(obj, "get_text"):
                attr["text"] = obj.get_text()

            if kind == "curve":
                attr["points"] = list(map(point2coord, obj.pts))

            if attr.get("y0") is not None:
                # Flip the vertical axis: top/bottom are measured from ``h``.
                attr["top"] = h - attr["y1"]
                attr["bottom"] = h - attr["y0"]
                attr["doctop"] = idc + attr["top"]

            if objects.get(kind) is None:
                objects[kind] = []
            objects[kind].append(attr)

            # Descend into child objects, if any.
            if hasattr(obj, "_objs"):
                for child in obj._objs:
                    process_object(child)
def _parse_info(doc):
    xref = doc.xrefs[0]
    info_ref=xref.trailer.get('Info')
    if info_ref:
        info=resolve_all(info_ref)
        return info